From 5a0b8cb46624cc17fda676d6ae44fb72504f0ad9 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou
Date: Tue, 13 Mar 2018 14:23:28 -0700
Subject: iio: cros_ec: Move cros_ec_sensors_core.h in /include

Similar to other common iio frameworks, move cros_ec_sensors_core.h from
drivers/iio/common/cros_ec_sensors/ to include/linux/iio/common.

Signed-off-by: Gwendal Grignou
Signed-off-by: Jonathan Cameron
---
 include/linux/iio/common/cros_ec_sensors_core.h | 180 ++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 include/linux/iio/common/cros_ec_sensors_core.h

(limited to 'include')

diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
new file mode 100644
index 000000000000..ce16445411ac
--- /dev/null
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -0,0 +1,180 @@
+/*
+ * ChromeOS EC sensor hub
+ *
+ * Copyright (C) 2016 Google, Inc
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __CROS_EC_SENSORS_CORE_H
+#define __CROS_EC_SENSORS_CORE_H
+
+#include <linux/iio/iio.h>
+#include <linux/irqreturn.h>
+#include <linux/mfd/cros_ec.h>
+
+enum {
+	CROS_EC_SENSOR_X,
+	CROS_EC_SENSOR_Y,
+	CROS_EC_SENSOR_Z,
+	CROS_EC_SENSOR_MAX_AXIS,
+};
+
+/* EC returns sensor values using signed 16 bit registers */
+#define CROS_EC_SENSOR_BITS 16
+
+/*
+ * 4 16 bit channels are allowed.
+ * Good enough for current sensors, they use up to 3 16 bit vectors.
+ */
+#define CROS_EC_SAMPLE_SIZE (sizeof(s64) * 2)
+
+/* Minimum sampling period to use when device is suspending */
+#define CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY 1000  /* 1 second */
+
+/**
+ * struct cros_ec_sensors_core_state - state data for EC sensors IIO driver
+ * @ec: cros EC device structure
+ * @cmd_lock: lock used to prevent simultaneous access to the commands.
+ * @msg: cros EC command structure
+ * @param: motion sensor parameters structure
+ * @resp: motion sensor response structure
+ * @type: type of motion sensor
+ * @loc: location where the motion sensor is placed
+ * @calib: calibration parameters. Note that trigger captured data will
+ *	always provide the calibrated data
+ * @samples: static array to hold data from a single capture. For each
+ *	channel we need 2 bytes, except for the timestamp. The timestamp
+ *	is always last and is always 8-byte aligned.
+ * @read_ec_sensors_data: function used for accessing sensors values
+ * @curr_sampl_freq: current sampling frequency
+ */
+struct cros_ec_sensors_core_state {
+	struct cros_ec_device *ec;
+	struct mutex cmd_lock;
+
+	struct cros_ec_command *msg;
+	struct ec_params_motion_sense param;
+	struct ec_response_motion_sense *resp;
+
+	enum motionsensor_type type;
+	enum motionsensor_location loc;
+
+	s16 calib[CROS_EC_SENSOR_MAX_AXIS];
+
+	u8 samples[CROS_EC_SAMPLE_SIZE];
+
+	int (*read_ec_sensors_data)(struct iio_dev *indio_dev,
+				    unsigned long scan_mask, s16 *data);
+
+	int curr_sampl_freq;
+};
+
+/**
+ * cros_ec_sensors_read_lpc() - retrieve data from EC shared memory
+ * @indio_dev: pointer to IIO device
+ * @scan_mask: bitmap of the sensor indices to scan
+ * @data: location to store data
+ *
+ * This is the safe function for reading the EC data. It guarantees that the
+ * data sampled was not modified by the EC while being read.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_sensors_read_lpc(struct iio_dev *indio_dev, unsigned long scan_mask,
+			     s16 *data);
+
+/**
+ * cros_ec_sensors_read_cmd() - retrieve data using the EC command protocol
+ * @indio_dev: pointer to IIO device
+ * @scan_mask: bitmap of the sensor indices to scan
+ * @data: location to store data
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev, unsigned long scan_mask,
+			     s16 *data);
+
+struct platform_device;
+/**
+ * cros_ec_sensors_core_init() - basic initialization of the core structure
+ * @pdev: platform device created for the sensors
+ * @indio_dev: iio device structure of the device
+ * @physical_device: true if the device refers to a physical device
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_sensors_core_init(struct platform_device *pdev,
+			      struct iio_dev *indio_dev, bool physical_device);
+
+/**
+ * cros_ec_sensors_capture() - the trigger handler function
+ * @irq: the interrupt number.
+ * @p: a pointer to the poll function.
+ *
+ * On a trigger event occurring, if the pollfunc is attached then this
+ * handler is called as a threaded interrupt (and hence may sleep). It
+ * is responsible for grabbing data from the device and pushing it into
+ * the associated buffer.
+ *
+ * Return: IRQ_HANDLED
+ */
+irqreturn_t cros_ec_sensors_capture(int irq, void *p);
+
+/**
+ * cros_ec_motion_send_host_cmd() - send motion sense host command
+ * @st: pointer to state information for device
+ * @opt_length: optional length to reduce the response size, useful on the data
+ *	path. Otherwise, the maximal allowed response size is used
+ *
+ * When called, the sub-command is assumed to be set in param->cmd.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_motion_send_host_cmd(struct cros_ec_sensors_core_state *st,
+				 u16 opt_length);
+
+/**
+ * cros_ec_sensors_core_read() - function to request a value from the sensor
+ * @st: pointer to state information for device
+ * @chan: channel specification structure table
+ * @val: will contain one element making up the returned value
+ * @val2: will contain another element making up the returned value
+ * @mask: specifies which values are requested
+ *
+ * Return: the type of value returned by the device
+ */
+int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
+			      struct iio_chan_spec const *chan,
+			      int *val, int *val2, long mask);
+
+/**
+ * cros_ec_sensors_core_write() - function to write a value to the sensor
+ * @st: pointer to state information for device
+ * @chan: channel specification structure table
+ * @val: first part of value to write
+ * @val2: second part of value to write
+ * @mask: specifies which values to write
+ *
+ * Return: the type of value returned by the device
+ */
+int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
+			       struct iio_chan_spec const *chan,
+			       int val, int val2, long mask);
+
+extern const struct dev_pm_ops cros_ec_sensors_pm_ops;
+
+/* List of extended channel specification for all sensors */
+extern const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[];
+
+#endif /* __CROS_EC_SENSORS_CORE_H */
-- cgit v1.2.3

From 192af06a287bbfcae0de8fa456dca15767ecd056 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean
Date: Mon, 12 Mar 2018 17:48:15 +0200
Subject: iio: adc: ad7780: remove IIO_CHAN_INFO_SAMP_FREQ support

The `ad7780` driver does not implement setting/getting the sampling
frequency. For the ad7780/ad7781 devices, the control is done via an
external pin, and the ad7170/ad7171 devices have a fixed sampling rate
(so, no control).

For these devices, and similar others that may be added later on, an
AD_SD_CHANNEL_NO_SAMP_FREQ() macro has been added, which doesn't set the
IIO_CHAN_INFO_SAMP_FREQ flag.

Signed-off-by: Alexandru Ardelean
Signed-off-by: Jonathan Cameron
---
 drivers/staging/iio/adc/ad7780.c       |  2 +-
 include/linux/iio/adc/ad_sigma_delta.h | 24 +++++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/staging/iio/adc/ad7780.c b/drivers/staging/iio/adc/ad7780.c
index a7797af579b9..16d72072c076 100644
--- a/drivers/staging/iio/adc/ad7780.c
+++ b/drivers/staging/iio/adc/ad7780.c
@@ -128,7 +128,7 @@ static const struct ad_sigma_delta_info ad7780_sigma_delta_info = {
 };
 
 #define AD7780_CHANNEL(bits, wordsize) \
-	AD_SD_CHANNEL(1, 0, 0, bits, 32, wordsize - bits)
+	AD_SD_CHANNEL_NO_SAMP_FREQ(1, 0, 0, bits, 32, wordsize - bits)
 
 static const struct ad7780_chip_info ad7780_chip_info_tbl[] = {
 	[ID_AD7170] = {
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index 1fc7abd28b0b..730ead1a46df 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -127,7 +127,7 @@ void ad_sd_cleanup_buffer_and_trigger(struct iio_dev *indio_dev);
 int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig);
 
 #define __AD_SD_CHANNEL(_si, _channel1, _channel2, _address, _bits, \
-	_storagebits, _shift, _extend_name, _type) \
+	_storagebits, _shift, _extend_name, _type, _mask_all) \
 	{ \
 		.type = (_type), \
 		.differential = (_channel2 == -1 ?
0 : 1), \ @@ -139,7 +139,7 @@ int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig); .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ BIT(IIO_CHAN_INFO_OFFSET), \ .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ - .info_mask_shared_by_all = BIT(IIO_CHAN_INFO_SAMP_FREQ), \ + .info_mask_shared_by_all = _mask_all, \ .scan_index = (_si), \ .scan_type = { \ .sign = 'u', \ @@ -153,25 +153,35 @@ int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig); #define AD_SD_DIFF_CHANNEL(_si, _channel1, _channel2, _address, _bits, \ _storagebits, _shift) \ __AD_SD_CHANNEL(_si, _channel1, _channel2, _address, _bits, \ - _storagebits, _shift, NULL, IIO_VOLTAGE) + _storagebits, _shift, NULL, IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #define AD_SD_SHORTED_CHANNEL(_si, _channel, _address, _bits, \ _storagebits, _shift) \ __AD_SD_CHANNEL(_si, _channel, _channel, _address, _bits, \ - _storagebits, _shift, "shorted", IIO_VOLTAGE) + _storagebits, _shift, "shorted", IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #define AD_SD_CHANNEL(_si, _channel, _address, _bits, \ _storagebits, _shift) \ __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ - _storagebits, _shift, NULL, IIO_VOLTAGE) + _storagebits, _shift, NULL, IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) + +#define AD_SD_CHANNEL_NO_SAMP_FREQ(_si, _channel, _address, _bits, \ + _storagebits, _shift) \ + __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ + _storagebits, _shift, NULL, IIO_VOLTAGE, 0) #define AD_SD_TEMP_CHANNEL(_si, _address, _bits, _storagebits, _shift) \ __AD_SD_CHANNEL(_si, 0, -1, _address, _bits, \ - _storagebits, _shift, NULL, IIO_TEMP) + _storagebits, _shift, NULL, IIO_TEMP, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #define AD_SD_SUPPLY_CHANNEL(_si, _channel, _address, _bits, _storagebits, \ _shift) \ __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ - _storagebits, _shift, "supply", IIO_VOLTAGE) + _storagebits, _shift, "supply", IIO_VOLTAGE, \ + BIT(IIO_CHAN_INFO_SAMP_FREQ)) #endif -- cgit v1.2.3 From 06d42212e69bfa953aec5887c75a1781f27c2a2e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 17 Mar 2018 15:39:41 +0530 Subject: dt-bindings: clock: Add Actions S900 clock bindings Add Actions Semi S900 clock bindings. Signed-off-by: Manivannan Sadhasivam Acked-by: Rob Herring Signed-off-by: Stephen Boyd --- .../devicetree/bindings/clock/actions,s900-cmu.txt | 47 ++++++++ include/dt-bindings/clock/actions,s900-cmu.h | 129 +++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/actions,s900-cmu.txt create mode 100644 include/dt-bindings/clock/actions,s900-cmu.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt b/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt new file mode 100644 index 000000000000..93e4fb827cd6 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt @@ -0,0 +1,47 @@ +* Actions S900 Clock Management Unit (CMU) + +The Actions S900 clock management unit generates and supplies clock to various +controllers within the SoC. The clock binding described here is applicable to +S900 SoC. + +Required Properties: + +- compatible: should be "actions,s900-cmu" +- reg: physical base address of the controller and length of memory mapped + region. +- clocks: Reference to the parent clocks ("hosc", "losc") +- #clock-cells: should be 1. 
+ +Each clock is assigned an identifier, and client nodes can use this identifier +to specify the clock which they consume. + +All available clocks are defined as preprocessor macros in +dt-bindings/clock/actions,s900-cmu.h header and can be used in device +tree sources. + +External clocks: + +The hosc clock used as input for the plls is generated outside the SoC. It is +expected that it is defined using standard clock bindings as "hosc". + +Actions S900 CMU also requires one more clock: + - "losc" - internal low frequency oscillator + +Example: Clock Management Unit node: + + cmu: clock-controller@e0160000 { + compatible = "actions,s900-cmu"; + reg = <0x0 0xe0160000 0x0 0x1000>; + clocks = <&hosc>, <&losc>; + #clock-cells = <1>; + }; + +Example: UART controller node that consumes clock generated by the clock +management unit: + + uart: serial@e012a000 { + compatible = "actions,s900-uart", "actions,owl-uart"; + reg = <0x0 0xe012a000 0x0 0x2000>; + interrupts = ; + clocks = <&cmu CLK_UART5>; + }; diff --git a/include/dt-bindings/clock/actions,s900-cmu.h b/include/dt-bindings/clock/actions,s900-cmu.h new file mode 100644 index 000000000000..7c1251565f43 --- /dev/null +++ b/include/dt-bindings/clock/actions,s900-cmu.h @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Device Tree binding constants for Actions Semi S900 Clock Management Unit +// +// Copyright (c) 2014 Actions Semi Inc. +// Copyright (c) 2018 Linaro Ltd. + +#ifndef __DT_BINDINGS_CLOCK_S900_CMU_H +#define __DT_BINDINGS_CLOCK_S900_CMU_H + +#define CLK_NONE 0 + +/* fixed rate clocks */ +#define CLK_LOSC 1 +#define CLK_HOSC 2 + +/* pll clocks */ +#define CLK_CORE_PLL 3 +#define CLK_DEV_PLL 4 +#define CLK_DDR_PLL 5 +#define CLK_NAND_PLL 6 +#define CLK_DISPLAY_PLL 7 +#define CLK_DSI_PLL 8 +#define CLK_ASSIST_PLL 9 +#define CLK_AUDIO_PLL 10 + +/* system clock */ +#define CLK_CPU 15 +#define CLK_DEV 16 +#define CLK_NOC 17 +#define CLK_NOC_MUX 18 +#define CLK_NOC_DIV 19 +#define CLK_AHB 20 +#define CLK_APB 21 +#define CLK_DMAC 22 + +/* peripheral device clock */ +#define CLK_GPIO 23 + +#define CLK_BISP 24 +#define CLK_CSI0 25 +#define CLK_CSI1 26 + +#define CLK_DE0 27 +#define CLK_DE1 28 +#define CLK_DE2 29 +#define CLK_DE3 30 +#define CLK_DSI 32 + +#define CLK_GPU 33 +#define CLK_GPU_CORE 34 +#define CLK_GPU_MEM 35 +#define CLK_GPU_SYS 36 + +#define CLK_HDE 37 +#define CLK_I2C0 38 +#define CLK_I2C1 39 +#define CLK_I2C2 40 +#define CLK_I2C3 41 +#define CLK_I2C4 42 +#define CLK_I2C5 43 +#define CLK_I2SRX 44 +#define CLK_I2STX 45 +#define CLK_IMX 46 +#define CLK_LCD 47 +#define CLK_NAND0 48 +#define CLK_NAND1 49 +#define CLK_PWM0 50 +#define CLK_PWM1 51 +#define CLK_PWM2 52 +#define CLK_PWM3 53 +#define CLK_PWM4 54 +#define CLK_PWM5 55 +#define CLK_SD0 56 +#define CLK_SD1 57 +#define CLK_SD2 58 +#define CLK_SD3 59 +#define CLK_SENSOR 60 +#define CLK_SPEED_SENSOR 61 +#define CLK_SPI0 62 +#define CLK_SPI1 63 +#define CLK_SPI2 64 +#define CLK_SPI3 65 +#define CLK_THERMAL_SENSOR 66 +#define CLK_UART0 67 +#define CLK_UART1 68 +#define CLK_UART2 69 +#define CLK_UART3 70 +#define CLK_UART4 71 +#define CLK_UART5 72 +#define CLK_UART6 73 +#define CLK_VCE 74 +#define CLK_VDE 75 + +#define CLK_USB3_480MPLL0 76 +#define CLK_USB3_480MPHY0 77 +#define CLK_USB3_5GPHY 78 +#define CLK_USB3_CCE 79 +#define CLK_USB3_MAC 80 + +#define CLK_TIMER 83 + +#define CLK_HDMI_AUDIO 84 + +#define CLK_24M 85 + +#define CLK_EDP 86 + +#define CLK_24M_EDP 87 +#define CLK_EDP_PLL 88 +#define CLK_EDP_LINK 89 + +#define CLK_USB2H0_PLLEN 90 +#define 
CLK_USB2H0_PHY 91 +#define CLK_USB2H0_CCE 92 +#define CLK_USB2H1_PLLEN 93 +#define CLK_USB2H1_PHY 94 +#define CLK_USB2H1_CCE 95 + +#define CLK_DDR0 96 +#define CLK_DDR1 97 +#define CLK_DMM 98 + +#define CLK_ETH_MAC 99 +#define CLK_RMII_REF 100 + +#define CLK_NR_CLKS (CLK_RMII_REF + 1) + +#endif /* __DT_BINDINGS_CLOCK_S900_CMU_H */ -- cgit v1.2.3 From 3b7008b226f3de811d4ac34238e9cf670f7c9fe7 Mon Sep 17 00:00:00 2001 From: Szymon Lukasz Date: Thu, 9 Nov 2017 21:23:35 +0100 Subject: fuse: return -ECONNABORTED on /dev/fuse read after abort Currently the userspace has no way of knowing whether the fuse connection ended because of umount or abort via sysfs. It makes it hard for filesystems to free the mountpoint after abort without worrying about removing some new mount. The patch fixes it by returning different errors when userspace reads from /dev/fuse (-ENODEV for umount and -ECONNABORTED for abort). Add a new capability flag FUSE_ABORT_ERROR. If set and the connection is gone because of sysfs abort, reading from the device will return -ECONNABORTED. Signed-off-by: Szymon Lukasz Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 2 +- fs/fuse/cuse.c | 4 ++-- fs/fuse/dev.c | 12 +++++++----- fs/fuse/fuse_i.h | 8 +++++++- fs/fuse/inode.c | 9 ++++++--- include/uapi/linux/fuse.h | 7 ++++++- 6 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b9ea99c5b5b3..78fb7a07c5ca 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,7 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc); + fuse_abort_conn(fc, true); fuse_conn_put(fc); } return count; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e9e97803442a..31d33b69957f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -406,7 +406,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); goto out; } @@ -581,7 +581,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc); + fuse_abort_conn(&cc->fc, false); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5d06384c2cae..78b6293bd04d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1234,9 +1234,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, if (err) goto err_unlock; - err = -ENODEV; - if (!fiq->connected) + if (!fiq->connected) { + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto err_unlock; + } if (!list_empty(&fiq->interrupts)) { req = list_entry(fiq->interrupts.next, struct fuse_req, @@ -1287,7 +1288,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = -ENODEV; + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -2076,7 +2077,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. 
*/ -void fuse_abort_conn(struct fuse_conn *fc) +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) { struct fuse_iqueue *fiq = &fc->iq; @@ -2089,6 +2090,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2151,7 +2153,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); } fuse_dev_free(fud); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c4c093bbf456..7d2e7deea64b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -515,6 +515,9 @@ struct fuse_conn { abort and device release */ unsigned connected; + /** Connection aborted via sysfs */ + bool aborted; + /** Connection failed (version mismatch). Cannot race with setting other bitfields since it is only set once in INIT reply, before any other request, and never cleared */ @@ -526,6 +529,9 @@ struct fuse_conn { /** Do readpages asynchronously? Only set in INIT */ unsigned async_read:1; + /** Return an unique read error after abort. Only set in INIT */ + unsigned abort_err:1; + /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc:1; @@ -851,7 +857,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc); +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); /** * Invalidate inode attributes diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 624f18bbfd2b..3c9b675d99da 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -371,7 +371,7 @@ void fuse_unlock_inode(struct inode *inode) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + fuse_abort_conn(get_fuse_conn_super(sb), false); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -393,7 +393,7 @@ static void fuse_put_super(struct super_block *sb) fuse_send_destroy(fc); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); @@ -918,6 +918,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_ABORT_ERROR) + fc->abort_err = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -948,7 +950,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | + FUSE_ABORT_ERROR; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 4b5001c57f46..92fa24c24c92 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -113,6 +113,9 @@ * 7.26 * - add FUSE_HANDLE_KILLPRIV * - add FUSE_POSIX_ACL + * + * 7.27 + * - add FUSE_ABORT_ERROR */ #ifndef _LINUX_FUSE_H @@ -148,7 +151,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 26 +#define FUSE_KERNEL_MINOR_VERSION 
27 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -245,6 +248,7 @@ struct fuse_file_lock { * FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc * FUSE_POSIX_ACL: filesystem supports posix acls + * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -267,6 +271,7 @@ struct fuse_file_lock { #define FUSE_PARALLEL_DIROPS (1 << 18) #define FUSE_HANDLE_KILLPRIV (1 << 19) #define FUSE_POSIX_ACL (1 << 20) +#define FUSE_ABORT_ERROR (1 << 21) /** * CUSE INIT request/reply flags -- cgit v1.2.3 From 3ae7fb202d86b7847f237daa474f3946bdc3b0c6 Mon Sep 17 00:00:00 2001 From: Haneen Mohammed Date: Tue, 20 Mar 2018 09:37:49 -0400 Subject: drm: Remove drm_property_{un/reference}_blob aliases This patch remove the compatibility aliases drm_property_{reference/unreference}_blob of drm_property_blob_{get/put} since all callers have been converted to the prefered _{get/put}. Remove the helpers from the semantic patch drm-get-put-cocci. Signed-off-by: Haneen Mohammed Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20180320133749.GA11695@haneen-VirtualBox --- include/drm/drm_property.h | 26 -------------------------- scripts/coccinelle/api/drm-get-put.cocci | 10 ---------- 2 files changed, 36 deletions(-) (limited to 'include') diff --git a/include/drm/drm_property.h b/include/drm/drm_property.h index d1423c7f3c73..ab8167baade5 100644 --- a/include/drm/drm_property.h +++ b/include/drm/drm_property.h @@ -280,32 +280,6 @@ bool drm_property_replace_blob(struct drm_property_blob **blob, struct drm_property_blob *drm_property_blob_get(struct drm_property_blob *blob); void drm_property_blob_put(struct drm_property_blob *blob); -/** - * drm_property_reference_blob - acquire a blob property reference - * @blob: DRM blob property - * - * This is a compatibility alias for drm_property_blob_get() and should not be - * used by new code. - */ -static inline struct drm_property_blob * -drm_property_reference_blob(struct drm_property_blob *blob) -{ - return drm_property_blob_get(blob); -} - -/** - * drm_property_unreference_blob - release a blob property reference - * @blob: DRM blob property - * - * This is a compatibility alias for drm_property_blob_put() and should not be - * used by new code. 
- */
-static inline void
-drm_property_unreference_blob(struct drm_property_blob *blob)
-{
-	drm_property_blob_put(blob);
-}
-
 /**
  * drm_property_find - find property object
  * @dev: DRM device
diff --git a/scripts/coccinelle/api/drm-get-put.cocci b/scripts/coccinelle/api/drm-get-put.cocci
index ceb71ea7f61c..3a09c97ad87d 100644
--- a/scripts/coccinelle/api/drm-get-put.cocci
+++ b/scripts/coccinelle/api/drm-get-put.cocci
@@ -40,12 +40,6 @@ expression object;
 - drm_gem_object_unreference_unlocked(object)
 + drm_gem_object_put_unlocked(object)
 |
-- drm_property_reference_blob(object)
-+ drm_property_blob_get(object)
-|
-- drm_property_unreference_blob(object)
-+ drm_property_blob_put(object)
-|
 - drm_dev_unref(object)
 + drm_dev_put(object)
 )
@@ -72,10 +66,6 @@
 __drm_gem_object_unreference(object)
 |
 drm_gem_object_unreference_unlocked(object)
 |
-drm_property_unreference_blob@p(object)
-|
-drm_property_reference_blob@p(object)
-|
 drm_dev_unref@p(object)
 )
-- cgit v1.2.3

From 56859d310c0eacc4a77b0cb1c9087c161b463e5d Mon Sep 17 00:00:00 2001
From: Tali Perry
Date: Tue, 20 Mar 2018 15:40:48 +0200
Subject: dt-binding: clk: npcm750: Add binding for Nuvoton NPCM7XX Clock

* Nuvoton NPCM7XX Clock Controller

Nuvoton Poleg BMC NPCM7XX contains an integrated clock controller, which
generates and supplies clocks to all modules within the BMC.

Signed-off-by: Tali Perry
Reviewed-by: Rob Herring
Signed-off-by: Stephen Boyd
---
 .../bindings/clock/nuvoton,npcm750-clk.txt        | 100 +++++++++++++++++++++
 include/dt-bindings/clock/nuvoton,npcm7xx-clock.h |  44 +++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
 create mode 100644 include/dt-bindings/clock/nuvoton,npcm7xx-clock.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt b/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
new file mode 100644
index 000000000000..f82064546d11
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
@@ -0,0 +1,100 @@
+* Nuvoton NPCM7XX Clock Controller
+
+Nuvoton Poleg BMC NPCM7XX contains an integrated clock controller, which
+generates and supplies clocks to all modules within the BMC.
+
+External clocks:
+
+There are six fixed clocks that are generated outside the BMC. All clocks are
+of a known fixed value that cannot be changed. clk_refclk, clk_mcbypck and
+clk_sysbypck are inputs to the clock controller.
+clk_rg1refck, clk_rg2refck and clk_xin are external clocks supplying the
+network. They are defined in the device tree, but not used by the clock
+module. The network devices use them directly.
+An example can be found below.
+
+All available clocks are defined as preprocessor macros in:
+dt-bindings/clock/nuvoton,npcm7xx-clock.h
+and can be used in device tree sources.
+
+Required Properties of clock controller:
+
+ - compatible: "nuvoton,npcm750-clk" : for clock controller of Nuvoton
+   Poleg BMC NPCM750
+
+ - reg: physical base address of the clock controller and length of
+   memory mapped region.
+
+ - #clock-cells: should be 1.
+ +Example: Clock controller node: + + clk: clock-controller@f0801000 { + compatible = "nuvoton,npcm750-clk"; + #clock-cells = <1>; + reg = <0xf0801000 0x1000>; + clock-names = "refclk", "sysbypck", "mcbypck"; + clocks = <&clk_refclk>, <&clk_sysbypck>, <&clk_mcbypck>; + }; + +Example: Required external clocks for network: + + /* external reference clock */ + clk_refclk: clk-refclk { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <25000000>; + clock-output-names = "refclk"; + }; + + /* external reference clock for cpu. float in normal operation */ + clk_sysbypck: clk-sysbypck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <800000000>; + clock-output-names = "sysbypck"; + }; + + /* external reference clock for MC. float in normal operation */ + clk_mcbypck: clk-mcbypck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <800000000>; + clock-output-names = "mcbypck"; + }; + + /* external clock signal rg1refck, supplied by the phy */ + clk_rg1refck: clk-rg1refck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <125000000>; + clock-output-names = "clk_rg1refck"; + }; + + /* external clock signal rg2refck, supplied by the phy */ + clk_rg2refck: clk-rg2refck { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <125000000>; + clock-output-names = "clk_rg2refck"; + }; + + clk_xin: clk-xin { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <50000000>; + clock-output-names = "clk_xin"; + }; + + +Example: GMAC controller node that consumes two clocks: a generated clk by the +clock controller and a fixed clock from DT (clk_rg1refck). + + ethernet0: ethernet@f0802000 { + compatible = "snps,dwmac"; + reg = <0xf0802000 0x2000>; + interrupts = <0 14 4>; + interrupt-names = "macirq"; + clocks = <&clk_rg1refck>, <&clk NPCM7XX_CLK_AHB>; + clock-names = "stmmaceth", "clk_gmac"; + }; diff --git a/include/dt-bindings/clock/nuvoton,npcm7xx-clock.h b/include/dt-bindings/clock/nuvoton,npcm7xx-clock.h new file mode 100644 index 000000000000..f21522605b94 --- /dev/null +++ b/include/dt-bindings/clock/nuvoton,npcm7xx-clock.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Nuvoton NPCM7xx Clock Generator binding + * clock binding number for all clocks supportted by nuvoton,npcm7xx-clk + * + * Copyright (C) 2018 Nuvoton Technologies tali.perry@nuvoton.com + * + */ + +#ifndef __DT_BINDINGS_CLOCK_NPCM7XX_H +#define __DT_BINDINGS_CLOCK_NPCM7XX_H + + +#define NPCM7XX_CLK_CPU 0 +#define NPCM7XX_CLK_GFX_PIXEL 1 +#define NPCM7XX_CLK_MC 2 +#define NPCM7XX_CLK_ADC 3 +#define NPCM7XX_CLK_AHB 4 +#define NPCM7XX_CLK_TIMER 5 +#define NPCM7XX_CLK_UART 6 +#define NPCM7XX_CLK_MMC 7 +#define NPCM7XX_CLK_SPI3 8 +#define NPCM7XX_CLK_PCI 9 +#define NPCM7XX_CLK_AXI 10 +#define NPCM7XX_CLK_APB4 11 +#define NPCM7XX_CLK_APB3 12 +#define NPCM7XX_CLK_APB2 13 +#define NPCM7XX_CLK_APB1 14 +#define NPCM7XX_CLK_APB5 15 +#define NPCM7XX_CLK_CLKOUT 16 +#define NPCM7XX_CLK_GFX 17 +#define NPCM7XX_CLK_SU 18 +#define NPCM7XX_CLK_SU48 19 +#define NPCM7XX_CLK_SDHC 20 +#define NPCM7XX_CLK_SPI0 21 +#define NPCM7XX_CLK_SPIX 22 + +#define NPCM7XX_CLK_REFCLK 23 +#define NPCM7XX_CLK_SYSBYPCK 24 +#define NPCM7XX_CLK_MCBYPCK 25 + +#define NPCM7XX_NUM_CLOCKS (NPCM7XX_CLK_MCBYPCK+1) + +#endif -- cgit v1.2.3 From a7d2a87e99deb5481b5dd723408c42f460de25a3 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 22 Mar 2018 11:51:28 +0100 Subject: drm/tinydrm: Use gem_free_object_unlocked MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tinydrm doesn't use dev->struct_mutex and therefore has no need to use gem_free_object. Signed-off-by: Daniel Vetter Cc: "Noralf Trønnes" Acked-by: Noralf Trønnes Link: https://patchwork.freedesktop.org/patch/msgid/20180322105133.11211-2-daniel.vetter@ffwll.ch --- drivers/gpu/drm/tinydrm/core/tinydrm-core.c | 2 +- include/drm/tinydrm/tinydrm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-core.c b/drivers/gpu/drm/tinydrm/core/tinydrm-core.c index 4c6616278c48..24a33bf862fa 100644 --- a/drivers/gpu/drm/tinydrm/core/tinydrm-core.c +++ b/drivers/gpu/drm/tinydrm/core/tinydrm-core.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL(tinydrm_gem_cma_prime_import_sg_table); * GEM object state and frees the memory used to store the object itself using * drm_gem_cma_free_object(). It also handles PRIME buffers which has the kernel * virtual address set by tinydrm_gem_cma_prime_import_sg_table(). Drivers - * can use this as their &drm_driver->gem_free_object callback. + * can use this as their &drm_driver->gem_free_object_unlocked callback. */ void tinydrm_gem_cma_free_object(struct drm_gem_object *gem_obj) { diff --git a/include/drm/tinydrm/tinydrm.h b/include/drm/tinydrm/tinydrm.h index 07a9a11fe19d..77a93ec577fd 100644 --- a/include/drm/tinydrm/tinydrm.h +++ b/include/drm/tinydrm/tinydrm.h @@ -41,7 +41,7 @@ pipe_to_tinydrm(struct drm_simple_display_pipe *pipe) * the &drm_driver structure. */ #define TINYDRM_GEM_DRIVER_OPS \ - .gem_free_object = tinydrm_gem_cma_free_object, \ + .gem_free_object_unlocked = tinydrm_gem_cma_free_object, \ .gem_print_info = drm_gem_cma_print_info, \ .gem_vm_ops = &drm_gem_cma_vm_ops, \ .prime_handle_to_fd = drm_gem_prime_handle_to_fd, \ -- cgit v1.2.3 From d1a9d710d12485710d424daaa2430fe93bc767f8 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 27 Mar 2018 23:47:20 +0300 Subject: drm: prefer inline over __inline__ Remove last users of __inline__. Reviewed-by: Chris Wilson Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20180327204722.31246-1-jani.nikula@intel.com --- include/drm/drmP.h | 5 ++--- include/drm/drm_legacy.h | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/drm/drmP.h b/include/drm/drmP.h index c6666cd09347..4bbef061c9c0 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -123,8 +123,7 @@ static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev) #define DRM_SWITCH_POWER_CHANGING 2 #define DRM_SWITCH_POWER_DYNAMIC_OFF 3 -static __inline__ int drm_core_check_feature(struct drm_device *dev, - int feature) +static inline int drm_core_check_feature(struct drm_device *dev, int feature) { return ((dev->driver->driver_features & feature) ? 
1 : 0); } @@ -143,7 +142,7 @@ static __inline__ int drm_core_check_feature(struct drm_device *dev, /*@}*/ /* returns true if currently okay to sleep */ -static __inline__ bool drm_can_sleep(void) +static inline bool drm_can_sleep(void) { if (in_atomic() || in_dbg_master() || irqs_disabled()) return false; diff --git a/include/drm/drm_legacy.h b/include/drm/drm_legacy.h index cf0e7d89bcdf..8fad66f88e4f 100644 --- a/include/drm/drm_legacy.h +++ b/include/drm/drm_legacy.h @@ -194,8 +194,8 @@ void drm_legacy_ioremap(struct drm_local_map *map, struct drm_device *dev); void drm_legacy_ioremap_wc(struct drm_local_map *map, struct drm_device *dev); void drm_legacy_ioremapfree(struct drm_local_map *map, struct drm_device *dev); -static __inline__ struct drm_local_map *drm_legacy_findmap(struct drm_device *dev, - unsigned int token) +static inline struct drm_local_map *drm_legacy_findmap(struct drm_device *dev, + unsigned int token) { struct drm_map_list *_entry; list_for_each_entry(_entry, &dev->maplist, head) -- cgit v1.2.3 From 885a31cb6c752d5403adc6389894c27560fc6e6c Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 27 Mar 2018 23:47:21 +0300 Subject: drm: remove old documentation comment cruft from drmP.h Throw out the leftovers. Reviewed-by: Chris Wilson Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20180327204722.31246-2-jani.nikula@intel.com --- include/drm/drmP.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include') diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 4bbef061c9c0..b5d52a3d7d19 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -95,14 +95,6 @@ struct dma_buf_attachment; struct pci_dev; struct pci_controller; -/***********************************************************************/ -/** \name DRM template customization defaults */ -/*@{*/ - -/***********************************************************************/ -/** \name Internal types and structures */ -/*@{*/ - #define DRM_IF_VERSION(maj, min) (maj << 16 | min) /** @@ -128,19 +120,6 @@ static inline int drm_core_check_feature(struct drm_device *dev, int feature) return ((dev->driver->driver_features & feature) ? 1 : 0); } -/******************************************************************/ -/** \name Internal function definitions */ -/*@{*/ - - /* Driver support (drm_drv.h) */ - -/* - * These are exported to drivers so that they can implement fencing using - * DMA quiscent + idle. DMA quiescent usually requires the hardware lock. - */ - -/*@}*/ - /* returns true if currently okay to sleep */ static inline bool drm_can_sleep(void) { -- cgit v1.2.3 From f4392860b4fe55d7d7cadaa64743c9b2466e4fd8 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 27 Mar 2018 23:47:22 +0300 Subject: drm: make drm_core_check_feature() bool that it is Bool is the more appropriate return type here, use it. 
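
For illustration, a typical caller now reads as a natural boolean test
(hypothetical snippet, not part of this patch; DRIVER_MODESET is just one
example feature flag):

	if (!drm_core_check_feature(dev, DRIVER_MODESET))
		return -EINVAL;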
Reviewed-by: Chris Wilson
Signed-off-by: Jani Nikula
Link: https://patchwork.freedesktop.org/patch/msgid/20180327204722.31246-3-jani.nikula@intel.com
---
 include/drm/drmP.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index b5d52a3d7d19..f5099c12c6a6 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -115,9 +115,9 @@ static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev)
 #define DRM_SWITCH_POWER_CHANGING 2
 #define DRM_SWITCH_POWER_DYNAMIC_OFF 3
 
-static inline int drm_core_check_feature(struct drm_device *dev, int feature)
+static inline bool drm_core_check_feature(struct drm_device *dev, int feature)
 {
-	return ((dev->driver->driver_features & feature) ? 1 : 0);
+	return dev->driver->driver_features & feature;
 }
 
 /* returns true if currently okay to sleep */
-- cgit v1.2.3

From 49efffc7fbd48d5ea3d0dd60c218c7502d4a179d Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi
Date: Wed, 21 Mar 2018 12:20:24 +0200
Subject: drm: Add drm_mode_config->normalize_zpos boolean

Instead of drivers duplicating the drm_atomic_helper_check() code to be
able to normalize the zpos, they can use the normalize_zpos flag to let
the drm core do it.

Signed-off-by: Peter Ujfalusi
Signed-off-by: Tomi Valkeinen
Link: https://patchwork.freedesktop.org/patch/msgid/20180321102029.15248-2-peter.ujfalusi@ti.com
---
 drivers/gpu/drm/drm_atomic_helper.c | 11 +++++++++++
 include/drm/drm_mode_config.h       |  8 ++++++++
 include/drm/drm_plane.h             |  4 ++--
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index c35654591c12..d63c806e7d38 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -875,6 +875,11 @@ EXPORT_SYMBOL(drm_atomic_helper_check_planes);
  * functions depend upon an updated adjusted_mode.clock to e.g. properly compute
  * watermarks.
  *
+ * Note that zpos normalization will add all enabled planes to the state, which
+ * might not be desired for some drivers.
+ * For example, enabling or disabling a cursor plane with a fixed zpos value
+ * would force all other enabled planes into the state change.
+ *
  * RETURNS:
  * Zero for success or -errno
  */
@@ -887,6 +892,12 @@ int drm_atomic_helper_check(struct drm_device *dev,
 	if (ret)
 		return ret;
 
+	if (dev->mode_config.normalize_zpos) {
+		ret = drm_atomic_normalize_zpos(dev, state);
+		if (ret)
+			return ret;
+	}
+
 	ret = drm_atomic_helper_check_planes(dev, state);
 	if (ret)
 		return ret;
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7569f22ffef6..33b3a96d66d0 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -795,6 +795,14 @@ struct drm_mode_config {
 	 */
 	bool allow_fb_modifiers;
 
+	/**
+	 * @normalize_zpos:
+	 *
+	 * If true, the drm core will call drm_atomic_normalize_zpos() as part
+	 * of atomic mode checking from drm_atomic_helper_check().
+	 */
+	bool normalize_zpos;
+
 	/**
 	 * @modifiers_property: Plane property to list support modifier/format
 	 * combination.
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index f7bf4a48b1c3..d6da26d66a4b 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -51,8 +51,8 @@ struct drm_modeset_acquire_ctx;
  *	plane with a lower ID.
  * @normalized_zpos: normalized value of zpos: unique, range from 0 to N-1
  *	where N is the number of active planes for given crtc.
Note that - * the driver must call drm_atomic_normalize_zpos() to update this before - * it can be trusted. + * the driver must set drm_mode_config.normalize_zpos or call + * drm_atomic_normalize_zpos() to update this before it can be trusted. * @src: clipped source coordinates of the plane (in 16.16) * @dst: clipped destination coordinates of the plane * @state: backpointer to global drm_atomic_state -- cgit v1.2.3 From 0c9c7fd00e17907efb35697ecb9f2df39a0b536c Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 22 Mar 2018 22:27:37 +0200 Subject: drm/simple-kms-helper: Plumb plane state to the enable hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tinydrm enable hook wants to play around with the new fb in .atomic_enable(), thus we'll need access to the plane state. Performed with coccinelle: @r1@ identifier F =~ ".*enable$"; identifier P, CS; @@ F( struct drm_simple_display_pipe *P ,struct drm_crtc_state *CS + ,struct drm_plane_state *plane_state ) { ... } @@ struct drm_simple_display_pipe *P; expression E; @@ { + struct drm_plane *plane; ... + plane = &P->plane; P->funcs->enable(P ,E + ,plane->state ); ... } @@ identifier P, CS; @@ struct drm_simple_display_pipe_funcs { ... void (*enable)(struct drm_simple_display_pipe *P ,struct drm_crtc_state *CS + ,struct drm_plane_state *plane_state ); ... }; v2: Pimp the commit message (David) Cc: Marek Vasut Cc: Eric Anholt Cc: David Lechner Cc: "Noralf Trønnes" Cc: Linus Walleij Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20180322202738.25817-1-ville.syrjala@linux.intel.com Reviewed-by: Noralf Trønnes --- drivers/gpu/drm/drm_simple_kms_helper.c | 4 +++- drivers/gpu/drm/mxsfb/mxsfb_drv.c | 3 ++- drivers/gpu/drm/pl111/pl111_display.c | 3 ++- drivers/gpu/drm/tinydrm/ili9225.c | 3 ++- drivers/gpu/drm/tinydrm/mi0283qt.c | 3 ++- drivers/gpu/drm/tinydrm/repaper.c | 3 ++- drivers/gpu/drm/tinydrm/st7586.c | 3 ++- drivers/gpu/drm/tinydrm/st7735r.c | 3 ++- drivers/gpu/drm/tve200/tve200_display.c | 3 ++- include/drm/drm_simple_kms_helper.h | 3 ++- 10 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_simple_kms_helper.c b/drivers/gpu/drm/drm_simple_kms_helper.c index 987a353c7f72..7a00455ca568 100644 --- a/drivers/gpu/drm/drm_simple_kms_helper.c +++ b/drivers/gpu/drm/drm_simple_kms_helper.c @@ -64,13 +64,15 @@ static int drm_simple_kms_crtc_check(struct drm_crtc *crtc, static void drm_simple_kms_crtc_enable(struct drm_crtc *crtc, struct drm_crtc_state *old_state) { + struct drm_plane *plane; struct drm_simple_display_pipe *pipe; pipe = container_of(crtc, struct drm_simple_display_pipe, crtc); if (!pipe->funcs || !pipe->funcs->enable) return; - pipe->funcs->enable(pipe, crtc->state); + plane = &pipe->plane; + pipe->funcs->enable(pipe, crtc->state, plane->state); } static void drm_simple_kms_crtc_disable(struct drm_crtc *crtc, diff --git a/drivers/gpu/drm/mxsfb/mxsfb_drv.c b/drivers/gpu/drm/mxsfb/mxsfb_drv.c index 5cae8db9dcd4..b9c7507813db 100644 --- a/drivers/gpu/drm/mxsfb/mxsfb_drv.c +++ b/drivers/gpu/drm/mxsfb/mxsfb_drv.c @@ -99,7 +99,8 @@ static const struct drm_mode_config_funcs mxsfb_mode_config_funcs = { }; static void mxsfb_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct mxsfb_drm_private *mxsfb = drm_pipe_to_mxsfb_drm_private(pipe); diff --git a/drivers/gpu/drm/pl111/pl111_display.c 
b/drivers/gpu/drm/pl111/pl111_display.c index 310646427907..1fee578e05b0 100644 --- a/drivers/gpu/drm/pl111/pl111_display.c +++ b/drivers/gpu/drm/pl111/pl111_display.c @@ -120,7 +120,8 @@ static int pl111_display_check(struct drm_simple_display_pipe *pipe, } static void pl111_display_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *cstate) + struct drm_crtc_state *cstate, + struct drm_plane_state *plane_state) { struct drm_crtc *crtc = &pipe->crtc; struct drm_plane *plane = &pipe->plane; diff --git a/drivers/gpu/drm/tinydrm/ili9225.c b/drivers/gpu/drm/tinydrm/ili9225.c index a0759502b81a..089d22798c8b 100644 --- a/drivers/gpu/drm/tinydrm/ili9225.c +++ b/drivers/gpu/drm/tinydrm/ili9225.c @@ -176,7 +176,8 @@ static const struct drm_framebuffer_funcs ili9225_fb_funcs = { }; static void ili9225_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/mi0283qt.c b/drivers/gpu/drm/tinydrm/mi0283qt.c index d8ed6e6f8e05..82ad9b61898e 100644 --- a/drivers/gpu/drm/tinydrm/mi0283qt.c +++ b/drivers/gpu/drm/tinydrm/mi0283qt.c @@ -49,7 +49,8 @@ #define ILI9341_MADCTL_MY BIT(7) static void mi0283qt_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/repaper.c b/drivers/gpu/drm/tinydrm/repaper.c index 75740630c410..33b4a71916e4 100644 --- a/drivers/gpu/drm/tinydrm/repaper.c +++ b/drivers/gpu/drm/tinydrm/repaper.c @@ -659,7 +659,8 @@ static void power_off(struct repaper_epd *epd) } static void repaper_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct repaper_epd *epd = epd_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/st7586.c b/drivers/gpu/drm/tinydrm/st7586.c index a6396ef9cc4a..bb08b293c8ce 100644 --- a/drivers/gpu/drm/tinydrm/st7586.c +++ b/drivers/gpu/drm/tinydrm/st7586.c @@ -175,7 +175,8 @@ static const struct drm_framebuffer_funcs st7586_fb_funcs = { }; static void st7586_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tinydrm/st7735r.c b/drivers/gpu/drm/tinydrm/st7735r.c index 67d197ecfc4b..19b28f8c78db 100644 --- a/drivers/gpu/drm/tinydrm/st7735r.c +++ b/drivers/gpu/drm/tinydrm/st7735r.c @@ -37,7 +37,8 @@ #define ST7735R_MV BIT(5) static void jd_t18003_t01_pipe_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state) + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { struct tinydrm_device *tdev = pipe_to_tinydrm(pipe); struct mipi_dbi *mipi = mipi_dbi_from_tinydrm(tdev); diff --git a/drivers/gpu/drm/tve200/tve200_display.c b/drivers/gpu/drm/tve200/tve200_display.c index db397fcb345a..108f3b2b5d25 100644 --- a/drivers/gpu/drm/tve200/tve200_display.c +++ b/drivers/gpu/drm/tve200/tve200_display.c @@ -120,7 
+120,8 @@ static int tve200_display_check(struct drm_simple_display_pipe *pipe, } static void tve200_display_enable(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *cstate) + struct drm_crtc_state *cstate, + struct drm_plane_state *plane_state) { struct drm_crtc *crtc = &pipe->crtc; struct drm_plane *plane = &pipe->plane; diff --git a/include/drm/drm_simple_kms_helper.h b/include/drm/drm_simple_kms_helper.h index 1b4e352143fd..b02793742317 100644 --- a/include/drm/drm_simple_kms_helper.h +++ b/include/drm/drm_simple_kms_helper.h @@ -64,7 +64,8 @@ struct drm_simple_display_pipe_funcs { * This hook is optional. */ void (*enable)(struct drm_simple_display_pipe *pipe, - struct drm_crtc_state *crtc_state); + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state); /** * @disable: * -- cgit v1.2.3 From e85d30060eddccfcfbf7fdbd61a23cfbda05cc59 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Fri, 23 Mar 2018 17:35:09 +0200 Subject: drm/tinydrm: Make fb_dirty into a lower level hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mipi_dbi_enable_flush() wants to call the fb->dirty() hook from the bowels of the .atomic_enable() hook. That prevents us from taking the plane mutex in fb->dirty() unless we also plumb down the acquire context. Instead it seems simpler to split the fb->dirty() into a tinydrm specific lower level hook that can be called from mipi_dbi_enable_flush() and from a generic higher level tinydrm_fb_dirty() helper. As we don't have a tinydrm specific vfuncs table we'll just stick it into tinydrm_device directly for now. v2: Deal with the fb->dirty() in tinydrm_display_pipe_update() as well (Noralf) Cc: "Noralf Trønnes" Cc: David Lechner Signed-off-by: Ville Syrjälä Reviewed-by: Noralf Trønnes Link: https://patchwork.freedesktop.org/patch/msgid/20180323153509.15287-1-ville.syrjala@linux.intel.com Reviewed-by: Noralf Trønnes Tested-by: Noralf Trønnes --- drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c | 30 ++++++++++++++++++++++++++ drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c | 5 ++--- drivers/gpu/drm/tinydrm/ili9225.c | 23 ++++++-------------- drivers/gpu/drm/tinydrm/mi0283qt.c | 2 +- drivers/gpu/drm/tinydrm/mipi-dbi.c | 30 ++++++++++---------------- drivers/gpu/drm/tinydrm/repaper.c | 28 ++++++++---------------- drivers/gpu/drm/tinydrm/st7586.c | 23 ++++++-------------- drivers/gpu/drm/tinydrm/st7735r.c | 2 +- include/drm/tinydrm/mipi-dbi.h | 4 +++- include/drm/tinydrm/tinydrm-helpers.h | 5 +++++ include/drm/tinydrm/tinydrm.h | 4 ++++ 11 files changed, 78 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c index d1c3ce9ab294..dcd390163a4a 100644 --- a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c +++ b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c @@ -78,6 +78,36 @@ bool tinydrm_merge_clips(struct drm_clip_rect *dst, } EXPORT_SYMBOL(tinydrm_merge_clips); +int tinydrm_fb_dirty(struct drm_framebuffer *fb, + struct drm_file *file_priv, + unsigned int flags, unsigned int color, + struct drm_clip_rect *clips, + unsigned int num_clips) +{ + struct tinydrm_device *tdev = fb->dev->dev_private; + struct drm_plane *plane = &tdev->pipe.plane; + int ret = 0; + + drm_modeset_lock(&plane->mutex, NULL); + + /* fbdev can flush even when we're not interested */ + if (plane->state->fb == fb) { + mutex_lock(&tdev->dirty_lock); + ret = tdev->fb_dirty(fb, file_priv, flags, + color, clips, num_clips); + 
mutex_unlock(&tdev->dirty_lock); + } + + drm_modeset_unlock(&plane->mutex); + + if (ret) + dev_err_once(fb->dev->dev, + "Failed to update display %d\n", ret); + + return ret; +} +EXPORT_SYMBOL(tinydrm_fb_dirty); + /** * tinydrm_memcpy - Copy clip buffer * @dst: Destination buffer diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c index 11ae950b0fc9..e68b528ae64d 100644 --- a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c +++ b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c @@ -125,9 +125,8 @@ void tinydrm_display_pipe_update(struct drm_simple_display_pipe *pipe, struct drm_crtc *crtc = &tdev->pipe.crtc; if (fb && (fb != old_state->fb)) { - pipe->plane.fb = fb; - if (fb->funcs->dirty) - fb->funcs->dirty(fb, NULL, 0, 0, NULL, 0); + if (tdev->fb_dirty) + tdev->fb_dirty(fb, NULL, 0, 0, NULL, 0); } if (crtc->state->event) { diff --git a/drivers/gpu/drm/tinydrm/ili9225.c b/drivers/gpu/drm/tinydrm/ili9225.c index 089d22798c8b..0874e877b111 100644 --- a/drivers/gpu/drm/tinydrm/ili9225.c +++ b/drivers/gpu/drm/tinydrm/ili9225.c @@ -88,14 +88,8 @@ static int ili9225_fb_dirty(struct drm_framebuffer *fb, bool full; void *tr; - mutex_lock(&tdev->dirty_lock); - if (!mipi->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; full = tinydrm_merge_clips(&clip, clips, num_clips, flags, fb->width, fb->height); @@ -108,7 +102,7 @@ static int ili9225_fb_dirty(struct drm_framebuffer *fb, tr = mipi->tx_buf; ret = mipi_dbi_buf_copy(mipi->tx_buf, fb, &clip, swap); if (ret) - goto out_unlock; + return ret; } else { tr = cma_obj->vaddr; } @@ -159,20 +153,13 @@ static int ili9225_fb_dirty(struct drm_framebuffer *fb, ret = mipi_dbi_command_buf(mipi, ILI9225_WRITE_DATA_TO_GRAM, tr, (clip.x2 - clip.x1) * (clip.y2 - clip.y1) * 2); -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - dev_err_once(fb->dev->dev, "Failed to update display %d\n", - ret); - return ret; } static const struct drm_framebuffer_funcs ili9225_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = ili9225_fb_dirty, + .dirty = tinydrm_fb_dirty, }; static void ili9225_pipe_enable(struct drm_simple_display_pipe *pipe, @@ -269,7 +256,7 @@ static void ili9225_pipe_enable(struct drm_simple_display_pipe *pipe, ili9225_command(mipi, ILI9225_DISPLAY_CONTROL_1, 0x1017); - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static void ili9225_pipe_disable(struct drm_simple_display_pipe *pipe) @@ -342,6 +329,8 @@ static int ili9225_init(struct device *dev, struct mipi_dbi *mipi, if (ret) return ret; + tdev->fb_dirty = ili9225_fb_dirty; + ret = tinydrm_display_pipe_init(tdev, pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, ili9225_formats, diff --git a/drivers/gpu/drm/tinydrm/mi0283qt.c b/drivers/gpu/drm/tinydrm/mi0283qt.c index 82ad9b61898e..4e6d2ee94e55 100644 --- a/drivers/gpu/drm/tinydrm/mi0283qt.c +++ b/drivers/gpu/drm/tinydrm/mi0283qt.c @@ -127,7 +127,7 @@ static void mi0283qt_enable(struct drm_simple_display_pipe *pipe, msleep(100); out_enable: - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static const struct drm_simple_display_pipe_funcs mi0283qt_pipe_funcs = { diff --git a/drivers/gpu/drm/tinydrm/mipi-dbi.c b/drivers/gpu/drm/tinydrm/mipi-dbi.c index 9e903812b573..4d1fb31a781f 100644 --- a/drivers/gpu/drm/tinydrm/mipi-dbi.c +++ b/drivers/gpu/drm/tinydrm/mipi-dbi.c @@ -219,14 +219,8 @@ static int 
mipi_dbi_fb_dirty(struct drm_framebuffer *fb, bool full; void *tr; - mutex_lock(&tdev->dirty_lock); - if (!mipi->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; full = tinydrm_merge_clips(&clip, clips, num_clips, flags, fb->width, fb->height); @@ -239,7 +233,7 @@ static int mipi_dbi_fb_dirty(struct drm_framebuffer *fb, tr = mipi->tx_buf; ret = mipi_dbi_buf_copy(mipi->tx_buf, fb, &clip, swap); if (ret) - goto out_unlock; + return ret; } else { tr = cma_obj->vaddr; } @@ -254,20 +248,13 @@ static int mipi_dbi_fb_dirty(struct drm_framebuffer *fb, ret = mipi_dbi_command_buf(mipi, MIPI_DCS_WRITE_MEMORY_START, tr, (clip.x2 - clip.x1) * (clip.y2 - clip.y1) * 2); -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - dev_err_once(fb->dev->dev, "Failed to update display %d\n", - ret); - return ret; } static const struct drm_framebuffer_funcs mipi_dbi_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = mipi_dbi_fb_dirty, + .dirty = tinydrm_fb_dirty, }; /** @@ -278,13 +265,16 @@ static const struct drm_framebuffer_funcs mipi_dbi_fb_funcs = { * enables the backlight. Drivers can use this in their * &drm_simple_display_pipe_funcs->enable callback. */ -void mipi_dbi_enable_flush(struct mipi_dbi *mipi) +void mipi_dbi_enable_flush(struct mipi_dbi *mipi, + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state) { - struct drm_framebuffer *fb = mipi->tinydrm.pipe.plane.fb; + struct tinydrm_device *tdev = &mipi->tinydrm; + struct drm_framebuffer *fb = plane_state->fb; mipi->enabled = true; if (fb) - fb->funcs->dirty(fb, NULL, 0, 0, NULL, 0); + tdev->fb_dirty(fb, NULL, 0, 0, NULL, 0); backlight_enable(mipi->backlight); } @@ -381,6 +371,8 @@ int mipi_dbi_init(struct device *dev, struct mipi_dbi *mipi, if (ret) return ret; + tdev->fb_dirty = mipi_dbi_fb_dirty; + /* TODO: Maybe add DRM_MODE_CONNECTOR_SPI */ ret = tinydrm_display_pipe_init(tdev, pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, diff --git a/drivers/gpu/drm/tinydrm/repaper.c b/drivers/gpu/drm/tinydrm/repaper.c index 33b4a71916e4..bb6f80a81899 100644 --- a/drivers/gpu/drm/tinydrm/repaper.c +++ b/drivers/gpu/drm/tinydrm/repaper.c @@ -540,14 +540,8 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, clip.y1 = 0; clip.y2 = fb->height; - mutex_lock(&tdev->dirty_lock); - if (!epd->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; repaper_get_temperature(epd); @@ -555,16 +549,14 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, epd->factored_stage_time); buf = kmalloc(fb->width * fb->height, GFP_KERNEL); - if (!buf) { - ret = -ENOMEM; - goto out_unlock; - } + if (!buf) + return -ENOMEM; if (import_attach) { ret = dma_buf_begin_cpu_access(import_attach->dmabuf, DMA_FROM_DEVICE); if (ret) - goto out_unlock; + goto out_free; } tinydrm_xrgb8888_to_gray8(buf, cma_obj->vaddr, fb, &clip); @@ -573,7 +565,7 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, ret = dma_buf_end_cpu_access(import_attach->dmabuf, DMA_FROM_DEVICE); if (ret) - goto out_unlock; + goto out_free; } repaper_gray8_to_mono_reversed(buf, fb->width, fb->height); @@ -625,11 +617,7 @@ static int repaper_fb_dirty(struct drm_framebuffer *fb, } } -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - DRM_DEV_ERROR(fb->dev->dev, "Failed to update display (%d)\n", ret); +out_free: kfree(buf); return ret; @@ -638,7 
+626,7 @@ out_unlock: static const struct drm_framebuffer_funcs repaper_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = repaper_fb_dirty, + .dirty = tinydrm_fb_dirty, }; static void power_off(struct repaper_epd *epd) @@ -1070,6 +1058,8 @@ static int repaper_probe(struct spi_device *spi) if (ret) return ret; + tdev->fb_dirty = repaper_fb_dirty; + ret = tinydrm_display_pipe_init(tdev, &repaper_pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, repaper_formats, diff --git a/drivers/gpu/drm/tinydrm/st7586.c b/drivers/gpu/drm/tinydrm/st7586.c index bb08b293c8ce..22644b88199a 100644 --- a/drivers/gpu/drm/tinydrm/st7586.c +++ b/drivers/gpu/drm/tinydrm/st7586.c @@ -120,14 +120,8 @@ static int st7586_fb_dirty(struct drm_framebuffer *fb, int start, end; int ret = 0; - mutex_lock(&tdev->dirty_lock); - if (!mipi->enabled) - goto out_unlock; - - /* fbdev can flush even when we're not interested */ - if (tdev->pipe.plane.fb != fb) - goto out_unlock; + return 0; tinydrm_merge_clips(&clip, clips, num_clips, flags, fb->width, fb->height); @@ -141,7 +135,7 @@ static int st7586_fb_dirty(struct drm_framebuffer *fb, ret = st7586_buf_copy(mipi->tx_buf, fb, &clip); if (ret) - goto out_unlock; + return ret; /* Pixels are packed 3 per byte */ start = clip.x1 / 3; @@ -158,20 +152,13 @@ static int st7586_fb_dirty(struct drm_framebuffer *fb, (u8 *)mipi->tx_buf, (end - start) * (clip.y2 - clip.y1)); -out_unlock: - mutex_unlock(&tdev->dirty_lock); - - if (ret) - dev_err_once(fb->dev->dev, "Failed to update display %d\n", - ret); - return ret; } static const struct drm_framebuffer_funcs st7586_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, - .dirty = st7586_fb_dirty, + .dirty = tinydrm_fb_dirty, }; static void st7586_pipe_enable(struct drm_simple_display_pipe *pipe, @@ -238,7 +225,7 @@ static void st7586_pipe_enable(struct drm_simple_display_pipe *pipe, mipi_dbi_command(mipi, MIPI_DCS_SET_DISPLAY_ON); - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static void st7586_pipe_disable(struct drm_simple_display_pipe *pipe) @@ -278,6 +265,8 @@ static int st7586_init(struct device *dev, struct mipi_dbi *mipi, if (ret) return ret; + tdev->fb_dirty = st7586_fb_dirty; + ret = tinydrm_display_pipe_init(tdev, pipe_funcs, DRM_MODE_CONNECTOR_VIRTUAL, st7586_formats, diff --git a/drivers/gpu/drm/tinydrm/st7735r.c b/drivers/gpu/drm/tinydrm/st7735r.c index 19b28f8c78db..189a07894d36 100644 --- a/drivers/gpu/drm/tinydrm/st7735r.c +++ b/drivers/gpu/drm/tinydrm/st7735r.c @@ -99,7 +99,7 @@ static void jd_t18003_t01_pipe_enable(struct drm_simple_display_pipe *pipe, msleep(20); - mipi_dbi_enable_flush(mipi); + mipi_dbi_enable_flush(mipi, crtc_state, plane_state); } static const struct drm_simple_display_pipe_funcs jd_t18003_t01_pipe_funcs = { diff --git a/include/drm/tinydrm/mipi-dbi.h b/include/drm/tinydrm/mipi-dbi.h index 44e824af2ef6..b8ba58861986 100644 --- a/include/drm/tinydrm/mipi-dbi.h +++ b/include/drm/tinydrm/mipi-dbi.h @@ -67,7 +67,9 @@ int mipi_dbi_init(struct device *dev, struct mipi_dbi *mipi, const struct drm_simple_display_pipe_funcs *pipe_funcs, struct drm_driver *driver, const struct drm_display_mode *mode, unsigned int rotation); -void mipi_dbi_enable_flush(struct mipi_dbi *mipi); +void mipi_dbi_enable_flush(struct mipi_dbi *mipi, + struct drm_crtc_state *crtc_state, + struct drm_plane_state *plane_state); void mipi_dbi_pipe_disable(struct drm_simple_display_pipe *pipe); void mipi_dbi_hw_reset(struct
mipi_dbi *mipi); bool mipi_dbi_display_is_on(struct mipi_dbi *mipi); diff --git a/include/drm/tinydrm/tinydrm-helpers.h b/include/drm/tinydrm/tinydrm-helpers.h index 0a4ddbc04c60..5b96f0b12c8c 100644 --- a/include/drm/tinydrm/tinydrm-helpers.h +++ b/include/drm/tinydrm/tinydrm-helpers.h @@ -36,6 +36,11 @@ static inline bool tinydrm_machine_little_endian(void) bool tinydrm_merge_clips(struct drm_clip_rect *dst, struct drm_clip_rect *src, unsigned int num_clips, unsigned int flags, u32 max_width, u32 max_height); +int tinydrm_fb_dirty(struct drm_framebuffer *fb, + struct drm_file *file_priv, + unsigned int flags, unsigned int color, + struct drm_clip_rect *clips, + unsigned int num_clips); void tinydrm_memcpy(void *dst, void *vaddr, struct drm_framebuffer *fb, struct drm_clip_rect *clip); void tinydrm_swab16(u16 *dst, void *vaddr, struct drm_framebuffer *fb, diff --git a/include/drm/tinydrm/tinydrm.h b/include/drm/tinydrm/tinydrm.h index 77a93ec577fd..6e2b960e25eb 100644 --- a/include/drm/tinydrm/tinydrm.h +++ b/include/drm/tinydrm/tinydrm.h @@ -26,6 +26,10 @@ struct tinydrm_device { struct drm_simple_display_pipe pipe; struct mutex dirty_lock; const struct drm_framebuffer_funcs *fb_funcs; + int (*fb_dirty)(struct drm_framebuffer *framebuffer, + struct drm_file *file_priv, unsigned flags, + unsigned color, struct drm_clip_rect *clips, + unsigned num_clips); }; static inline struct tinydrm_device * -- cgit v1.2.3 From bee330f3d67273a68dcb99f59480d59553c008b2 Mon Sep 17 00:00:00 2001 From: Noralf Trønnes Date: Wed, 28 Mar 2018 10:38:35 +0300 Subject: drm: Use srcu to protect drm_device.unplugged MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use srcu to protect drm_device.unplugged in a race free manner. Drivers can use drm_dev_enter()/drm_dev_exit() to protect and mark sections preventing access to device resources that are not available after the device is gone. Suggested-by: Daniel Vetter Signed-off-by: Noralf Trønnes Signed-off-by: Oleksandr Andrushchenko Reviewed-by: Oleksandr Andrushchenko Tested-by: Oleksandr Andrushchenko Cc: intel-gfx@lists.freedesktop.org Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/1522222715-11814-1-git-send-email-andr2000@gmail.com --- drivers/gpu/drm/drm_drv.c | 54 ++++++++++++++++++++++++++++++++++++++++++----- include/drm/drm_device.h | 9 +++++++- include/drm/drm_drv.h | 15 +++++++++---- 3 files changed, 68 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index a1b9338736e3..32a83b41ab61 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -75,6 +76,8 @@ static bool drm_core_init_complete = false; static struct dentry *drm_debugfs_root; +DEFINE_STATIC_SRCU(drm_unplug_srcu); + /* * DRM Minors * A DRM device can provide several char-dev interfaces on the DRM-Major. Each @@ -318,18 +321,51 @@ void drm_put_dev(struct drm_device *dev) } EXPORT_SYMBOL(drm_put_dev); -static void drm_device_set_unplugged(struct drm_device *dev) +/** + * drm_dev_enter - Enter device critical section + * @dev: DRM device + * @idx: Pointer to index that will be passed to the matching drm_dev_exit() + * + * This function marks and protects the beginning of a section that should not + * be entered after the device has been unplugged. The section end is marked + * with drm_dev_exit(). Calls to this function can be nested. 
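+ *
+ * Editor's note, an illustrative sketch that is not part of the original
+ * patch: a driver entry point would typically guard hardware access as
+ *
+ *	int idx;
+ *
+ *	if (!drm_dev_enter(dev, &idx))
+ *		return -ENODEV;
+ *	...access device resources...
+ *	drm_dev_exit(idx);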
+ * + * Returns: + * True if it is OK to enter the section, false otherwise. + */ +bool drm_dev_enter(struct drm_device *dev, int *idx) +{ + *idx = srcu_read_lock(&drm_unplug_srcu); + + if (dev->unplugged) { + srcu_read_unlock(&drm_unplug_srcu, *idx); + return false; + } + + return true; +} +EXPORT_SYMBOL(drm_dev_enter); + +/** + * drm_dev_exit - Exit device critical section + * @idx: index returned from drm_dev_enter() + * + * This function marks the end of a section that should not be entered after + * the device has been unplugged. + */ +void drm_dev_exit(int idx) { - smp_wmb(); - atomic_set(&dev->unplugged, 1); + srcu_read_unlock(&drm_unplug_srcu, idx); } +EXPORT_SYMBOL(drm_dev_exit); /** * drm_dev_unplug - unplug a DRM device * @dev: DRM device * * This unplugs a hotpluggable DRM device, which makes it inaccessible to - * userspace operations. Entry-points can use drm_dev_is_unplugged(). This + * userspace operations. Entry-points can use drm_dev_enter() and + * drm_dev_exit() to protect device resources in a race free manner. This * essentially unregisters the device like drm_dev_unregister(), but can be * called while there are still open users of @dev. */ @@ -338,10 +374,18 @@ void drm_dev_unplug(struct drm_device *dev) drm_dev_unregister(dev); mutex_lock(&drm_global_mutex); - drm_device_set_unplugged(dev); if (dev->open_count == 0) drm_dev_put(dev); mutex_unlock(&drm_global_mutex); + + /* + * After synchronizing any critical read section is guaranteed to see + * the new value of ->unplugged, and any critical section which might + * still have seen the old value of ->unplugged is guaranteed to have + * finished. + */ + dev->unplugged = true; + synchronize_srcu(&drm_unplug_srcu); } EXPORT_SYMBOL(drm_dev_unplug); diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h index 7c4fa32f3fc6..3a0eac2885b7 100644 --- a/include/drm/drm_device.h +++ b/include/drm/drm_device.h @@ -46,7 +46,14 @@ struct drm_device { /* currently active master for this device. Protected by master_mutex */ struct drm_master *master; - atomic_t unplugged; /**< Flag whether dev is dead */ + /** + * @unplugged: + * + * Flag to tell if the device has been unplugged. + * See drm_dev_enter() and drm_dev_is_unplugged(). 
+ */ + bool unplugged; + struct inode *anon_inode; /**< inode for private address-space */ char *unique; /**< unique name of the device */ /*@} */ diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h index d32b688eb346..ff7312c40cd8 100644 --- a/include/drm/drm_drv.h +++ b/include/drm/drm_drv.h @@ -623,6 +623,8 @@ void drm_dev_get(struct drm_device *dev); void drm_dev_put(struct drm_device *dev); void drm_dev_unref(struct drm_device *dev); void drm_put_dev(struct drm_device *dev); +bool drm_dev_enter(struct drm_device *dev, int *idx); +void drm_dev_exit(int idx); void drm_dev_unplug(struct drm_device *dev); /** @@ -634,11 +636,16 @@ void drm_dev_unplug(struct drm_device *dev); * unplugged, these two functions guarantee that any store before calling * drm_dev_unplug() is visible to callers of this function after it completes */ -static inline int drm_dev_is_unplugged(struct drm_device *dev) +static inline bool drm_dev_is_unplugged(struct drm_device *dev) { - int ret = atomic_read(&dev->unplugged); - smp_rmb(); - return ret; + int idx; + + if (drm_dev_enter(dev, &idx)) { + drm_dev_exit(idx); + return false; + } + + return true; } -- cgit v1.2.3 From 4f212e40468650e220c1770876c7f25b8e0c1ff5 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Wed, 28 Mar 2018 15:30:37 -0700 Subject: drm: Add DP PSR2 sink enable bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To comply with eDP 1.4a this bit should be set when enabling PSR2. Signed-off-by: José Roberto de Souza Reviewed-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180328223046.16125-1-jose.souza@intel.com --- include/drm/drm_dp_helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index 4de97e94ef9d..a62714578b93 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -477,6 +477,7 @@ # define DP_PSR_FRAME_CAPTURE (1 << 3) # define DP_PSR_SELECTIVE_UPDATE (1 << 4) # define DP_PSR_IRQ_HPD_WITH_CRC_ERRORS (1 << 5) +# define DP_PSR_ENABLE_PSR2 (1 << 6) /* eDP 1.4a */ #define DP_ADAPTER_CTRL 0x1a0 # define DP_ADAPTER_CTRL_FORCE_LOAD_SENSE (1 << 0) -- cgit v1.2.3 From fe36948afb08e361d12b2878348778aeaba74134 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Wed, 28 Mar 2018 15:30:38 -0700 Subject: drm: Add DP last received PSR SDP VSC register and bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a register to help debug what is in the last PSR SDP VSC packet received by the sink.
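[Editor's note: an illustrative sketch of how a driver might dump this register using the bit definitions added below. drm_dp_dpcd_readb() is the existing DP AUX helper; the aux pointer and the message format are assumptions, not part of the patch:

	u8 sdp;

	if (drm_dp_dpcd_readb(aux, DP_LAST_RECEIVED_PSR_SDP, &sdp) == 1)
		DRM_DEBUG_KMS("PSR SDP: state=%u su_valid=%u y_valid=%u\n",
			      !!(sdp & DP_PSR_STATE_BIT),
			      !!(sdp & DP_SU_VALID),
			      !!(sdp & DP_Y_COORDINATE_VALID));]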
Signed-off-by: José Roberto de Souza Reviewed-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180328223046.16125-2-jose.souza@intel.com --- include/drm/drm_dp_helper.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index a62714578b93..c6853f0fef2a 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -794,6 +794,15 @@ # define DP_LAST_ACTUAL_SYNCHRONIZATION_LATENCY_MASK (0xf << 4) # define DP_LAST_ACTUAL_SYNCHRONIZATION_LATENCY_SHIFT 4 +#define DP_LAST_RECEIVED_PSR_SDP 0x200a /* eDP 1.2 */ +# define DP_PSR_STATE_BIT (1 << 0) /* eDP 1.2 */ +# define DP_UPDATE_RFB_BIT (1 << 1) /* eDP 1.2 */ +# define DP_CRC_VALID_BIT (1 << 2) /* eDP 1.2 */ +# define DP_SU_VALID (1 << 3) /* eDP 1.4 */ +# define DP_FIRST_SCAN_LINE_SU_REGION (1 << 4) /* eDP 1.4 */ +# define DP_LAST_SCAN_LINE_SU_REGION (1 << 5) /* eDP 1.4 */ +# define DP_Y_COORDINATE_VALID (1 << 6) /* eDP 1.4a */ + #define DP_RECEIVER_ALPM_STATUS 0x200b /* eDP 1.4 */ # define DP_ALPM_LOCK_TIMEOUT_ERROR (1 << 0) -- cgit v1.2.3 From a2c09ac0fb6756d7085c359b6c020ef8b4205e0f Mon Sep 17 00:00:00 2001 From: Inju Song Date: Tue, 27 Mar 2018 23:14:40 +0900 Subject: netfilter: ipvs: Keep latest weight of destination Hash-based schedulers such as source hash or maglev hash should ignore a weight change to 0, while still allowing the weight to change between non-zero values. So struct ip_vs_dest needs to keep the latest non-zero weight. Signed-off-by: Inju Song Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..0ac795b41ab8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -668,6 +668,7 @@ struct ip_vs_dest { volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ + atomic_t last_weight; /* server latest weight */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5ebde4b15810..b91bb70ece92 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; + /* keep the last_weight with latest non-0 weight */ + if (add || udest->weight != 0) + atomic_set(&dest->last_weight, udest->weight); + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; -- cgit v1.2.3 From a70cdb9eddcfd4ba20d69b84149b4a38648455ac Mon Sep 17 00:00:00 2001 From: Nayan Deshmukh Date: Thu, 29 Mar 2018 22:36:33 +0530 Subject: drm/scheduler: move the tracepoints file from the include directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move it in with the scheduler code.
This is mostly a straightforward rename with no code change, except for updating the TRACE_INCLUDE_PATH. Signed-off-by: Nayan Deshmukh Suggested-by: Christian König Reviewed-by: Christian König Acked-by: Lucas Stach Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/gpu_scheduler.c | 2 +- drivers/gpu/drm/scheduler/gpu_scheduler_trace.h | 82 +++++++++++++++++++++++++ include/drm/gpu_scheduler_trace.h | 82 ------------------------- 3 files changed, 83 insertions(+), 83 deletions(-) create mode 100644 drivers/gpu/drm/scheduler/gpu_scheduler_trace.h delete mode 100644 include/drm/gpu_scheduler_trace.h (limited to 'include') diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 1d368bc66ac2..310275eaf128 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -30,7 +30,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include "gpu_scheduler_trace.h" #define to_drm_sched_job(sched_job) \ container_of((sched_job), struct drm_sched_job, queue_node) diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h new file mode 100644 index 000000000000..4998ad950a48 --- /dev/null +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h @@ -0,0 +1,82 @@ +/* + * Copyright 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE.
+ * + */ + +#if !defined(_GPU_SCHED_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _GPU_SCHED_TRACE_H_ + +#include +#include +#include + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gpu_scheduler +#define TRACE_INCLUDE_FILE gpu_scheduler_trace + +TRACE_EVENT(drm_sched_job, + TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), + TP_ARGS(sched_job, entity), + TP_STRUCT__entry( + __field(struct drm_sched_entity *, entity) + __field(struct dma_fence *, fence) + __field(const char *, name) + __field(uint64_t, id) + __field(u32, job_count) + __field(int, hw_job_count) + ), + + TP_fast_assign( + __entry->entity = entity; + __entry->id = sched_job->id; + __entry->fence = &sched_job->s_fence->finished; + __entry->name = sched_job->sched->name; + __entry->job_count = spsc_queue_count(&entity->job_queue); + __entry->hw_job_count = atomic_read( + &sched_job->sched->hw_rq_count); + ), + TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d", + __entry->entity, __entry->id, + __entry->fence, __entry->name, + __entry->job_count, __entry->hw_job_count) +); + +TRACE_EVENT(drm_sched_process_job, + TP_PROTO(struct drm_sched_fence *fence), + TP_ARGS(fence), + TP_STRUCT__entry( + __field(struct dma_fence *, fence) + ), + + TP_fast_assign( + __entry->fence = &fence->finished; + ), + TP_printk("fence=%p signaled", __entry->fence) +); + +#endif + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/scheduler +#include diff --git a/include/drm/gpu_scheduler_trace.h b/include/drm/gpu_scheduler_trace.h deleted file mode 100644 index 0789e8d0a0e1..000000000000 --- a/include/drm/gpu_scheduler_trace.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#if !defined(_GPU_SCHED_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _GPU_SCHED_TRACE_H_ - -#include -#include -#include - -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM gpu_scheduler -#define TRACE_INCLUDE_FILE gpu_scheduler_trace - -TRACE_EVENT(drm_sched_job, - TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity), - TP_ARGS(sched_job, entity), - TP_STRUCT__entry( - __field(struct drm_sched_entity *, entity) - __field(struct dma_fence *, fence) - __field(const char *, name) - __field(uint64_t, id) - __field(u32, job_count) - __field(int, hw_job_count) - ), - - TP_fast_assign( - __entry->entity = entity; - __entry->id = sched_job->id; - __entry->fence = &sched_job->s_fence->finished; - __entry->name = sched_job->sched->name; - __entry->job_count = spsc_queue_count(&entity->job_queue); - __entry->hw_job_count = atomic_read( - &sched_job->sched->hw_rq_count); - ), - TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d", - __entry->entity, __entry->id, - __entry->fence, __entry->name, - __entry->job_count, __entry->hw_job_count) -); - -TRACE_EVENT(drm_sched_process_job, - TP_PROTO(struct drm_sched_fence *fence), - TP_ARGS(fence), - TP_STRUCT__entry( - __field(struct dma_fence *, fence) - ), - - TP_fast_assign( - __entry->fence = &fence->finished; - ), - TP_printk("fence=%p signaled", __entry->fence) -); - -#endif - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#include -- cgit v1.2.3 From 552825b28ddac200b6080d9e79f4121b68e1517d Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Mon, 2 Apr 2018 11:20:44 +0800 Subject: drm/amdgpu: add new bo flag that indicates BOs don't need fallback (v2) Use cases: 1. KFD wraps amdgpu_bo_create; it has no fallback case, which is different from amdgpu_gem_object_create. Since the upstream branch has no amdgpu_amdkfd_gpuvm.c, the KFD developers need to add this flag to __alloc_memory_of_gpu: + flags |= AMDGPU_GEM_CREATE_NO_FALLBACK; 2. UMD can specify this flag for its allocations as well if desired.
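[Editor's note: for use case 2, a minimal sketch of how a UMD could request the flag through the GEM create ioctl, assuming the ioctl path accepts the new flag; the size and domain are illustrative, and the field names come from struct drm_amdgpu_gem_create_in below:

	struct drm_amdgpu_gem_create args = {};

	args.in.bo_size = 4096;
	args.in.domains = AMDGPU_GEM_DOMAIN_VRAM;
	args.in.domain_flags = AMDGPU_GEM_CREATE_NO_FALLBACK;]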
v2: squash in merge conflict fix (Chunming) Signed-off-by: Chunming Zhou Acked-by: Felix Kuehling Cc: felix.kuehling@amd.com Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++++- include/uapi/drm/amdgpu_drm.h | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index dc34b50e6b29..d7d7ce1507ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -386,7 +386,8 @@ retry: bo->tbo.mem.start < adev->gmc.visible_vram_size >> PAGE_SHIFT) p->bytes_moved_vis += ctx.bytes_moved; - if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) { + if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains && + !(bo->flags & AMDGPU_GEM_CREATE_NO_FALLBACK)) { domain = bo->allowed_domains; goto retry; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 04d6830347ec..9e23d6f6f3f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -388,6 +388,8 @@ retry: drm_gem_private_object_init(adev->ddev, &bo->gem_base, size); INIT_LIST_HEAD(&bo->shadow_list); INIT_LIST_HEAD(&bo->va); + bo->preferred_domains = preferred_domains; + bo->allowed_domains = allowed_domains; bo->flags = flags; @@ -424,7 +426,8 @@ retry: r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type, &bo->placement, page_align, &ctx, acc_size, NULL, resv, &amdgpu_ttm_bo_destroy); - if (unlikely(r && r != -ERESTARTSYS) && type == ttm_bo_type_device) { + if (unlikely(r && r != -ERESTARTSYS) && type == ttm_bo_type_device && + !(flags & AMDGPU_GEM_CREATE_NO_FALLBACK)) { if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) { flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; goto retry; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index c363b67f2d0a..4f5a27d64c54 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -95,6 +95,8 @@ extern "C" { #define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) /* Flag that BO sharing will be explicitly synchronized */ #define AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) +/* Flag that BO doesn't need fallback */ +#define AMDGPU_GEM_CREATE_NO_FALLBACK (1 << 8) struct drm_amdgpu_gem_create_in { /** the requested memory size */ -- cgit v1.2.3 From 1a61ee07211c543bf43e635fa703c162a78af0e1 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 4 Apr 2018 15:32:51 -0700 Subject: drm/sched: Extend the documentation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These comments answer all the questions I had for myself when implementing a driver using the GPU scheduler. Signed-off-by: Eric Anholt Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/drm/gpu_scheduler.h | 46 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index dfd54fb94e10..c053a32341bf 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -43,10 +43,12 @@ enum drm_sched_priority { }; /** - * A scheduler entity is a wrapper around a job queue or a group - * of other entities. Entities take turns emitting jobs from their - * job queues to corresponding hardware ring based on scheduling - * policy. 
+ * drm_sched_entity - A wrapper around a job queue (typically attached + * to the DRM file_priv). + * + * Entities will emit jobs in order to their corresponding hardware + * ring, and the scheduler will alternate between entities based on + * scheduling policy. */ struct drm_sched_entity { struct list_head list; @@ -78,7 +80,18 @@ struct drm_sched_rq { struct drm_sched_fence { struct dma_fence scheduled; + + /* This fence is what will be signaled by the scheduler when + * the job is completed. + * + * When setting up an out fence for the job, you should use + * this, since it's available immediately upon + * drm_sched_job_init(), and the fence returned by the driver + * from run_job() won't be created until the dependencies have + * resolved. + */ struct dma_fence finished; + struct dma_fence_cb cb; struct dma_fence *parent; struct drm_gpu_scheduler *sched; @@ -88,6 +101,13 @@ struct drm_sched_fence { struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); +/** + * drm_sched_job - A job to be run by an entity. + * + * A job is created by the driver using drm_sched_job_init(), and + * should call drm_sched_entity_push_job() once it wants the scheduler + * to schedule the job. + */ struct drm_sched_job { struct spsc_node queue_node; struct drm_gpu_scheduler *sched; @@ -112,10 +132,28 @@ static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, * these functions should be implemented in driver side */ struct drm_sched_backend_ops { + /* Called when the scheduler is considering scheduling this + * job next, to get another struct dma_fence for this job to + * block on. Once it returns NULL, run_job() may be called. + */ struct dma_fence *(*dependency)(struct drm_sched_job *sched_job, struct drm_sched_entity *s_entity); + + /* Called to execute the job once all of the dependencies have + * been resolved. This may be called multiple times, if + * timedout_job() has happened and drm_sched_job_recovery() + * decides to try it again. + */ struct dma_fence *(*run_job)(struct drm_sched_job *sched_job); + + /* Called when a job has taken too long to execute, to trigger + * GPU recovery. + */ void (*timedout_job)(struct drm_sched_job *sched_job); + + /* Called once the job's finished fence has been signaled and + * it's time to clean it up. 
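+ *
+ * Editor's note, an illustrative sketch that is not part of the original
+ * patch: assuming a driver job struct that embeds drm_sched_job as @base,
+ * a minimal free_job() could be
+ *
+ *	static void my_free_job(struct drm_sched_job *sched_job)
+ *	{
+ *		struct my_job *job =
+ *			container_of(sched_job, struct my_job, base);
+ *
+ *		kfree(job);
+ *	}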
+ */ void (*free_job)(struct drm_sched_job *sched_job); }; -- cgit v1.2.3 From 351f683b9823a3d1bffb6e2e3f381601aa0b2671 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 17 Feb 2018 20:33:13 +0100 Subject: auxdisplay: Replace licenses with SPDX identifiers Cc: Philippe Ombredanne Acked-by: Willy Tarreau Acked-by: Linus Walleij Acked-by: Robin van der Gracht Acked-by: Geert Uytterhoeven Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/arm-charlcd.c | 2 +- drivers/auxdisplay/cfag12864b.c | 16 +--------------- drivers/auxdisplay/cfag12864bfb.c | 16 +--------------- drivers/auxdisplay/charlcd.c | 6 +----- drivers/auxdisplay/hd44780.c | 6 +----- drivers/auxdisplay/ht16k33.c | 10 +--------- drivers/auxdisplay/ks0108.c | 16 +--------------- drivers/auxdisplay/panel.c | 6 +----- include/linux/cfag12864b.h | 16 +--------------- include/linux/ks0108.h | 16 +--------------- samples/auxdisplay/cfag12864b-example.c | 16 +--------------- 11 files changed, 11 insertions(+), 115 deletions(-) (limited to 'include') diff --git a/drivers/auxdisplay/arm-charlcd.c b/drivers/auxdisplay/arm-charlcd.c index dde180995582..296fb30dfa00 100644 --- a/drivers/auxdisplay/arm-charlcd.c +++ b/drivers/auxdisplay/arm-charlcd.c @@ -1,10 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for the on-board character LCD found on some ARM reference boards * This is basically an Hitachi HD44780 LCD with a custom IP block to drive it * http://en.wikipedia.org/wiki/HD44780_Character_LCD * Currently it will just display the text "ARM Linux" and the linux version * - * License terms: GNU General Public License (GPL) version 2 * Author: Linus Walleij */ #include diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c index 41ce4bd96813..6bd2f65e116a 100644 --- a/drivers/auxdisplay/cfag12864b.c +++ b/drivers/auxdisplay/cfag12864b.c @@ -1,26 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: cfag12864b.c * Version: 0.1.0 * Description: cfag12864b LCD driver - * License: GPLv2 * Depends: ks0108 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c index dcbee7e44721..40c8a552a478 100644 --- a/drivers/auxdisplay/cfag12864bfb.c +++ b/drivers/auxdisplay/cfag12864bfb.c @@ -1,26 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: cfag12864bfb.c * Version: 0.1.0 * Description: cfag12864b LCD framebuffer driver - * License: GPLv2 * Depends: cfag12864b * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index e92f3e25f51d..8673fc2b9eb8 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Character LCD driver for Linux * * Copyright (C) 2000-2008, Willy Tarreau * Copyright (C) 2016-2017 Glider bvba - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c index 036eec404289..78d8f1986fec 100644 --- a/drivers/auxdisplay/hd44780.c +++ b/drivers/auxdisplay/hd44780.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * HD44780 Character LCD driver for Linux * * Copyright (C) 2000-2008, Willy Tarreau * Copyright (C) 2016-2017 Glider bvba - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c index fbfa5b4cc567..a43276c76fc6 100644 --- a/drivers/auxdisplay/ht16k33.c +++ b/drivers/auxdisplay/ht16k33.c @@ -1,18 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * HT16K33 driver * * Author: Robin van der Gracht * * Copyright: (C) 2016 Protonic Holland. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c index 816de9eaac26..abfe3fa9e6f4 100644 --- a/drivers/auxdisplay/ks0108.c +++ b/drivers/auxdisplay/ks0108.c @@ -1,26 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: ks0108.c * Version: 0.1.0 * Description: ks0108 LCD Controller driver - * License: GPLv2 * Depends: parport * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index ec5e8800f8ad..3b25a643058c 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Front panel driver for Linux * Copyright (C) 2000-2008, Willy Tarreau * Copyright (C) 2016-2017 Glider bvba * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * This code drives an LCD module (/dev/lcd), and a keypad (/dev/keypad) * connected to a parallel printer port. * diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h index b454dfce60d9..4060004968c8 100644 --- a/include/linux/cfag12864b.h +++ b/include/linux/cfag12864b.h @@ -1,25 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Filename: cfag12864b.h * Version: 0.1.0 * Description: cfag12864b LCD driver header - * License: GPLv2 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-12 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #ifndef _CFAG12864B_H_ diff --git a/include/linux/ks0108.h b/include/linux/ks0108.h index cb311798e0bc..0738389b42b6 100644 --- a/include/linux/ks0108.h +++ b/include/linux/ks0108.h @@ -1,25 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Filename: ks0108.h * Version: 0.1.0 * Description: ks0108 LCD Controller driver header - * License: GPLv2 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #ifndef _KS0108_H_ diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c index e7823ffb1ca0..85571e90191f 100644 --- a/samples/auxdisplay/cfag12864b-example.c +++ b/samples/auxdisplay/cfag12864b-example.c @@ -1,25 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: cfag12864b-example.c * Version: 0.1.0 * Description: cfag12864b LCD userspace example program - * License: GPLv2 * * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ /* -- cgit v1.2.3 From 8ba05d770dc46df76900d4e17c312d62c83bed6f Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 28 Mar 2018 02:18:13 +0000 Subject: ASoC: trace: remove snd_soc_codec snd_soc_codec has been replaced by snd_soc_component, and it is not used in this file. Let's remove it. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/trace/events/asoc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h index ccd1a3bdff46..40c300fe704d 100644 --- a/include/trace/events/asoc.h +++ b/include/trace/events/asoc.h @@ -12,7 +12,6 @@ #define DAPM_ARROW(dir) (((dir) == SND_SOC_DAPM_DIR_OUT) ? "->" : "<-") struct snd_soc_jack; -struct snd_soc_codec; struct snd_soc_card; struct snd_soc_dapm_widget; struct snd_soc_dapm_path; -- cgit v1.2.3 From 483cbae76824e12d4894f20fab2608789532db3f Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 28 Mar 2018 02:22:07 +0000 Subject: ASoC: wm8350: remove snd_soc_codec codec has been replaced by component. It seems no-one is using it, so let's remove it. Signed-off-by: Kuninori Morimoto Acked-by: Charles Keepax Signed-off-by: Mark Brown --- include/linux/mfd/wm8350/audio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/wm8350/audio.h b/include/linux/mfd/wm8350/audio.h index bd581c6fa085..0bc41c4c0429 100644 --- a/include/linux/mfd/wm8350/audio.h +++ b/include/linux/mfd/wm8350/audio.h @@ -617,11 +617,8 @@ struct wm8350_audio_platform_data { u32 codec_current_charge:2; /* codec current @ vmid charge */ }; -struct snd_soc_codec; - struct wm8350_codec { struct platform_device *pdev; - struct snd_soc_codec *codec; struct wm8350_audio_platform_data *platform_data; }; -- cgit v1.2.3 From 343e64a6c48a6c86552db945d842283eee9f528b Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 28 Mar 2018 20:26:11 +0100 Subject: clk: renesas: Add r8a77470 CPG Core Clock Definitions Add all RZ/G1C Clock Pulse Generator Core Clock Outputs, as listed in Table 7.2 ("List of Clocks [RZ/G1C]") of the RZ/G1C Hardware User's Manual.
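[Editor's note: a hypothetical sketch of how the matching SoC clock driver is expected to consume these IDs, following the DEF_FIXED() helper used by the other renesas-cpg-mssr drivers; the parent clock CLK_PLL1 and the dividers are illustrative, not taken from the RZ/G1C manual:

	static const struct cpg_core_clk r8a77470_core_clks[] __initconst = {
		DEF_FIXED("zx", R8A77470_CLK_ZX, CLK_PLL1, 3, 1),
		DEF_FIXED("zs", R8A77470_CLK_ZS, CLK_PLL1, 6, 1),
	};]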
Signed-off-by: Biju Das Reviewed-by: Fabrizio Castro [geert: Use consecutive numbering] Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/r8a77470-cpg-mssr.h | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/dt-bindings/clock/r8a77470-cpg-mssr.h (limited to 'include') diff --git a/include/dt-bindings/clock/r8a77470-cpg-mssr.h b/include/dt-bindings/clock/r8a77470-cpg-mssr.h new file mode 100644 index 000000000000..34cba49d0f84 --- /dev/null +++ b/include/dt-bindings/clock/r8a77470-cpg-mssr.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2018 Renesas Electronics Corp. + */ +#ifndef __DT_BINDINGS_CLOCK_R8A77470_CPG_MSSR_H__ +#define __DT_BINDINGS_CLOCK_R8A77470_CPG_MSSR_H__ + +#include + +/* r8a77470 CPG Core Clocks */ +#define R8A77470_CLK_Z2 0 +#define R8A77470_CLK_ZTR 1 +#define R8A77470_CLK_ZTRD2 2 +#define R8A77470_CLK_ZT 3 +#define R8A77470_CLK_ZX 4 +#define R8A77470_CLK_ZS 5 +#define R8A77470_CLK_HP 6 +#define R8A77470_CLK_B 7 +#define R8A77470_CLK_LB 8 +#define R8A77470_CLK_P 9 +#define R8A77470_CLK_CL 10 +#define R8A77470_CLK_CP 11 +#define R8A77470_CLK_M2 12 +#define R8A77470_CLK_ZB3 13 +#define R8A77470_CLK_SDH 14 +#define R8A77470_CLK_SD0 15 +#define R8A77470_CLK_SD1 16 +#define R8A77470_CLK_SD2 17 +#define R8A77470_CLK_MP 18 +#define R8A77470_CLK_QSPI 19 +#define R8A77470_CLK_CPEX 20 +#define R8A77470_CLK_RCAN 21 +#define R8A77470_CLK_R 22 +#define R8A77470_CLK_OSC 23 + +#endif /* __DT_BINDINGS_CLOCK_R8A77470_CPG_MSSR_H__ */ -- cgit v1.2.3 From ed645cccc0ebc0329916b6df039dd792d9105c9d Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 27 Mar 2018 17:19:49 +0300 Subject: hwmon: MC13783: Add uid and die temperature sensor inputs The uid and die temperature can be read out on the ADIN7 using input mux. Map uid and die temperature sensor to channels 16 and 17. Signed-off-by: Andrey Gusakov Acked-by: Guenter Roeck Signed-off-by: Lee Jones --- drivers/hwmon/mc13783-adc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ drivers/mfd/mc13xxx-core.c | 15 +++++++++++- include/linux/mfd/mc13xxx.h | 2 ++ 3 files changed, 76 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/hwmon/mc13783-adc.c b/drivers/hwmon/mc13783-adc.c index 960a1db6f269..67860ad2e3d9 100644 --- a/drivers/hwmon/mc13783-adc.c +++ b/drivers/hwmon/mc13783-adc.c @@ -63,6 +63,10 @@ static int mc13783_adc_read(struct device *dev, if (ret) return ret; + /* ADIN7 subchannels */ + if (channel >= 16) + channel = 7; + channel &= 0x7; *val = (sample[channel % 4] >> (channel > 3 ? 
14 : 2)) & 0x3ff; @@ -111,6 +115,57 @@ static ssize_t mc13783_adc_read_gp(struct device *dev, return sprintf(buf, "%u\n", val); } +static ssize_t mc13783_adc_read_uid(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + unsigned int val; + struct platform_device *pdev = to_platform_device(dev); + kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data; + int ret = mc13783_adc_read(dev, devattr, &val); + + if (ret) + return ret; + + if (driver_data & MC13783_ADC_BPDIV2) + /* MC13892 have 1/2 divider, input range is [0, 4.800V] */ + val = DIV_ROUND_CLOSEST(val * 4800, 1024); + else + /* MC13783 have 0.9 divider, input range is [0, 2.555V] */ + val = DIV_ROUND_CLOSEST(val * 2555, 1024); + + return sprintf(buf, "%u\n", val); +} + +static ssize_t mc13783_adc_read_temp(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + unsigned int val; + struct platform_device *pdev = to_platform_device(dev); + kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data; + int ret = mc13783_adc_read(dev, devattr, &val); + + if (ret) + return ret; + + if (driver_data & MC13783_ADC_BPDIV2) { + /* + * MC13892: + * Die Temperature Read Out Code at 25C 680 + * Temperature change per LSB +0.4244C + */ + ret = DIV_ROUND_CLOSEST(-2635920 + val * 4244, 10); + } else { + /* + * MC13783: + * Die Temperature Read Out Code at 25C 282 + * Temperature change per LSB -1.14C + */ + ret = 346480 - 1140 * val; + } + + return sprintf(buf, "%d\n", ret); +} + static DEVICE_ATTR_RO(name); static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, mc13783_adc_read_bp, NULL, 2); static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, mc13783_adc_read_gp, NULL, 5); @@ -124,6 +179,9 @@ static SENSOR_DEVICE_ATTR(in12_input, S_IRUGO, mc13783_adc_read_gp, NULL, 12); static SENSOR_DEVICE_ATTR(in13_input, S_IRUGO, mc13783_adc_read_gp, NULL, 13); static SENSOR_DEVICE_ATTR(in14_input, S_IRUGO, mc13783_adc_read_gp, NULL, 14); static SENSOR_DEVICE_ATTR(in15_input, S_IRUGO, mc13783_adc_read_gp, NULL, 15); +static SENSOR_DEVICE_ATTR(in16_input, S_IRUGO, mc13783_adc_read_uid, NULL, 16); +static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, + mc13783_adc_read_temp, NULL, 17); static struct attribute *mc13783_attr_base[] = { &dev_attr_name.attr, @@ -131,6 +189,8 @@ static struct attribute *mc13783_attr_base[] = { &sensor_dev_attr_in5_input.dev_attr.attr, &sensor_dev_attr_in6_input.dev_attr.attr, &sensor_dev_attr_in7_input.dev_attr.attr, + &sensor_dev_attr_in16_input.dev_attr.attr, + &sensor_dev_attr_temp1_input.dev_attr.attr, NULL }; diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c index d7f54e492aa6..c63e331738c1 100644 --- a/drivers/mfd/mc13xxx-core.c +++ b/drivers/mfd/mc13xxx-core.c @@ -279,8 +279,21 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode, adc0 = MC13XXX_ADC0_ADINC1 | MC13XXX_ADC0_ADINC2; adc1 = MC13XXX_ADC1_ADEN | MC13XXX_ADC1_ADTRIGIGN | MC13XXX_ADC1_ASC; - if (channel > 7) + /* + * Channels mapped through ADIN7: + * 7 - General purpose ADIN7 + * 16 - UID + * 17 - Die temperature + */ + if (channel > 7 && channel < 16) { adc1 |= MC13XXX_ADC1_ADSEL; + } else if (channel == 16) { + adc0 |= MC13XXX_ADC0_ADIN7SEL_UID; + channel = 7; + } else if (channel == 17) { + adc0 |= MC13XXX_ADC0_ADIN7SEL_DIE; + channel = 7; + } switch (mode) { case MC13XXX_ADC_MODE_TS: diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h index 638222e43e48..54a3cd808f9e 100644 --- a/include/linux/mfd/mc13xxx.h +++ b/include/linux/mfd/mc13xxx.h @@ -243,6 +243,8 @@ 
struct mc13xxx_platform_data { #define MC13XXX_ADC0_LICELLCON (1 << 0) #define MC13XXX_ADC0_CHRGICON (1 << 1) #define MC13XXX_ADC0_BATICON (1 << 2) +#define MC13XXX_ADC0_ADIN7SEL_DIE (1 << 4) +#define MC13XXX_ADC0_ADIN7SEL_UID (2 << 4) #define MC13XXX_ADC0_ADREFEN (1 << 10) #define MC13XXX_ADC0_TSMOD0 (1 << 12) #define MC13XXX_ADC0_TSMOD1 (1 << 13) -- cgit v1.2.3 From 057666b69b1d51feb389a17ec73722b001aaf3d0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Apr 2018 22:21:49 +0200 Subject: ALSA: emu10k1: Reduce GFP_ATOMIC allocation The emu10k1 fx8010 code allocates each irq resource dynamically and links to the list at PCM trigger callback. Due to the nature of trigger callback, the allocation is done with GFP_ATOMIC, hence it may fail more often. Moreover, the irq resource isn't big at all, and using the kmalloc for this won't save many bytes, either. This patch removes the dynamic allocation and embeds the irq resource into struct snd_emu10k1_fx8010_pcm.irq field instead of keeping a pointer. As a result, it simplifies the code and removes the unnecessary GFP_ATOMIC usage. Signed-off-by: Takashi Iwai --- include/sound/emu10k1.h | 4 ++-- sound/pci/emu10k1/emufx.c | 9 +-------- sound/pci/emu10k1/emupcm.c | 2 +- 3 files changed, 4 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h index 5ebcc51c0a6a..8c1572de44c5 100644 --- a/include/sound/emu10k1.h +++ b/include/sound/emu10k1.h @@ -1610,7 +1610,7 @@ struct snd_emu10k1_fx8010_pcm { struct snd_pcm_indirect pcm_rec; unsigned int tram_pos; unsigned int tram_shift; - struct snd_emu10k1_fx8010_irq *irq; + struct snd_emu10k1_fx8010_irq irq; }; struct snd_emu10k1_fx8010 { @@ -1902,7 +1902,7 @@ int snd_emu10k1_fx8010_register_irq_handler(struct snd_emu10k1 *emu, snd_fx8010_irq_handler_t *handler, unsigned char gpr_running, void *private_data, - struct snd_emu10k1_fx8010_irq **r_irq); + struct snd_emu10k1_fx8010_irq *irq); int snd_emu10k1_fx8010_unregister_irq_handler(struct snd_emu10k1 *emu, struct snd_emu10k1_fx8010_irq *irq); diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c index a2b56b188be4..608ff4857d70 100644 --- a/sound/pci/emu10k1/emufx.c +++ b/sound/pci/emu10k1/emufx.c @@ -421,14 +421,10 @@ int snd_emu10k1_fx8010_register_irq_handler(struct snd_emu10k1 *emu, snd_fx8010_irq_handler_t *handler, unsigned char gpr_running, void *private_data, - struct snd_emu10k1_fx8010_irq **r_irq) + struct snd_emu10k1_fx8010_irq *irq) { - struct snd_emu10k1_fx8010_irq *irq; unsigned long flags; - irq = kmalloc(sizeof(*irq), GFP_ATOMIC); - if (irq == NULL) - return -ENOMEM; irq->handler = handler; irq->gpr_running = gpr_running; irq->private_data = private_data; @@ -443,8 +439,6 @@ int snd_emu10k1_fx8010_register_irq_handler(struct snd_emu10k1 *emu, emu->fx8010.irq_handlers = irq; } spin_unlock_irqrestore(&emu->fx8010.irq_lock, flags); - if (r_irq) - *r_irq = irq; return 0; } @@ -468,7 +462,6 @@ int snd_emu10k1_fx8010_unregister_irq_handler(struct snd_emu10k1 *emu, tmp->next = tmp->next->next; } spin_unlock_irqrestore(&emu->fx8010.irq_lock, flags); - kfree(irq); return 0; } diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c index cefe613ef7b7..d39458ab251f 100644 --- a/sound/pci/emu10k1/emupcm.c +++ b/sound/pci/emu10k1/emupcm.c @@ -1724,7 +1724,7 @@ static int snd_emu10k1_fx8010_playback_trigger(struct snd_pcm_substream *substre case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: case SNDRV_PCM_TRIGGER_SUSPEND: - 
snd_emu10k1_fx8010_unregister_irq_handler(emu, pcm->irq); pcm->irq = NULL; + snd_emu10k1_fx8010_unregister_irq_handler(emu, &pcm->irq); snd_emu10k1_ptr_write(emu, emu->gpr_base + pcm->gpr_trigger, 0, 0); pcm->tram_pos = INITIAL_TRAM_POS(pcm->buffer_size); pcm->tram_shift = 0; -- cgit v1.2.3 From ec1ba3e519c0f46523cf40b83dc71562171b7c08 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 22 Mar 2018 11:17:40 +0100 Subject: regulator: ab8500: Drop AB8540/9540 support The AB8540 was an evolved version of the AB8500, but it was never mass produced or put into products, only reference designs exist. The upstream support was never completed and it is unlikely that this will happen so drop the support for now to simplify maintenance of the AB8500. Cc: Loic Pallardy Signed-off-by: Linus Walleij Signed-off-by: Mark Brown --- drivers/regulator/ab8500.c | 1779 ++++---------------------------------- include/linux/regulator/ab8500.h | 157 +--- 2 files changed, 188 insertions(+), 1748 deletions(-) (limited to 'include') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 0f97514e3474..83dba3fbfe0c 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -132,33 +132,6 @@ static const unsigned int ldo_vaux56_voltages[] = { 2790000, }; -static const unsigned int ldo_vaux3_ab8540_voltages[] = { - 1200000, - 1500000, - 1800000, - 2100000, - 2500000, - 2750000, - 2790000, - 2910000, - 3050000, -}; - -static const unsigned int ldo_vaux56_ab8540_voltages[] = { - 750000, 760000, 770000, 780000, 790000, 800000, - 810000, 820000, 830000, 840000, 850000, 860000, - 870000, 880000, 890000, 900000, 910000, 920000, - 930000, 940000, 950000, 960000, 970000, 980000, - 990000, 1000000, 1010000, 1020000, 1030000, - 1040000, 1050000, 1060000, 1070000, 1080000, - 1090000, 1100000, 1110000, 1120000, 1130000, - 1140000, 1150000, 1160000, 1170000, 1180000, - 1190000, 1200000, 1210000, 1220000, 1230000, - 1240000, 1250000, 1260000, 1270000, 1280000, - 1290000, 1300000, 1310000, 1320000, 1330000, - 1340000, 1350000, 1360000, 1800000, 2790000, -}; - static const unsigned int ldo_vintcore_voltages[] = { 1200000, 1225000, @@ -232,8 +205,6 @@ static const unsigned int ldo_vdmic_voltages[] = { static DEFINE_MUTEX(shared_mode_mutex); static struct ab8500_shared_mode ldo_anamic1_shared; static struct ab8500_shared_mode ldo_anamic2_shared; -static struct ab8500_shared_mode ab8540_ldo_anamic1_shared; -static struct ab8500_shared_mode ab8540_ldo_anamic2_shared; static int ab8500_regulator_enable(struct regulator_dev *rdev) { @@ -507,53 +478,6 @@ static int ab8500_regulator_get_voltage_sel(struct regulator_dev *rdev) return (regval & info->voltage_mask) >> voltage_shift; } -static int ab8540_aux3_regulator_get_voltage_sel(struct regulator_dev *rdev) -{ - int ret, voltage_shift; - struct ab8500_regulator_info *info = rdev_get_drvdata(rdev); - u8 regval, regval_expand; - - if (info == NULL) { - dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); - return -EINVAL; - } - - ret = abx500_get_register_interruptible(info->dev, - info->expand_register.voltage_bank, - info->expand_register.voltage_reg, ®val_expand); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't read voltage expand reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-get_voltage expand (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->expand_register.voltage_bank, - info->expand_register.voltage_reg, - info->expand_register.voltage_mask, regval_expand); - - if 
(regval_expand & info->expand_register.voltage_mask) - return info->expand_register.voltage_limit; - - ret = abx500_get_register_interruptible(info->dev, - info->voltage_bank, info->voltage_reg, ®val); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't read voltage reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-get_voltage (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->voltage_bank, info->voltage_reg, - info->voltage_mask, regval); - - voltage_shift = ffs(info->voltage_mask) - 1; - - return (regval & info->voltage_mask) >> voltage_shift; -} - static int ab8500_regulator_set_voltage_sel(struct regulator_dev *rdev, unsigned selector) { @@ -586,61 +510,6 @@ static int ab8500_regulator_set_voltage_sel(struct regulator_dev *rdev, return ret; } -static int ab8540_aux3_regulator_set_voltage_sel(struct regulator_dev *rdev, - unsigned selector) -{ - int ret; - struct ab8500_regulator_info *info = rdev_get_drvdata(rdev); - u8 regval, regval_expand; - - if (info == NULL) { - dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); - return -EINVAL; - } - - if (selector < info->expand_register.voltage_limit) { - int voltage_shift = ffs(info->voltage_mask) - 1; - - regval = (u8)selector << voltage_shift; - ret = abx500_mask_and_set_register_interruptible(info->dev, - info->voltage_bank, info->voltage_reg, - info->voltage_mask, regval); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't set voltage reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-set_voltage (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->voltage_bank, info->voltage_reg, - info->voltage_mask, regval); - - regval_expand = 0; - } else { - regval_expand = info->expand_register.voltage_mask; - } - - ret = abx500_mask_and_set_register_interruptible(info->dev, - info->expand_register.voltage_bank, - info->expand_register.voltage_reg, - info->expand_register.voltage_mask, - regval_expand); - if (ret < 0) { - dev_err(rdev_get_dev(rdev), - "couldn't set expand voltage reg for regulator\n"); - return ret; - } - - dev_vdbg(rdev_get_dev(rdev), - "%s-set_voltage expand (bank, reg, mask, value): 0x%x, 0x%x, 0x%x, 0x%x\n", - info->desc.name, info->expand_register.voltage_bank, - info->expand_register.voltage_reg, - info->expand_register.voltage_mask, regval_expand); - - return 0; -} - static struct regulator_ops ab8500_regulator_volt_mode_ops = { .enable = ab8500_regulator_enable, .disable = ab8500_regulator_disable, @@ -653,18 +522,6 @@ static struct regulator_ops ab8500_regulator_volt_mode_ops = { .list_voltage = regulator_list_voltage_table, }; -static struct regulator_ops ab8540_aux3_regulator_volt_mode_ops = { - .enable = ab8500_regulator_enable, - .disable = ab8500_regulator_disable, - .get_optimum_mode = ab8500_regulator_get_optimum_mode, - .set_mode = ab8500_regulator_set_mode, - .get_mode = ab8500_regulator_get_mode, - .is_enabled = ab8500_regulator_is_enabled, - .get_voltage_sel = ab8540_aux3_regulator_get_voltage_sel, - .set_voltage_sel = ab8540_aux3_regulator_set_voltage_sel, - .list_voltage = regulator_list_voltage_table, -}; - static struct regulator_ops ab8500_regulator_volt_ops = { .enable = ab8500_regulator_enable, .disable = ab8500_regulator_disable, @@ -1217,1156 +1074,118 @@ static struct ab8500_regulator_info }, }; -/* AB9540 regulator information */ -static struct ab8500_regulator_info - ab9540_regulator_info[AB9540_NUM_REGULATORS] = { +static struct ab8500_shared_mode 
ldo_anamic1_shared = { + .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC2], +}; + +static struct ab8500_shared_mode ldo_anamic2_shared = { + .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC1], +}; + +struct ab8500_reg_init { + u8 bank; + u8 addr; + u8 mask; +}; + +#define REG_INIT(_id, _bank, _addr, _mask) \ + [_id] = { \ + .bank = _bank, \ + .addr = _addr, \ + .mask = _mask, \ + } + +/* AB8500 register init */ +static struct ab8500_reg_init ab8500_reg_init[] = { /* - * Variable Voltage Regulators - * name, min mV, max mV, - * update bank, reg, mask, enable val - * volt bank, reg, mask + * 0x30, VanaRequestCtrl + * 0xc0, VextSupply1RequestCtrl */ - [AB9540_LDO_AUX1] = { - .desc = { - .name = "LDO-AUX1", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX1, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x1f, - .voltage_mask = 0x0f, - }, - [AB9540_LDO_AUX2] = { - .desc = { - .name = "LDO-AUX2", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX2, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x0c, - .update_val = 0x04, - .update_val_idle = 0x0c, - .update_val_normal = 0x04, - .voltage_bank = 0x04, - .voltage_reg = 0x20, - .voltage_mask = 0x0f, - }, - [AB9540_LDO_AUX3] = { - .desc = { - .name = "LDO-AUX3", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX3, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux3_voltages), - .volt_table = ldo_vaux3_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x0a, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x21, - .voltage_mask = 0x07, - }, - [AB9540_LDO_AUX4] = { - .desc = { - .name = "LDO-AUX4", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUX4, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - /* values for Vaux4Regu register */ - .update_bank = 0x04, - .update_reg = 0x2e, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux4SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x2f, - .voltage_mask = 0x0f, - }, - [AB9540_LDO_INTCORE] = { - .desc = { - .name = "LDO-INTCORE", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_INTCORE, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vintcore_voltages), - .volt_table = ldo_vintcore_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x44, - .update_val = 0x44, - .update_val_idle = 0x44, - .update_val_normal = 0x04, - .voltage_bank = 0x03, - .voltage_reg = 0x80, - .voltage_mask = 0x38, - }, - + REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xf0), /* - * Fixed Voltage Regulators - * name, fixed mV, - * update bank, reg, mask, enable val + * 0x03, VextSupply2RequestCtrl + * 0x0c, VextSupply3RequestCtrl + * 0x30, 
Vaux1RequestCtrl + * 0xc0, Vaux2RequestCtrl */ - [AB9540_LDO_TVOUT] = { - .desc = { - .name = "LDO-TVOUT", - .ops = &ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_TVOUT, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - .enable_time = 10000, - }, - .load_lp_uA = 1000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x82, - .update_val = 0x02, - .update_val_idle = 0x82, - .update_val_normal = 0x02, - }, - [AB9540_LDO_USB] = { - .desc = { - .name = "LDO-USB", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_USB, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_3300000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x82, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - }, - [AB9540_LDO_AUDIO] = { - .desc = { - .name = "LDO-AUDIO", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_AUDIO, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x02, - .update_val = 0x02, - }, - [AB9540_LDO_ANAMIC1] = { - .desc = { - .name = "LDO-ANAMIC1", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_ANAMIC1, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x08, - .update_val = 0x08, - }, - [AB9540_LDO_ANAMIC2] = { - .desc = { - .name = "LDO-ANAMIC2", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_ANAMIC2, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x10, - .update_val = 0x10, - }, - [AB9540_LDO_DMIC] = { - .desc = { - .name = "LDO-DMIC", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_DMIC, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_1800000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x04, - .update_val = 0x04, - }, - - /* - * Regulators with fixed voltage and normal/idle modes - */ - [AB9540_LDO_ANA] = { - .desc = { - .name = "LDO-ANA", - .ops = &ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB9540_LDO_ANA, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_1200000_voltage, - }, - .load_lp_uA = 1000, - .update_bank = 0x04, - .update_reg = 0x06, - .update_mask = 0x0c, - .update_val = 0x08, - .update_val_idle = 0x0c, - .update_val_normal = 0x08, - }, -}; - -/* AB8540 regulator information */ -static struct ab8500_regulator_info - ab8540_regulator_info[AB8540_NUM_REGULATORS] = { - /* - * Variable Voltage Regulators - * name, min mV, max mV, - * update bank, reg, mask, enable val - * volt bank, reg, mask - */ - [AB8540_LDO_AUX1] = { - .desc = { - .name = "LDO-AUX1", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX1, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x1f, - .voltage_mask = 0x0f, - }, - [AB8540_LDO_AUX2] = { - .desc = { - .name = "LDO-AUX2", - .ops = &ab8500_regulator_volt_mode_ops, - .type = 
REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX2, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x09, - .update_mask = 0x0c, - .update_val = 0x04, - .update_val_idle = 0x0c, - .update_val_normal = 0x04, - .voltage_bank = 0x04, - .voltage_reg = 0x20, - .voltage_mask = 0x0f, - }, - [AB8540_LDO_AUX3] = { - .desc = { - .name = "LDO-AUX3", - .ops = &ab8540_aux3_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX3, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux3_ab8540_voltages), - .volt_table = ldo_vaux3_ab8540_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x04, - .update_reg = 0x0a, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - .voltage_bank = 0x04, - .voltage_reg = 0x21, - .voltage_mask = 0x07, - .expand_register = { - .voltage_limit = 8, - .voltage_bank = 0x04, - .voltage_reg = 0x01, - .voltage_mask = 0x10, - } - }, - [AB8540_LDO_AUX4] = { - .desc = { - .name = "LDO-AUX4", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX4, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), - .volt_table = ldo_vauxn_voltages, - }, - .load_lp_uA = 5000, - /* values for Vaux4Regu register */ - .update_bank = 0x04, - .update_reg = 0x2e, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux4SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x2f, - .voltage_mask = 0x0f, - }, - [AB8540_LDO_AUX5] = { - .desc = { - .name = "LDO-AUX5", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX5, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), - .volt_table = ldo_vaux56_ab8540_voltages, - }, - .load_lp_uA = 20000, - /* values for Vaux5Regu register */ - .update_bank = 0x04, - .update_reg = 0x32, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux5SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x33, - .voltage_mask = 0x3f, - }, - [AB8540_LDO_AUX6] = { - .desc = { - .name = "LDO-AUX6", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUX6, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), - .volt_table = ldo_vaux56_ab8540_voltages, - }, - .load_lp_uA = 20000, - /* values for Vaux6Regu register */ - .update_bank = 0x04, - .update_reg = 0x35, - .update_mask = 0x03, - .update_val = 0x01, - .update_val_idle = 0x03, - .update_val_normal = 0x01, - /* values for Vaux6SEL register */ - .voltage_bank = 0x04, - .voltage_reg = 0x36, - .voltage_mask = 0x3f, - }, - [AB8540_LDO_INTCORE] = { - .desc = { - .name = "LDO-INTCORE", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_INTCORE, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vintcore_voltages), - .volt_table = ldo_vintcore_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x44, - .update_val = 0x44, - .update_val_idle = 0x44, - .update_val_normal = 0x04, - .voltage_bank = 0x03, - .voltage_reg = 0x80, - .voltage_mask = 0x38, - }, - - /* - * Fixed Voltage Regulators - * name, fixed mV, - * update bank, reg, mask, enable val - */ - [AB8540_LDO_TVOUT] = { - .desc = { - .name = "LDO-TVOUT", - .ops = 
&ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_TVOUT, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - .enable_time = 10000, - }, - .load_lp_uA = 1000, - .update_bank = 0x03, - .update_reg = 0x80, - .update_mask = 0x82, - .update_val = 0x02, - .update_val_idle = 0x82, - .update_val_normal = 0x02, - }, - [AB8540_LDO_AUDIO] = { - .desc = { - .name = "LDO-AUDIO", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_AUDIO, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2000000_voltage, - }, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x02, - .update_val = 0x02, - }, - [AB8540_LDO_ANAMIC1] = { - .desc = { - .name = "LDO-ANAMIC1", - .ops = &ab8500_regulator_anamic_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_ANAMIC1, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .shared_mode = &ab8540_ldo_anamic1_shared, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x08, - .update_val = 0x08, - .mode_bank = 0x03, - .mode_reg = 0x83, - .mode_mask = 0x20, - .mode_val_idle = 0x20, - .mode_val_normal = 0x00, - }, - [AB8540_LDO_ANAMIC2] = { - .desc = { - .name = "LDO-ANAMIC2", - .ops = &ab8500_regulator_anamic_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_ANAMIC2, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_2050000_voltage, - }, - .shared_mode = &ab8540_ldo_anamic2_shared, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x10, - .update_val = 0x10, - .mode_bank = 0x03, - .mode_reg = 0x83, - .mode_mask = 0x20, - .mode_val_idle = 0x20, - .mode_val_normal = 0x00, - }, - [AB8540_LDO_DMIC] = { - .desc = { - .name = "LDO-DMIC", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_DMIC, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_vdmic_voltages), - .volt_table = ldo_vdmic_voltages, - }, - .load_lp_uA = 1000, - .update_bank = 0x03, - .update_reg = 0x83, - .update_mask = 0x04, - .update_val = 0x04, - .voltage_bank = 0x03, - .voltage_reg = 0x83, - .voltage_mask = 0xc0, - }, - - /* - * Regulators with fixed voltage and normal/idle modes - */ - [AB8540_LDO_ANA] = { - .desc = { - .name = "LDO-ANA", - .ops = &ab8500_regulator_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_ANA, - .owner = THIS_MODULE, - .n_voltages = 1, - .volt_table = fixed_1200000_voltage, - }, - .load_lp_uA = 1000, - .update_bank = 0x04, - .update_reg = 0x06, - .update_mask = 0x0c, - .update_val = 0x04, - .update_val_idle = 0x0c, - .update_val_normal = 0x04, - }, - [AB8540_LDO_SDIO] = { - .desc = { - .name = "LDO-SDIO", - .ops = &ab8500_regulator_volt_mode_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8540_LDO_SDIO, - .owner = THIS_MODULE, - .n_voltages = ARRAY_SIZE(ldo_sdio_voltages), - .volt_table = ldo_sdio_voltages, - }, - .load_lp_uA = 5000, - .update_bank = 0x03, - .update_reg = 0x88, - .update_mask = 0x30, - .update_val = 0x10, - .update_val_idle = 0x30, - .update_val_normal = 0x10, - .voltage_bank = 0x03, - .voltage_reg = 0x88, - .voltage_mask = 0x07, - }, -}; - -static struct ab8500_shared_mode ldo_anamic1_shared = { - .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC2], -}; - -static struct ab8500_shared_mode ldo_anamic2_shared = { - .shared_regulator = &ab8505_regulator_info[AB8505_LDO_ANAMIC1], -}; - -static struct ab8500_shared_mode ab8540_ldo_anamic1_shared = { - .shared_regulator = &ab8540_regulator_info[AB8540_LDO_ANAMIC2], -}; - -static 
struct ab8500_shared_mode ab8540_ldo_anamic2_shared = { - .shared_regulator = &ab8540_regulator_info[AB8540_LDO_ANAMIC1], -}; - -struct ab8500_reg_init { - u8 bank; - u8 addr; - u8 mask; -}; - -#define REG_INIT(_id, _bank, _addr, _mask) \ - [_id] = { \ - .bank = _bank, \ - .addr = _addr, \ - .mask = _mask, \ - } - -/* AB8500 register init */ -static struct ab8500_reg_init ab8500_reg_init[] = { - /* - * 0x30, VanaRequestCtrl - * 0xc0, VextSupply1RequestCtrl - */ - REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xf0), - /* - * 0x03, VextSupply2RequestCtrl - * 0x0c, VextSupply3RequestCtrl - * 0x30, Vaux1RequestCtrl - * 0xc0, Vaux2RequestCtrl - */ - REG_INIT(AB8500_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), - /* - * 0x03, Vaux3RequestCtrl - * 0x04, SwHPReq - */ - REG_INIT(AB8500_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), - /* - * 0x08, VanaSysClkReq1HPValid - * 0x20, Vaux1SysClkReq1HPValid - * 0x40, Vaux2SysClkReq1HPValid - * 0x80, Vaux3SysClkReq1HPValid - */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xe8), - /* - * 0x10, VextSupply1SysClkReq1HPValid - * 0x20, VextSupply2SysClkReq1HPValid - * 0x40, VextSupply3SysClkReq1HPValid - */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x70), - /* - * 0x08, VanaHwHPReq1Valid - * 0x20, Vaux1HwHPReq1Valid - * 0x40, Vaux2HwHPReq1Valid - * 0x80, Vaux3HwHPReq1Valid - */ - REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xe8), - /* - * 0x01, VextSupply1HwHPReq1Valid - * 0x02, VextSupply2HwHPReq1Valid - * 0x04, VextSupply3HwHPReq1Valid - */ - REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), - /* - * 0x08, VanaHwHPReq2Valid - * 0x20, Vaux1HwHPReq2Valid - * 0x40, Vaux2HwHPReq2Valid - * 0x80, Vaux3HwHPReq2Valid - */ - REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xe8), - /* - * 0x01, VextSupply1HwHPReq2Valid - * 0x02, VextSupply2HwHPReq2Valid - * 0x04, VextSupply3HwHPReq2Valid - */ - REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), - /* - * 0x20, VanaSwHPReqValid - * 0x80, Vaux1SwHPReqValid - */ - REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xa0), - /* - * 0x01, Vaux2SwHPReqValid - * 0x02, Vaux3SwHPReqValid - * 0x04, VextSupply1SwHPReqValid - * 0x08, VextSupply2SwHPReqValid - * 0x10, VextSupply3SwHPReqValid - */ - REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), - /* - * 0x02, SysClkReq2Valid1 - * 0x04, SysClkReq3Valid1 - * 0x08, SysClkReq4Valid1 - * 0x10, SysClkReq5Valid1 - * 0x20, SysClkReq6Valid1 - * 0x40, SysClkReq7Valid1 - * 0x80, SysClkReq8Valid1 - */ - REG_INIT(AB8500_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), - /* - * 0x02, SysClkReq2Valid2 - * 0x04, SysClkReq3Valid2 - * 0x08, SysClkReq4Valid2 - * 0x10, SysClkReq5Valid2 - * 0x20, SysClkReq6Valid2 - * 0x40, SysClkReq7Valid2 - * 0x80, SysClkReq8Valid2 - */ - REG_INIT(AB8500_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), - /* - * 0x02, VTVoutEna - * 0x04, Vintcore12Ena - * 0x38, Vintcore12Sel - * 0x40, Vintcore12LP - * 0x80, VTVoutLP - */ - REG_INIT(AB8500_REGUMISC1, 0x03, 0x80, 0xfe), - /* - * 0x02, VaudioEna - * 0x04, VdmicEna - * 0x08, Vamic1Ena - * 0x10, Vamic2Ena - */ - REG_INIT(AB8500_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), - /* - * 0x01, Vamic1_dzout - * 0x02, Vamic2_dzout - */ - REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x03, VpllRegu (NOTE! 
PRCMU register bits) - * 0x0c, VanaRegu - */ - REG_INIT(AB8500_VPLLVANAREGU, 0x04, 0x06, 0x0f), - /* - * 0x01, VrefDDREna - * 0x02, VrefDDRSleepMode - */ - REG_INIT(AB8500_VREFDDR, 0x04, 0x07, 0x03), - /* - * 0x03, VextSupply1Regu - * 0x0c, VextSupply2Regu - * 0x30, VextSupply3Regu - * 0x40, ExtSupply2Bypass - * 0x80, ExtSupply3Bypass - */ - REG_INIT(AB8500_EXTSUPPLYREGU, 0x04, 0x08, 0xff), - /* - * 0x03, Vaux1Regu - * 0x0c, Vaux2Regu - */ - REG_INIT(AB8500_VAUX12REGU, 0x04, 0x09, 0x0f), - /* - * 0x03, Vaux3Regu - */ - REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x03), - /* - * 0x0f, Vaux1Sel - */ - REG_INIT(AB8500_VAUX1SEL, 0x04, 0x1f, 0x0f), - /* - * 0x0f, Vaux2Sel - */ - REG_INIT(AB8500_VAUX2SEL, 0x04, 0x20, 0x0f), - /* - * 0x07, Vaux3Sel - */ - REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x07), - /* - * 0x01, VextSupply12LP - */ - REG_INIT(AB8500_REGUCTRL2SPARE, 0x04, 0x22, 0x01), - /* - * 0x04, Vaux1Disch - * 0x08, Vaux2Disch - * 0x10, Vaux3Disch - * 0x20, Vintcore12Disch - * 0x40, VTVoutDisch - * 0x80, VaudioDisch - */ - REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xfc), - /* - * 0x02, VanaDisch - * 0x04, VdmicPullDownEna - * 0x10, VdmicDisch - */ - REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), -}; - -/* AB8505 register init */ -static struct ab8500_reg_init ab8505_reg_init[] = { - /* - * 0x03, VarmRequestCtrl - * 0x0c, VsmpsCRequestCtrl - * 0x30, VsmpsARequestCtrl - * 0xc0, VsmpsBRequestCtrl - */ - REG_INIT(AB8505_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), - /* - * 0x03, VsafeRequestCtrl - * 0x0c, VpllRequestCtrl - * 0x30, VanaRequestCtrl - */ - REG_INIT(AB8505_REGUREQUESTCTRL2, 0x03, 0x04, 0x3f), - /* - * 0x30, Vaux1RequestCtrl - * 0xc0, Vaux2RequestCtrl - */ - REG_INIT(AB8505_REGUREQUESTCTRL3, 0x03, 0x05, 0xf0), - /* - * 0x03, Vaux3RequestCtrl - * 0x04, SwHPReq - */ - REG_INIT(AB8505_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), - /* - * 0x01, VsmpsASysClkReq1HPValid - * 0x02, VsmpsBSysClkReq1HPValid - * 0x04, VsafeSysClkReq1HPValid - * 0x08, VanaSysClkReq1HPValid - * 0x10, VpllSysClkReq1HPValid - * 0x20, Vaux1SysClkReq1HPValid - * 0x40, Vaux2SysClkReq1HPValid - * 0x80, Vaux3SysClkReq1HPValid - */ - REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), - /* - * 0x01, VsmpsCSysClkReq1HPValid - * 0x02, VarmSysClkReq1HPValid - * 0x04, VbbSysClkReq1HPValid - * 0x08, VsmpsMSysClkReq1HPValid - */ - REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x0f), - /* - * 0x01, VsmpsAHwHPReq1Valid - * 0x02, VsmpsBHwHPReq1Valid - * 0x04, VsafeHwHPReq1Valid - * 0x08, VanaHwHPReq1Valid - * 0x10, VpllHwHPReq1Valid - * 0x20, Vaux1HwHPReq1Valid - * 0x40, Vaux2HwHPReq1Valid - * 0x80, Vaux3HwHPReq1Valid - */ - REG_INIT(AB8505_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), - /* - * 0x08, VsmpsMHwHPReq1Valid - */ - REG_INIT(AB8505_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x08), - /* - * 0x01, VsmpsAHwHPReq2Valid - * 0x02, VsmpsBHwHPReq2Valid - * 0x04, VsafeHwHPReq2Valid - * 0x08, VanaHwHPReq2Valid - * 0x10, VpllHwHPReq2Valid - * 0x20, Vaux1HwHPReq2Valid - * 0x40, Vaux2HwHPReq2Valid - * 0x80, Vaux3HwHPReq2Valid - */ - REG_INIT(AB8505_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), - /* - * 0x08, VsmpsMHwHPReq2Valid - */ - REG_INIT(AB8505_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x08), - /* - * 0x01, VsmpsCSwHPReqValid - * 0x02, VarmSwHPReqValid - * 0x04, VsmpsASwHPReqValid - * 0x08, VsmpsBSwHPReqValid - * 0x10, VsafeSwHPReqValid - * 0x20, VanaSwHPReqValid - * 0x40, VpllSwHPReqValid - * 0x80, Vaux1SwHPReqValid - */ - REG_INIT(AB8505_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), - /* - * 0x01, Vaux2SwHPReqValid - * 0x02, Vaux3SwHPReqValid - * 0x20, 
VsmpsMSwHPReqValid - */ - REG_INIT(AB8505_REGUSWHPREQVALID2, 0x03, 0x0e, 0x23), - /* - * 0x02, SysClkReq2Valid1 - * 0x04, SysClkReq3Valid1 - * 0x08, SysClkReq4Valid1 - */ - REG_INIT(AB8505_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0x0e), - /* - * 0x02, SysClkReq2Valid2 - * 0x04, SysClkReq3Valid2 - * 0x08, SysClkReq4Valid2 - */ - REG_INIT(AB8505_REGUSYSCLKREQVALID2, 0x03, 0x10, 0x0e), - /* - * 0x01, Vaux4SwHPReqValid - * 0x02, Vaux4HwHPReq2Valid - * 0x04, Vaux4HwHPReq1Valid - * 0x08, Vaux4SysClkReq1HPValid - */ - REG_INIT(AB8505_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), - /* - * 0x02, VadcEna - * 0x04, VintCore12Ena - * 0x38, VintCore12Sel - * 0x40, VintCore12LP - * 0x80, VadcLP - */ - REG_INIT(AB8505_REGUMISC1, 0x03, 0x80, 0xfe), - /* - * 0x02, VaudioEna - * 0x04, VdmicEna - * 0x08, Vamic1Ena - * 0x10, Vamic2Ena - */ - REG_INIT(AB8505_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), - /* - * 0x01, Vamic1_dzout - * 0x02, Vamic2_dzout - */ - REG_INIT(AB8505_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x03, VsmpsARegu - * 0x0c, VsmpsASelCtrl - * 0x10, VsmpsAAutoMode - * 0x20, VsmpsAPWMMode - */ - REG_INIT(AB8505_VSMPSAREGU, 0x04, 0x03, 0x3f), - /* - * 0x03, VsmpsBRegu - * 0x0c, VsmpsBSelCtrl - * 0x10, VsmpsBAutoMode - * 0x20, VsmpsBPWMMode - */ - REG_INIT(AB8505_VSMPSBREGU, 0x04, 0x04, 0x3f), - /* - * 0x03, VsafeRegu - * 0x0c, VsafeSelCtrl - * 0x10, VsafeAutoMode - * 0x20, VsafePWMMode - */ - REG_INIT(AB8505_VSAFEREGU, 0x04, 0x05, 0x3f), - /* - * 0x03, VpllRegu (NOTE! PRCMU register bits) - * 0x0c, VanaRegu - */ - REG_INIT(AB8505_VPLLVANAREGU, 0x04, 0x06, 0x0f), - /* - * 0x03, VextSupply1Regu - * 0x0c, VextSupply2Regu - * 0x30, VextSupply3Regu - * 0x40, ExtSupply2Bypass - * 0x80, ExtSupply3Bypass - */ - REG_INIT(AB8505_EXTSUPPLYREGU, 0x04, 0x08, 0xff), - /* - * 0x03, Vaux1Regu - * 0x0c, Vaux2Regu - */ - REG_INIT(AB8505_VAUX12REGU, 0x04, 0x09, 0x0f), - /* - * 0x0f, Vaux3Regu - */ - REG_INIT(AB8505_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), - /* - * 0x3f, VsmpsASel1 - */ - REG_INIT(AB8505_VSMPSASEL1, 0x04, 0x13, 0x3f), - /* - * 0x3f, VsmpsASel2 - */ - REG_INIT(AB8505_VSMPSASEL2, 0x04, 0x14, 0x3f), - /* - * 0x3f, VsmpsASel3 - */ - REG_INIT(AB8505_VSMPSASEL3, 0x04, 0x15, 0x3f), - /* - * 0x3f, VsmpsBSel1 - */ - REG_INIT(AB8505_VSMPSBSEL1, 0x04, 0x17, 0x3f), - /* - * 0x3f, VsmpsBSel2 - */ - REG_INIT(AB8505_VSMPSBSEL2, 0x04, 0x18, 0x3f), - /* - * 0x3f, VsmpsBSel3 - */ - REG_INIT(AB8505_VSMPSBSEL3, 0x04, 0x19, 0x3f), - /* - * 0x7f, VsafeSel1 - */ - REG_INIT(AB8505_VSAFESEL1, 0x04, 0x1b, 0x7f), - /* - * 0x3f, VsafeSel2 - */ - REG_INIT(AB8505_VSAFESEL2, 0x04, 0x1c, 0x7f), - /* - * 0x3f, VsafeSel3 - */ - REG_INIT(AB8505_VSAFESEL3, 0x04, 0x1d, 0x7f), - /* - * 0x0f, Vaux1Sel - */ - REG_INIT(AB8505_VAUX1SEL, 0x04, 0x1f, 0x0f), - /* - * 0x0f, Vaux2Sel - */ - REG_INIT(AB8505_VAUX2SEL, 0x04, 0x20, 0x0f), - /* - * 0x07, Vaux3Sel - * 0x30, VRF1Sel - */ - REG_INIT(AB8505_VRF1VAUX3SEL, 0x04, 0x21, 0x37), - /* - * 0x03, Vaux4RequestCtrl - */ - REG_INIT(AB8505_VAUX4REQCTRL, 0x04, 0x2d, 0x03), - /* - * 0x03, Vaux4Regu - */ - REG_INIT(AB8505_VAUX4REGU, 0x04, 0x2e, 0x03), - /* - * 0x0f, Vaux4Sel - */ - REG_INIT(AB8505_VAUX4SEL, 0x04, 0x2f, 0x0f), - /* - * 0x04, Vaux1Disch - * 0x08, Vaux2Disch - * 0x10, Vaux3Disch - * 0x20, Vintcore12Disch - * 0x40, VTVoutDisch - * 0x80, VaudioDisch - */ - REG_INIT(AB8505_REGUCTRLDISCH, 0x04, 0x43, 0xfc), - /* - * 0x02, VanaDisch - * 0x04, VdmicPullDownEna - * 0x10, VdmicDisch - */ - REG_INIT(AB8505_REGUCTRLDISCH2, 0x04, 0x44, 0x16), - /* - * 0x01, Vaux4Disch - */ - REG_INIT(AB8505_REGUCTRLDISCH3, 0x04, 0x48, 0x01), - /* - 
* 0x07, Vaux5Sel - * 0x08, Vaux5LP - * 0x10, Vaux5Ena - * 0x20, Vaux5Disch - * 0x40, Vaux5DisSfst - * 0x80, Vaux5DisPulld - */ - REG_INIT(AB8505_CTRLVAUX5, 0x01, 0x55, 0xff), - /* - * 0x07, Vaux6Sel - * 0x08, Vaux6LP - * 0x10, Vaux6Ena - * 0x80, Vaux6DisPulld - */ - REG_INIT(AB8505_CTRLVAUX6, 0x01, 0x56, 0x9f), -}; - -/* AB9540 register init */ -static struct ab8500_reg_init ab9540_reg_init[] = { - /* - * 0x03, VarmRequestCtrl - * 0x0c, VapeRequestCtrl - * 0x30, Vsmps1RequestCtrl - * 0xc0, Vsmps2RequestCtrl - */ - REG_INIT(AB9540_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), - /* - * 0x03, Vsmps3RequestCtrl - * 0x0c, VpllRequestCtrl - * 0x30, VanaRequestCtrl - * 0xc0, VextSupply1RequestCtrl - */ - REG_INIT(AB9540_REGUREQUESTCTRL2, 0x03, 0x04, 0xff), - /* - * 0x03, VextSupply2RequestCtrl - * 0x0c, VextSupply3RequestCtrl - * 0x30, Vaux1RequestCtrl - * 0xc0, Vaux2RequestCtrl - */ - REG_INIT(AB9540_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), + REG_INIT(AB8500_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), /* * 0x03, Vaux3RequestCtrl * 0x04, SwHPReq */ - REG_INIT(AB9540_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), + REG_INIT(AB8500_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), /* - * 0x01, Vsmps1SysClkReq1HPValid - * 0x02, Vsmps2SysClkReq1HPValid - * 0x04, Vsmps3SysClkReq1HPValid * 0x08, VanaSysClkReq1HPValid - * 0x10, VpllSysClkReq1HPValid * 0x20, Vaux1SysClkReq1HPValid * 0x40, Vaux2SysClkReq1HPValid * 0x80, Vaux3SysClkReq1HPValid */ - REG_INIT(AB9540_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xe8), /* - * 0x01, VapeSysClkReq1HPValid - * 0x02, VarmSysClkReq1HPValid - * 0x04, VbbSysClkReq1HPValid - * 0x08, VmodSysClkReq1HPValid * 0x10, VextSupply1SysClkReq1HPValid * 0x20, VextSupply2SysClkReq1HPValid * 0x40, VextSupply3SysClkReq1HPValid */ - REG_INIT(AB9540_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x7f), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x70), /* - * 0x01, Vsmps1HwHPReq1Valid - * 0x02, Vsmps2HwHPReq1Valid - * 0x04, Vsmps3HwHPReq1Valid * 0x08, VanaHwHPReq1Valid - * 0x10, VpllHwHPReq1Valid * 0x20, Vaux1HwHPReq1Valid * 0x40, Vaux2HwHPReq1Valid * 0x80, Vaux3HwHPReq1Valid */ - REG_INIT(AB9540_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), + REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xe8), /* * 0x01, VextSupply1HwHPReq1Valid * 0x02, VextSupply2HwHPReq1Valid * 0x04, VextSupply3HwHPReq1Valid - * 0x08, VmodHwHPReq1Valid */ - REG_INIT(AB9540_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x0f), + REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), /* - * 0x01, Vsmps1HwHPReq2Valid - * 0x02, Vsmps2HwHPReq2Valid - * 0x03, Vsmps3HwHPReq2Valid * 0x08, VanaHwHPReq2Valid - * 0x10, VpllHwHPReq2Valid * 0x20, Vaux1HwHPReq2Valid * 0x40, Vaux2HwHPReq2Valid * 0x80, Vaux3HwHPReq2Valid */ - REG_INIT(AB9540_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), + REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xe8), /* * 0x01, VextSupply1HwHPReq2Valid * 0x02, VextSupply2HwHPReq2Valid * 0x04, VextSupply3HwHPReq2Valid - * 0x08, VmodHwHPReq2Valid */ - REG_INIT(AB9540_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x0f), + REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), /* - * 0x01, VapeSwHPReqValid - * 0x02, VarmSwHPReqValid - * 0x04, Vsmps1SwHPReqValid - * 0x08, Vsmps2SwHPReqValid - * 0x10, Vsmps3SwHPReqValid * 0x20, VanaSwHPReqValid - * 0x40, VpllSwHPReqValid * 0x80, Vaux1SwHPReqValid */ - REG_INIT(AB9540_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), + REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xa0), /* * 0x01, Vaux2SwHPReqValid * 0x02, Vaux3SwHPReqValid * 0x04, VextSupply1SwHPReqValid * 0x08, VextSupply2SwHPReqValid * 0x10, 
VextSupply3SwHPReqValid - * 0x20, VmodSwHPReqValid */ - REG_INIT(AB9540_REGUSWHPREQVALID2, 0x03, 0x0e, 0x3f), + REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), /* * 0x02, SysClkReq2Valid1 - * ... + * 0x04, SysClkReq3Valid1 + * 0x08, SysClkReq4Valid1 + * 0x10, SysClkReq5Valid1 + * 0x20, SysClkReq6Valid1 + * 0x40, SysClkReq7Valid1 * 0x80, SysClkReq8Valid1 */ - REG_INIT(AB9540_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), + REG_INIT(AB8500_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), /* * 0x02, SysClkReq2Valid2 - * ... + * 0x04, SysClkReq3Valid2 + * 0x08, SysClkReq4Valid2 + * 0x10, SysClkReq5Valid2 + * 0x20, SysClkReq6Valid2 + * 0x40, SysClkReq7Valid2 * 0x80, SysClkReq8Valid2 */ - REG_INIT(AB9540_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), - /* - * 0x01, Vaux4SwHPReqValid - * 0x02, Vaux4HwHPReq2Valid - * 0x04, Vaux4HwHPReq1Valid - * 0x08, Vaux4SysClkReq1HPValid - */ - REG_INIT(AB9540_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), + REG_INIT(AB8500_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), /* * 0x02, VTVoutEna * 0x04, Vintcore12Ena @@ -2374,44 +1193,29 @@ static struct ab8500_reg_init ab9540_reg_init[] = { * 0x40, Vintcore12LP * 0x80, VTVoutLP */ - REG_INIT(AB9540_REGUMISC1, 0x03, 0x80, 0xfe), + REG_INIT(AB8500_REGUMISC1, 0x03, 0x80, 0xfe), /* * 0x02, VaudioEna * 0x04, VdmicEna * 0x08, Vamic1Ena * 0x10, Vamic2Ena */ - REG_INIT(AB9540_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), + REG_INIT(AB8500_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), /* * 0x01, Vamic1_dzout * 0x02, Vamic2_dzout */ - REG_INIT(AB9540_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x03, Vsmps1Regu - * 0x0c, Vsmps1SelCtrl - * 0x10, Vsmps1AutoMode - * 0x20, Vsmps1PWMMode - */ - REG_INIT(AB9540_VSMPS1REGU, 0x04, 0x03, 0x3f), - /* - * 0x03, Vsmps2Regu - * 0x0c, Vsmps2SelCtrl - * 0x10, Vsmps2AutoMode - * 0x20, Vsmps2PWMMode - */ - REG_INIT(AB9540_VSMPS2REGU, 0x04, 0x04, 0x3f), + REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), /* - * 0x03, Vsmps3Regu - * 0x0c, Vsmps3SelCtrl - * NOTE! PRCMU register + * 0x03, VpllRegu (NOTE! PRCMU register bits) + * 0x0c, VanaRegu */ - REG_INIT(AB9540_VSMPS3REGU, 0x04, 0x05, 0x0f), + REG_INIT(AB8500_VPLLVANAREGU, 0x04, 0x06, 0x0f), /* - * 0x03, VpllRegu - * 0x0c, VanaRegu + * 0x01, VrefDDREna + * 0x02, VrefDDRSleepMode */ - REG_INIT(AB9540_VPLLVANAREGU, 0x04, 0x06, 0x0f), + REG_INIT(AB8500_VREFDDR, 0x04, 0x07, 0x03), /* * 0x03, VextSupply1Regu * 0x0c, VextSupply2Regu @@ -2419,83 +1223,33 @@ static struct ab8500_reg_init ab9540_reg_init[] = { * 0x40, ExtSupply2Bypass * 0x80, ExtSupply3Bypass */ - REG_INIT(AB9540_EXTSUPPLYREGU, 0x04, 0x08, 0xff), + REG_INIT(AB8500_EXTSUPPLYREGU, 0x04, 0x08, 0xff), /* * 0x03, Vaux1Regu * 0x0c, Vaux2Regu */ - REG_INIT(AB9540_VAUX12REGU, 0x04, 0x09, 0x0f), + REG_INIT(AB8500_VAUX12REGU, 0x04, 0x09, 0x0f), /* - * 0x0c, Vrf1Regu * 0x03, Vaux3Regu */ - REG_INIT(AB9540_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), - /* - * 0x3f, Vsmps1Sel1 - */ - REG_INIT(AB9540_VSMPS1SEL1, 0x04, 0x13, 0x3f), - /* - * 0x3f, Vsmps1Sel2 - */ - REG_INIT(AB9540_VSMPS1SEL2, 0x04, 0x14, 0x3f), - /* - * 0x3f, Vsmps1Sel3 - */ - REG_INIT(AB9540_VSMPS1SEL3, 0x04, 0x15, 0x3f), - /* - * 0x3f, Vsmps2Sel1 - */ - REG_INIT(AB9540_VSMPS2SEL1, 0x04, 0x17, 0x3f), - /* - * 0x3f, Vsmps2Sel2 - */ - REG_INIT(AB9540_VSMPS2SEL2, 0x04, 0x18, 0x3f), - /* - * 0x3f, Vsmps2Sel3 - */ - REG_INIT(AB9540_VSMPS2SEL3, 0x04, 0x19, 0x3f), - /* - * 0x7f, Vsmps3Sel1 - * NOTE! PRCMU register - */ - REG_INIT(AB9540_VSMPS3SEL1, 0x04, 0x1b, 0x7f), - /* - * 0x7f, Vsmps3Sel2 - * NOTE! 
PRCMU register - */ - REG_INIT(AB9540_VSMPS3SEL2, 0x04, 0x1c, 0x7f), + REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x03), /* * 0x0f, Vaux1Sel */ - REG_INIT(AB9540_VAUX1SEL, 0x04, 0x1f, 0x0f), + REG_INIT(AB8500_VAUX1SEL, 0x04, 0x1f, 0x0f), /* * 0x0f, Vaux2Sel */ - REG_INIT(AB9540_VAUX2SEL, 0x04, 0x20, 0x0f), + REG_INIT(AB8500_VAUX2SEL, 0x04, 0x20, 0x0f), /* * 0x07, Vaux3Sel - * 0x30, Vrf1Sel */ - REG_INIT(AB9540_VRF1VAUX3SEL, 0x04, 0x21, 0x37), + REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x07), /* * 0x01, VextSupply12LP */ - REG_INIT(AB9540_REGUCTRL2SPARE, 0x04, 0x22, 0x01), - /* - * 0x03, Vaux4RequestCtrl - */ - REG_INIT(AB9540_VAUX4REQCTRL, 0x04, 0x2d, 0x03), - /* - * 0x03, Vaux4Regu - */ - REG_INIT(AB9540_VAUX4REGU, 0x04, 0x2e, 0x03), - /* - * 0x08, Vaux4Sel - */ - REG_INIT(AB9540_VAUX4SEL, 0x04, 0x2f, 0x0f), + REG_INIT(AB8500_REGUCTRL2SPARE, 0x04, 0x22, 0x01), /* - * 0x01, VpllDisch - * 0x02, Vrf1Disch * 0x04, Vaux1Disch * 0x08, Vaux2Disch * 0x10, Vaux3Disch @@ -2503,243 +1257,170 @@ static struct ab8500_reg_init ab9540_reg_init[] = { * 0x40, VTVoutDisch * 0x80, VaudioDisch */ - REG_INIT(AB9540_REGUCTRLDISCH, 0x04, 0x43, 0xff), + REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xfc), /* - * 0x01, VsimDisch * 0x02, VanaDisch * 0x04, VdmicPullDownEna - * 0x08, VpllPullDownEna * 0x10, VdmicDisch */ - REG_INIT(AB9540_REGUCTRLDISCH2, 0x04, 0x44, 0x1f), - /* - * 0x01, Vaux4Disch - */ - REG_INIT(AB9540_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), }; -/* AB8540 register init */ -static struct ab8500_reg_init ab8540_reg_init[] = { - /* - * 0x01, VSimSycClkReq1Valid - * 0x02, VSimSycClkReq2Valid - * 0x04, VSimSycClkReq3Valid - * 0x08, VSimSycClkReq4Valid - * 0x10, VSimSycClkReq5Valid - * 0x20, VSimSycClkReq6Valid - * 0x40, VSimSycClkReq7Valid - * 0x80, VSimSycClkReq8Valid - */ - REG_INIT(AB8540_VSIMSYSCLKCTRL, 0x02, 0x33, 0xff), +/* AB8505 register init */ +static struct ab8500_reg_init ab8505_reg_init[] = { /* * 0x03, VarmRequestCtrl - * 0x0c, VapeRequestCtrl - * 0x30, Vsmps1RequestCtrl - * 0xc0, Vsmps2RequestCtrl + * 0x0c, VsmpsCRequestCtrl + * 0x30, VsmpsARequestCtrl + * 0xc0, VsmpsBRequestCtrl */ - REG_INIT(AB8540_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), + REG_INIT(AB8505_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), /* - * 0x03, Vsmps3RequestCtrl + * 0x03, VsafeRequestCtrl * 0x0c, VpllRequestCtrl * 0x30, VanaRequestCtrl - * 0xc0, VextSupply1RequestCtrl */ - REG_INIT(AB8540_REGUREQUESTCTRL2, 0x03, 0x04, 0xff), + REG_INIT(AB8505_REGUREQUESTCTRL2, 0x03, 0x04, 0x3f), /* - * 0x03, VextSupply2RequestCtrl - * 0x0c, VextSupply3RequestCtrl * 0x30, Vaux1RequestCtrl * 0xc0, Vaux2RequestCtrl */ - REG_INIT(AB8540_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), + REG_INIT(AB8505_REGUREQUESTCTRL3, 0x03, 0x05, 0xf0), /* * 0x03, Vaux3RequestCtrl * 0x04, SwHPReq */ - REG_INIT(AB8540_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), + REG_INIT(AB8505_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), /* - * 0x01, Vsmps1SysClkReq1HPValid - * 0x02, Vsmps2SysClkReq1HPValid - * 0x04, Vsmps3SysClkReq1HPValid + * 0x01, VsmpsASysClkReq1HPValid + * 0x02, VsmpsBSysClkReq1HPValid + * 0x04, VsafeSysClkReq1HPValid * 0x08, VanaSysClkReq1HPValid * 0x10, VpllSysClkReq1HPValid * 0x20, Vaux1SysClkReq1HPValid * 0x40, Vaux2SysClkReq1HPValid * 0x80, Vaux3SysClkReq1HPValid */ - REG_INIT(AB8540_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), + REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), /* - * 0x01, VapeSysClkReq1HPValid + * 0x01, VsmpsCSysClkReq1HPValid * 0x02, VarmSysClkReq1HPValid * 0x04, VbbSysClkReq1HPValid - * 0x10, 
VextSupply1SysClkReq1HPValid - * 0x20, VextSupply2SysClkReq1HPValid - * 0x40, VextSupply3SysClkReq1HPValid + * 0x08, VsmpsMSysClkReq1HPValid */ - REG_INIT(AB8540_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x77), + REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x0f), /* - * 0x01, Vsmps1HwHPReq1Valid - * 0x02, Vsmps2HwHPReq1Valid - * 0x04, Vsmps3HwHPReq1Valid + * 0x01, VsmpsAHwHPReq1Valid + * 0x02, VsmpsBHwHPReq1Valid + * 0x04, VsafeHwHPReq1Valid * 0x08, VanaHwHPReq1Valid * 0x10, VpllHwHPReq1Valid * 0x20, Vaux1HwHPReq1Valid * 0x40, Vaux2HwHPReq1Valid * 0x80, Vaux3HwHPReq1Valid */ - REG_INIT(AB8540_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), + REG_INIT(AB8505_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), /* - * 0x01, VextSupply1HwHPReq1Valid - * 0x02, VextSupply2HwHPReq1Valid - * 0x04, VextSupply3HwHPReq1Valid + * 0x08, VsmpsMHwHPReq1Valid */ - REG_INIT(AB8540_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), + REG_INIT(AB8505_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x08), /* - * 0x01, Vsmps1HwHPReq2Valid - * 0x02, Vsmps2HwHPReq2Valid - * 0x03, Vsmps3HwHPReq2Valid + * 0x01, VsmpsAHwHPReq2Valid + * 0x02, VsmpsBHwHPReq2Valid + * 0x04, VsafeHwHPReq2Valid * 0x08, VanaHwHPReq2Valid * 0x10, VpllHwHPReq2Valid * 0x20, Vaux1HwHPReq2Valid * 0x40, Vaux2HwHPReq2Valid * 0x80, Vaux3HwHPReq2Valid */ - REG_INIT(AB8540_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), + REG_INIT(AB8505_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), /* - * 0x01, VextSupply1HwHPReq2Valid - * 0x02, VextSupply2HwHPReq2Valid - * 0x04, VextSupply3HwHPReq2Valid + * 0x08, VsmpsMHwHPReq2Valid */ - REG_INIT(AB8540_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), + REG_INIT(AB8505_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x08), /* - * 0x01, VapeSwHPReqValid + * 0x01, VsmpsCSwHPReqValid * 0x02, VarmSwHPReqValid - * 0x04, Vsmps1SwHPReqValid - * 0x08, Vsmps2SwHPReqValid - * 0x10, Vsmps3SwHPReqValid + * 0x04, VsmpsASwHPReqValid + * 0x08, VsmpsBSwHPReqValid + * 0x10, VsafeSwHPReqValid * 0x20, VanaSwHPReqValid * 0x40, VpllSwHPReqValid * 0x80, Vaux1SwHPReqValid */ - REG_INIT(AB8540_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), + REG_INIT(AB8505_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), /* * 0x01, Vaux2SwHPReqValid * 0x02, Vaux3SwHPReqValid - * 0x04, VextSupply1SwHPReqValid - * 0x08, VextSupply2SwHPReqValid - * 0x10, VextSupply3SwHPReqValid + * 0x20, VsmpsMSwHPReqValid */ - REG_INIT(AB8540_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), + REG_INIT(AB8505_REGUSWHPREQVALID2, 0x03, 0x0e, 0x23), /* * 0x02, SysClkReq2Valid1 - * ... - * 0x80, SysClkReq8Valid1 + * 0x04, SysClkReq3Valid1 + * 0x08, SysClkReq4Valid1 */ - REG_INIT(AB8540_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xff), + REG_INIT(AB8505_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0x0e), /* * 0x02, SysClkReq2Valid2 - * ... 
- * 0x80, SysClkReq8Valid2 + * 0x04, SysClkReq3Valid2 + * 0x08, SysClkReq4Valid2 */ - REG_INIT(AB8540_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xff), + REG_INIT(AB8505_REGUSYSCLKREQVALID2, 0x03, 0x10, 0x0e), /* * 0x01, Vaux4SwHPReqValid * 0x02, Vaux4HwHPReq2Valid * 0x04, Vaux4HwHPReq1Valid * 0x08, Vaux4SysClkReq1HPValid */ - REG_INIT(AB8540_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), - /* - * 0x01, Vaux5SwHPReqValid - * 0x02, Vaux5HwHPReq2Valid - * 0x04, Vaux5HwHPReq1Valid - * 0x08, Vaux5SysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVAUX5REQVALID, 0x03, 0x12, 0x0f), - /* - * 0x01, Vaux6SwHPReqValid - * 0x02, Vaux6HwHPReq2Valid - * 0x04, Vaux6HwHPReq1Valid - * 0x08, Vaux6SysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVAUX6REQVALID, 0x03, 0x13, 0x0f), - /* - * 0x01, VclkbSwHPReqValid - * 0x02, VclkbHwHPReq2Valid - * 0x04, VclkbHwHPReq1Valid - * 0x08, VclkbSysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVCLKBREQVALID, 0x03, 0x14, 0x0f), - /* - * 0x01, Vrf1SwHPReqValid - * 0x02, Vrf1HwHPReq2Valid - * 0x04, Vrf1HwHPReq1Valid - * 0x08, Vrf1SysClkReq1HPValid - */ - REG_INIT(AB8540_REGUVRF1REQVALID, 0x03, 0x15, 0x0f), + REG_INIT(AB8505_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), /* - * 0x02, VTVoutEna - * 0x04, Vintcore12Ena - * 0x38, Vintcore12Sel - * 0x40, Vintcore12LP - * 0x80, VTVoutLP + * 0x02, VadcEna + * 0x04, VintCore12Ena + * 0x38, VintCore12Sel + * 0x40, VintCore12LP + * 0x80, VadcLP */ - REG_INIT(AB8540_REGUMISC1, 0x03, 0x80, 0xfe), + REG_INIT(AB8505_REGUMISC1, 0x03, 0x80, 0xfe), /* * 0x02, VaudioEna * 0x04, VdmicEna * 0x08, Vamic1Ena * 0x10, Vamic2Ena - * 0x20, Vamic12LP - * 0xC0, VdmicSel */ - REG_INIT(AB8540_VAUDIOSUPPLY, 0x03, 0x83, 0xfe), + REG_INIT(AB8505_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), /* * 0x01, Vamic1_dzout * 0x02, Vamic2_dzout */ - REG_INIT(AB8540_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x07, VHSICSel - * 0x08, VHSICOffState - * 0x10, VHSIEna - * 0x20, VHSICLP - */ - REG_INIT(AB8540_VHSIC, 0x03, 0x87, 0x3f), - /* - * 0x07, VSDIOSel - * 0x08, VSDIOOffState - * 0x10, VSDIOEna - * 0x20, VSDIOLP - */ - REG_INIT(AB8540_VSDIO, 0x03, 0x88, 0x3f), + REG_INIT(AB8505_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), /* - * 0x03, Vsmps1Regu - * 0x0c, Vsmps1SelCtrl - * 0x10, Vsmps1AutoMode - * 0x20, Vsmps1PWMMode + * 0x03, VsmpsARegu + * 0x0c, VsmpsASelCtrl + * 0x10, VsmpsAAutoMode + * 0x20, VsmpsAPWMMode */ - REG_INIT(AB8540_VSMPS1REGU, 0x04, 0x03, 0x3f), + REG_INIT(AB8505_VSMPSAREGU, 0x04, 0x03, 0x3f), /* - * 0x03, Vsmps2Regu - * 0x0c, Vsmps2SelCtrl - * 0x10, Vsmps2AutoMode - * 0x20, Vsmps2PWMMode + * 0x03, VsmpsBRegu + * 0x0c, VsmpsBSelCtrl + * 0x10, VsmpsBAutoMode + * 0x20, VsmpsBPWMMode */ - REG_INIT(AB8540_VSMPS2REGU, 0x04, 0x04, 0x3f), + REG_INIT(AB8505_VSMPSBREGU, 0x04, 0x04, 0x3f), /* - * 0x03, Vsmps3Regu - * 0x0c, Vsmps3SelCtrl - * 0x10, Vsmps3AutoMode - * 0x20, Vsmps3PWMMode - * NOTE! PRCMU register + * 0x03, VsafeRegu + * 0x0c, VsafeSelCtrl + * 0x10, VsafeAutoMode + * 0x20, VsafePWMMode */ - REG_INIT(AB8540_VSMPS3REGU, 0x04, 0x05, 0x0f), + REG_INIT(AB8505_VSAFEREGU, 0x04, 0x05, 0x3f), /* - * 0x03, VpllRegu + * 0x03, VpllRegu (NOTE! 
PRCMU register bits) * 0x0c, VanaRegu */ - REG_INIT(AB8540_VPLLVANAREGU, 0x04, 0x06, 0x0f), + REG_INIT(AB8505_VPLLVANAREGU, 0x04, 0x06, 0x0f), /* * 0x03, VextSupply1Regu * 0x0c, VextSupply2Regu @@ -2747,128 +1428,78 @@ static struct ab8500_reg_init ab8540_reg_init[] = { * 0x40, ExtSupply2Bypass * 0x80, ExtSupply3Bypass */ - REG_INIT(AB8540_EXTSUPPLYREGU, 0x04, 0x08, 0xff), + REG_INIT(AB8505_EXTSUPPLYREGU, 0x04, 0x08, 0xff), /* * 0x03, Vaux1Regu * 0x0c, Vaux2Regu */ - REG_INIT(AB8540_VAUX12REGU, 0x04, 0x09, 0x0f), + REG_INIT(AB8505_VAUX12REGU, 0x04, 0x09, 0x0f), /* - * 0x0c, VRF1Regu - * 0x03, Vaux3Regu + * 0x0f, Vaux3Regu + */ + REG_INIT(AB8505_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), + /* + * 0x3f, VsmpsASel1 */ - REG_INIT(AB8540_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), + REG_INIT(AB8505_VSMPSASEL1, 0x04, 0x13, 0x3f), /* - * 0x3f, Vsmps1Sel1 + * 0x3f, VsmpsASel2 */ - REG_INIT(AB8540_VSMPS1SEL1, 0x04, 0x13, 0x3f), + REG_INIT(AB8505_VSMPSASEL2, 0x04, 0x14, 0x3f), /* - * 0x3f, Vsmps1Sel2 + * 0x3f, VsmpsASel3 */ - REG_INIT(AB8540_VSMPS1SEL2, 0x04, 0x14, 0x3f), + REG_INIT(AB8505_VSMPSASEL3, 0x04, 0x15, 0x3f), /* - * 0x3f, Vsmps1Sel3 + * 0x3f, VsmpsBSel1 */ - REG_INIT(AB8540_VSMPS1SEL3, 0x04, 0x15, 0x3f), + REG_INIT(AB8505_VSMPSBSEL1, 0x04, 0x17, 0x3f), /* - * 0x3f, Vsmps2Sel1 + * 0x3f, VsmpsBSel2 */ - REG_INIT(AB8540_VSMPS2SEL1, 0x04, 0x17, 0x3f), + REG_INIT(AB8505_VSMPSBSEL2, 0x04, 0x18, 0x3f), /* - * 0x3f, Vsmps2Sel2 + * 0x3f, VsmpsBSel3 */ - REG_INIT(AB8540_VSMPS2SEL2, 0x04, 0x18, 0x3f), + REG_INIT(AB8505_VSMPSBSEL3, 0x04, 0x19, 0x3f), /* - * 0x3f, Vsmps2Sel3 + * 0x7f, VsafeSel1 */ - REG_INIT(AB8540_VSMPS2SEL3, 0x04, 0x19, 0x3f), + REG_INIT(AB8505_VSAFESEL1, 0x04, 0x1b, 0x7f), /* - * 0x7f, Vsmps3Sel1 - * NOTE! PRCMU register + * 0x3f, VsafeSel2 */ - REG_INIT(AB8540_VSMPS3SEL1, 0x04, 0x1b, 0x7f), + REG_INIT(AB8505_VSAFESEL2, 0x04, 0x1c, 0x7f), /* - * 0x7f, Vsmps3Sel2 - * NOTE! 
PRCMU register + * 0x3f, VsafeSel3 */ - REG_INIT(AB8540_VSMPS3SEL2, 0x04, 0x1c, 0x7f), + REG_INIT(AB8505_VSAFESEL3, 0x04, 0x1d, 0x7f), /* * 0x0f, Vaux1Sel */ - REG_INIT(AB8540_VAUX1SEL, 0x04, 0x1f, 0x0f), + REG_INIT(AB8505_VAUX1SEL, 0x04, 0x1f, 0x0f), /* * 0x0f, Vaux2Sel */ - REG_INIT(AB8540_VAUX2SEL, 0x04, 0x20, 0x0f), + REG_INIT(AB8505_VAUX2SEL, 0x04, 0x20, 0x0f), /* * 0x07, Vaux3Sel - * 0x70, Vrf1Sel - */ - REG_INIT(AB8540_VRF1VAUX3SEL, 0x04, 0x21, 0x77), - /* - * 0x01, VextSupply12LP - */ - REG_INIT(AB8540_REGUCTRL2SPARE, 0x04, 0x22, 0x01), - /* - * 0x07, Vanasel - * 0x30, Vpllsel + * 0x30, VRF1Sel */ - REG_INIT(AB8540_VANAVPLLSEL, 0x04, 0x29, 0x37), + REG_INIT(AB8505_VRF1VAUX3SEL, 0x04, 0x21, 0x37), /* * 0x03, Vaux4RequestCtrl */ - REG_INIT(AB8540_VAUX4REQCTRL, 0x04, 0x2d, 0x03), + REG_INIT(AB8505_VAUX4REQCTRL, 0x04, 0x2d, 0x03), /* * 0x03, Vaux4Regu */ - REG_INIT(AB8540_VAUX4REGU, 0x04, 0x2e, 0x03), + REG_INIT(AB8505_VAUX4REGU, 0x04, 0x2e, 0x03), /* * 0x0f, Vaux4Sel */ - REG_INIT(AB8540_VAUX4SEL, 0x04, 0x2f, 0x0f), - /* - * 0x03, Vaux5RequestCtrl - */ - REG_INIT(AB8540_VAUX5REQCTRL, 0x04, 0x31, 0x03), - /* - * 0x03, Vaux5Regu - */ - REG_INIT(AB8540_VAUX5REGU, 0x04, 0x32, 0x03), - /* - * 0x3f, Vaux5Sel - */ - REG_INIT(AB8540_VAUX5SEL, 0x04, 0x33, 0x3f), - /* - * 0x03, Vaux6RequestCtrl - */ - REG_INIT(AB8540_VAUX6REQCTRL, 0x04, 0x34, 0x03), - /* - * 0x03, Vaux6Regu - */ - REG_INIT(AB8540_VAUX6REGU, 0x04, 0x35, 0x03), - /* - * 0x3f, Vaux6Sel - */ - REG_INIT(AB8540_VAUX6SEL, 0x04, 0x36, 0x3f), - /* - * 0x03, VCLKBRequestCtrl - */ - REG_INIT(AB8540_VCLKBREQCTRL, 0x04, 0x37, 0x03), - /* - * 0x03, VCLKBRegu - */ - REG_INIT(AB8540_VCLKBREGU, 0x04, 0x38, 0x03), - /* - * 0x07, VCLKBSel - */ - REG_INIT(AB8540_VCLKBSEL, 0x04, 0x39, 0x07), - /* - * 0x03, Vrf1RequestCtrl - */ - REG_INIT(AB8540_VRF1REQCTRL, 0x04, 0x3a, 0x03), + REG_INIT(AB8505_VAUX4SEL, 0x04, 0x2f, 0x0f), /* - * 0x01, VpllDisch - * 0x02, Vrf1Disch * 0x04, Vaux1Disch * 0x08, Vaux2Disch * 0x10, Vaux3Disch @@ -2876,24 +1507,33 @@ static struct ab8500_reg_init ab8540_reg_init[] = { * 0x40, VTVoutDisch * 0x80, VaudioDisch */ - REG_INIT(AB8540_REGUCTRLDISCH, 0x04, 0x43, 0xff), + REG_INIT(AB8505_REGUCTRLDISCH, 0x04, 0x43, 0xfc), /* * 0x02, VanaDisch * 0x04, VdmicPullDownEna - * 0x08, VpllPullDownEna * 0x10, VdmicDisch */ - REG_INIT(AB8540_REGUCTRLDISCH2, 0x04, 0x44, 0x1e), + REG_INIT(AB8505_REGUCTRLDISCH2, 0x04, 0x44, 0x16), /* * 0x01, Vaux4Disch */ - REG_INIT(AB8540_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + REG_INIT(AB8505_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + /* + * 0x07, Vaux5Sel + * 0x08, Vaux5LP + * 0x10, Vaux5Ena + * 0x20, Vaux5Disch + * 0x40, Vaux5DisSfst + * 0x80, Vaux5DisPulld + */ + REG_INIT(AB8505_CTRLVAUX5, 0x01, 0x55, 0xff), /* - * 0x01, Vaux5Disch - * 0x02, Vaux6Disch - * 0x04, VCLKBDisch + * 0x07, Vaux6Sel + * 0x08, Vaux6LP + * 0x10, Vaux6Ena + * 0x80, Vaux6DisPulld */ - REG_INIT(AB8540_REGUCTRLDISCH4, 0x04, 0x49, 0x07), + REG_INIT(AB8505_CTRLVAUX6, 0x01, 0x56, 0x9f), }; static struct of_regulator_match ab8500_regulator_match[] = { @@ -2925,37 +1565,6 @@ static struct of_regulator_match ab8505_regulator_match[] = { { .name = "ab8500_ldo_ana", .driver_data = (void *) AB8505_LDO_ANA, }, }; -static struct of_regulator_match ab8540_regulator_match[] = { - { .name = "ab8500_ldo_aux1", .driver_data = (void *) AB8540_LDO_AUX1, }, - { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB8540_LDO_AUX2, }, - { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB8540_LDO_AUX3, }, - { .name = "ab8500_ldo_aux4", .driver_data = (void *) 
AB8540_LDO_AUX4, }, - { .name = "ab8500_ldo_aux5", .driver_data = (void *) AB8540_LDO_AUX5, }, - { .name = "ab8500_ldo_aux6", .driver_data = (void *) AB8540_LDO_AUX6, }, - { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB8540_LDO_INTCORE, }, - { .name = "ab8500_ldo_tvout", .driver_data = (void *) AB8540_LDO_TVOUT, }, - { .name = "ab8500_ldo_audio", .driver_data = (void *) AB8540_LDO_AUDIO, }, - { .name = "ab8500_ldo_anamic1", .driver_data = (void *) AB8540_LDO_ANAMIC1, }, - { .name = "ab8500_ldo_anamic2", .driver_data = (void *) AB8540_LDO_ANAMIC2, }, - { .name = "ab8500_ldo_dmic", .driver_data = (void *) AB8540_LDO_DMIC, }, - { .name = "ab8500_ldo_ana", .driver_data = (void *) AB8540_LDO_ANA, }, - { .name = "ab8500_ldo_sdio", .driver_data = (void *) AB8540_LDO_SDIO, }, -}; - -static struct of_regulator_match ab9540_regulator_match[] = { - { .name = "ab8500_ldo_aux1", .driver_data = (void *) AB9540_LDO_AUX1, }, - { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB9540_LDO_AUX2, }, - { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB9540_LDO_AUX3, }, - { .name = "ab8500_ldo_aux4", .driver_data = (void *) AB9540_LDO_AUX4, }, - { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB9540_LDO_INTCORE, }, - { .name = "ab8500_ldo_tvout", .driver_data = (void *) AB9540_LDO_TVOUT, }, - { .name = "ab8500_ldo_audio", .driver_data = (void *) AB9540_LDO_AUDIO, }, - { .name = "ab8500_ldo_anamic1", .driver_data = (void *) AB9540_LDO_ANAMIC1, }, - { .name = "ab8500_ldo_anamic2", .driver_data = (void *) AB9540_LDO_ANAMIC2, }, - { .name = "ab8500_ldo_dmic", .driver_data = (void *) AB9540_LDO_DMIC, }, - { .name = "ab8500_ldo_ana", .driver_data = (void *) AB9540_LDO_ANA, }, -}; - static struct { struct ab8500_regulator_info *info; int info_size; @@ -2967,27 +1576,13 @@ static struct { static void abx500_get_regulator_info(struct ab8500 *ab8500) { - if (is_ab9540(ab8500)) { - abx500_regulator.info = ab9540_regulator_info; - abx500_regulator.info_size = ARRAY_SIZE(ab9540_regulator_info); - abx500_regulator.init = ab9540_reg_init; - abx500_regulator.init_size = AB9540_NUM_REGULATOR_REGISTERS; - abx500_regulator.match = ab9540_regulator_match; - abx500_regulator.match_size = ARRAY_SIZE(ab9540_regulator_match); - } else if (is_ab8505(ab8500)) { + if (is_ab8505(ab8500)) { abx500_regulator.info = ab8505_regulator_info; abx500_regulator.info_size = ARRAY_SIZE(ab8505_regulator_info); abx500_regulator.init = ab8505_reg_init; abx500_regulator.init_size = AB8505_NUM_REGULATOR_REGISTERS; abx500_regulator.match = ab8505_regulator_match; abx500_regulator.match_size = ARRAY_SIZE(ab8505_regulator_match); - } else if (is_ab8540(ab8500)) { - abx500_regulator.info = ab8540_regulator_info; - abx500_regulator.info_size = ARRAY_SIZE(ab8540_regulator_info); - abx500_regulator.init = ab8540_reg_init; - abx500_regulator.init_size = AB8540_NUM_REGULATOR_REGISTERS; - abx500_regulator.match = ab8540_regulator_match; - abx500_regulator.match_size = ARRAY_SIZE(ab8540_regulator_match); } else { abx500_regulator.info = ab8500_regulator_info; abx500_regulator.info_size = ARRAY_SIZE(ab8500_regulator_info); diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index d8ecefaf63ca..6d46f962685d 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -49,47 +49,7 @@ enum ab8505_regulator_id { AB8505_NUM_REGULATORS, }; -/* AB9540 regulators */ -enum ab9540_regulator_id { - AB9540_LDO_AUX1, - AB9540_LDO_AUX2, - AB9540_LDO_AUX3, - AB9540_LDO_AUX4, - AB9540_LDO_INTCORE, - 
AB9540_LDO_TVOUT, - AB9540_LDO_USB, - AB9540_LDO_AUDIO, - AB9540_LDO_ANAMIC1, - AB9540_LDO_ANAMIC2, - AB9540_LDO_DMIC, - AB9540_LDO_ANA, - AB9540_SYSCLKREQ_2, - AB9540_SYSCLKREQ_4, - AB9540_NUM_REGULATORS, -}; - -/* AB8540 regulators */ -enum ab8540_regulator_id { - AB8540_LDO_AUX1, - AB8540_LDO_AUX2, - AB8540_LDO_AUX3, - AB8540_LDO_AUX4, - AB8540_LDO_AUX5, - AB8540_LDO_AUX6, - AB8540_LDO_INTCORE, - AB8540_LDO_TVOUT, - AB8540_LDO_AUDIO, - AB8540_LDO_ANAMIC1, - AB8540_LDO_ANAMIC2, - AB8540_LDO_DMIC, - AB8540_LDO_ANA, - AB8540_LDO_SDIO, - AB8540_SYSCLKREQ_2, - AB8540_SYSCLKREQ_4, - AB8540_NUM_REGULATORS, -}; - -/* AB8500, AB8505, and AB9540 register initialization */ +/* AB8500 and AB8505 register initialization */ struct ab8500_regulator_reg_init { int id; u8 mask; @@ -185,121 +145,6 @@ enum ab8505_regulator_reg { AB8505_NUM_REGULATOR_REGISTERS, }; -/* AB9540 registers */ -enum ab9540_regulator_reg { - AB9540_REGUREQUESTCTRL1, - AB9540_REGUREQUESTCTRL2, - AB9540_REGUREQUESTCTRL3, - AB9540_REGUREQUESTCTRL4, - AB9540_REGUSYSCLKREQ1HPVALID1, - AB9540_REGUSYSCLKREQ1HPVALID2, - AB9540_REGUHWHPREQ1VALID1, - AB9540_REGUHWHPREQ1VALID2, - AB9540_REGUHWHPREQ2VALID1, - AB9540_REGUHWHPREQ2VALID2, - AB9540_REGUSWHPREQVALID1, - AB9540_REGUSWHPREQVALID2, - AB9540_REGUSYSCLKREQVALID1, - AB9540_REGUSYSCLKREQVALID2, - AB9540_REGUVAUX4REQVALID, - AB9540_REGUMISC1, - AB9540_VAUDIOSUPPLY, - AB9540_REGUCTRL1VAMIC, - AB9540_VSMPS1REGU, - AB9540_VSMPS2REGU, - AB9540_VSMPS3REGU, /* NOTE! PRCMU register */ - AB9540_VPLLVANAREGU, - AB9540_EXTSUPPLYREGU, - AB9540_VAUX12REGU, - AB9540_VRF1VAUX3REGU, - AB9540_VSMPS1SEL1, - AB9540_VSMPS1SEL2, - AB9540_VSMPS1SEL3, - AB9540_VSMPS2SEL1, - AB9540_VSMPS2SEL2, - AB9540_VSMPS2SEL3, - AB9540_VSMPS3SEL1, /* NOTE! PRCMU register */ - AB9540_VSMPS3SEL2, /* NOTE! 
PRCMU register */ - AB9540_VAUX1SEL, - AB9540_VAUX2SEL, - AB9540_VRF1VAUX3SEL, - AB9540_REGUCTRL2SPARE, - AB9540_VAUX4REQCTRL, - AB9540_VAUX4REGU, - AB9540_VAUX4SEL, - AB9540_REGUCTRLDISCH, - AB9540_REGUCTRLDISCH2, - AB9540_REGUCTRLDISCH3, - AB9540_NUM_REGULATOR_REGISTERS, -}; - -/* AB8540 registers */ -enum ab8540_regulator_reg { - AB8540_REGUREQUESTCTRL1, - AB8540_REGUREQUESTCTRL2, - AB8540_REGUREQUESTCTRL3, - AB8540_REGUREQUESTCTRL4, - AB8540_REGUSYSCLKREQ1HPVALID1, - AB8540_REGUSYSCLKREQ1HPVALID2, - AB8540_REGUHWHPREQ1VALID1, - AB8540_REGUHWHPREQ1VALID2, - AB8540_REGUHWHPREQ2VALID1, - AB8540_REGUHWHPREQ2VALID2, - AB8540_REGUSWHPREQVALID1, - AB8540_REGUSWHPREQVALID2, - AB8540_REGUSYSCLKREQVALID1, - AB8540_REGUSYSCLKREQVALID2, - AB8540_REGUVAUX4REQVALID, - AB8540_REGUVAUX5REQVALID, - AB8540_REGUVAUX6REQVALID, - AB8540_REGUVCLKBREQVALID, - AB8540_REGUVRF1REQVALID, - AB8540_REGUMISC1, - AB8540_VAUDIOSUPPLY, - AB8540_REGUCTRL1VAMIC, - AB8540_VHSIC, - AB8540_VSDIO, - AB8540_VSMPS1REGU, - AB8540_VSMPS2REGU, - AB8540_VSMPS3REGU, - AB8540_VPLLVANAREGU, - AB8540_EXTSUPPLYREGU, - AB8540_VAUX12REGU, - AB8540_VRF1VAUX3REGU, - AB8540_VSMPS1SEL1, - AB8540_VSMPS1SEL2, - AB8540_VSMPS1SEL3, - AB8540_VSMPS2SEL1, - AB8540_VSMPS2SEL2, - AB8540_VSMPS2SEL3, - AB8540_VSMPS3SEL1, - AB8540_VSMPS3SEL2, - AB8540_VAUX1SEL, - AB8540_VAUX2SEL, - AB8540_VRF1VAUX3SEL, - AB8540_REGUCTRL2SPARE, - AB8540_VAUX4REQCTRL, - AB8540_VAUX4REGU, - AB8540_VAUX4SEL, - AB8540_VAUX5REQCTRL, - AB8540_VAUX5REGU, - AB8540_VAUX5SEL, - AB8540_VAUX6REQCTRL, - AB8540_VAUX6REGU, - AB8540_VAUX6SEL, - AB8540_VCLKBREQCTRL, - AB8540_VCLKBREGU, - AB8540_VCLKBSEL, - AB8540_VRF1REQCTRL, - AB8540_REGUCTRLDISCH, - AB8540_REGUCTRLDISCH2, - AB8540_REGUCTRLDISCH3, - AB8540_REGUCTRLDISCH4, - AB8540_VSIMSYSCLKCTRL, - AB8540_VANAVPLLSEL, - AB8540_NUM_REGULATOR_REGISTERS, -}; - /* AB8500 external regulators */ struct ab8500_ext_regulator_cfg { bool hwreq; /* requires hw mode or high power mode */ -- cgit v1.2.3 From 964f7c0dd23de68c0a3f33a91ca10775ef39ad71 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 28 Mar 2018 20:26:09 +0100 Subject: soc: renesas: rcar-sysc: Add r8a77470 support Add support for RZ/G1C (R8A77470) SoC power areas to the R-Car SYSC driver. 
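[For readers decoding the positional initializers in the r8a77470_areas[] table below, the "ca7-cpu0" entry can be sketched with designated initializers. This is an explanatory gloss, not part of the patch; the field names are assumed from struct rcar_sysc_area in drivers/soc/renesas/rcar-sysc.h.]

~~~~
/* Sketch only: the same data as the positional "ca7-cpu0" entry below. */
static const struct rcar_sysc_area ca7_cpu0_area __initconst = {
	.name      = "ca7-cpu0",
	.chan_offs = 0x1c0,			/* offset of this area's power control channel */
	.chan_bit  = 0,				/* bit for CPU0 within that channel */
	.isr_bit   = R8A77470_PD_CA7_CPU0,	/* SYSCISR bit == power domain index */
	.parent    = R8A77470_PD_CA7_SCU,	/* powered underneath the CA7 SCU area */
	.flags     = PD_CPU_NOCR,		/* CPU area without its own control register */
};
~~~~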
Signed-off-by: Biju Das Reviewed-by: Fabrizio Castro Reviewed-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- .../bindings/power/renesas,rcar-sysc.txt | 1 + drivers/soc/renesas/Kconfig | 5 ++++ drivers/soc/renesas/Makefile | 1 + drivers/soc/renesas/r8a77470-sysc.c | 29 ++++++++++++++++++++++ drivers/soc/renesas/rcar-sysc.c | 3 +++ drivers/soc/renesas/rcar-sysc.h | 1 + include/dt-bindings/power/r8a77470-sysc.h | 22 ++++++++++++++++ 7 files changed, 62 insertions(+) create mode 100644 drivers/soc/renesas/r8a77470-sysc.c create mode 100644 include/dt-bindings/power/r8a77470-sysc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt b/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt index ab399e559257..3e91d2032253 100644 --- a/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt +++ b/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt @@ -9,6 +9,7 @@ Required properties: - compatible: Must contain exactly one of the following: - "renesas,r8a7743-sysc" (RZ/G1M) - "renesas,r8a7745-sysc" (RZ/G1E) + - "renesas,r8a77470-sysc" (RZ/G1C) - "renesas,r8a7779-sysc" (R-Car H1) - "renesas,r8a7790-sysc" (R-Car H2) - "renesas,r8a7791-sysc" (R-Car M2-W) diff --git a/drivers/soc/renesas/Kconfig b/drivers/soc/renesas/Kconfig index 3bbe6114a420..96dd93660359 100644 --- a/drivers/soc/renesas/Kconfig +++ b/drivers/soc/renesas/Kconfig @@ -7,6 +7,7 @@ config SOC_RENESAS ARCH_R8A77970 || ARCH_R8A77980 || ARCH_R8A77995 select SYSC_R8A7743 if ARCH_R8A7743 select SYSC_R8A7745 if ARCH_R8A7745 + select SYSC_R8A77470 if ARCH_R8A77470 select SYSC_R8A7779 if ARCH_R8A7779 select SYSC_R8A7790 if ARCH_R8A7790 select SYSC_R8A7791 if ARCH_R8A7791 || ARCH_R8A7793 @@ -30,6 +31,10 @@ config SYSC_R8A7745 bool "RZ/G1E System Controller support" if COMPILE_TEST select SYSC_RCAR +config SYSC_R8A77470 + bool "RZ/G1C System Controller support" if COMPILE_TEST + select SYSC_RCAR + config SYSC_R8A7779 bool "R-Car H1 System Controller support" if COMPILE_TEST select SYSC_RCAR diff --git a/drivers/soc/renesas/Makefile b/drivers/soc/renesas/Makefile index ccb5ec57a262..a86ece7b84d1 100644 --- a/drivers/soc/renesas/Makefile +++ b/drivers/soc/renesas/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SOC_RENESAS) += renesas-soc.o # SoC obj-$(CONFIG_SYSC_R8A7743) += r8a7743-sysc.o obj-$(CONFIG_SYSC_R8A7745) += r8a7745-sysc.o +obj-$(CONFIG_SYSC_R8A77470) += r8a77470-sysc.o obj-$(CONFIG_SYSC_R8A7779) += r8a7779-sysc.o obj-$(CONFIG_SYSC_R8A7790) += r8a7790-sysc.o obj-$(CONFIG_SYSC_R8A7791) += r8a7791-sysc.o diff --git a/drivers/soc/renesas/r8a77470-sysc.c b/drivers/soc/renesas/r8a77470-sysc.c new file mode 100644 index 000000000000..cfa015e208ef --- /dev/null +++ b/drivers/soc/renesas/r8a77470-sysc.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/G1C System Controller + * + * Copyright (C) 2018 Renesas Electronics Corp. 
+ */ + +#include <linux/bug.h> +#include <linux/kernel.h> + +#include <dt-bindings/power/r8a77470-sysc.h> + +#include "rcar-sysc.h" + +static const struct rcar_sysc_area r8a77470_areas[] __initconst = { + { "always-on", 0, 0, R8A77470_PD_ALWAYS_ON, -1, PD_ALWAYS_ON }, + { "ca7-scu", 0x100, 0, R8A77470_PD_CA7_SCU, R8A77470_PD_ALWAYS_ON, + PD_SCU }, + { "ca7-cpu0", 0x1c0, 0, R8A77470_PD_CA7_CPU0, R8A77470_PD_CA7_SCU, + PD_CPU_NOCR }, + { "ca7-cpu1", 0x1c0, 1, R8A77470_PD_CA7_CPU1, R8A77470_PD_CA7_SCU, + PD_CPU_NOCR }, + { "sgx", 0xc0, 0, R8A77470_PD_SGX, R8A77470_PD_ALWAYS_ON }, +}; + +const struct rcar_sysc_info r8a77470_sysc_info __initconst = { + .areas = r8a77470_areas, + .num_areas = ARRAY_SIZE(r8a77470_areas), +}; diff --git a/drivers/soc/renesas/rcar-sysc.c b/drivers/soc/renesas/rcar-sysc.c index faf20e719361..99203bdc333a 100644 --- a/drivers/soc/renesas/rcar-sysc.c +++ b/drivers/soc/renesas/rcar-sysc.c @@ -261,6 +261,9 @@ static const struct of_device_id rcar_sysc_matches[] __initconst = { #ifdef CONFIG_SYSC_R8A7745 { .compatible = "renesas,r8a7745-sysc", .data = &r8a7745_sysc_info }, #endif +#ifdef CONFIG_SYSC_R8A77470 + { .compatible = "renesas,r8a77470-sysc", .data = &r8a77470_sysc_info }, +#endif #ifdef CONFIG_SYSC_R8A7779 { .compatible = "renesas,r8a7779-sysc", .data = &r8a7779_sysc_info }, #endif diff --git a/drivers/soc/renesas/rcar-sysc.h b/drivers/soc/renesas/rcar-sysc.h index dcdc9ec8eba7..9b24e3af288f 100644 --- a/drivers/soc/renesas/rcar-sysc.h +++ b/drivers/soc/renesas/rcar-sysc.h @@ -51,6 +51,7 @@ struct rcar_sysc_info { extern const struct rcar_sysc_info r8a7743_sysc_info; extern const struct rcar_sysc_info r8a7745_sysc_info; +extern const struct rcar_sysc_info r8a77470_sysc_info; extern const struct rcar_sysc_info r8a7779_sysc_info; extern const struct rcar_sysc_info r8a7790_sysc_info; extern const struct rcar_sysc_info r8a7791_sysc_info; diff --git a/include/dt-bindings/power/r8a77470-sysc.h b/include/dt-bindings/power/r8a77470-sysc.h new file mode 100644 index 000000000000..8bf4db187c31 --- /dev/null +++ b/include/dt-bindings/power/r8a77470-sysc.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2018 Renesas Electronics Corp. + */ +#ifndef __DT_BINDINGS_POWER_R8A77470_SYSC_H__ +#define __DT_BINDINGS_POWER_R8A77470_SYSC_H__ + +/* + * These power domain indices match the numbers of the interrupt bits + * representing the power areas in the various Interrupt Registers + * (e.g. SYSCISR, Interrupt Status Register) + */ + +#define R8A77470_PD_CA7_CPU0 5 +#define R8A77470_PD_CA7_CPU1 6 +#define R8A77470_PD_SGX 20 +#define R8A77470_PD_CA7_SCU 21 + +/* Always-on power area */ +#define R8A77470_PD_ALWAYS_ON 32 + +#endif /* __DT_BINDINGS_POWER_R8A77470_SYSC_H__ */ -- cgit v1.2.3 From 355f9e6482b2e5c34aa46cac199eff56de286514 Mon Sep 17 00:00:00 2001 From: Takeshi Kihara Date: Wed, 11 Apr 2018 18:36:24 +0900 Subject: soc: renesas: Add r8a77990 SYSC PM Domain Binding Definitions This patch adds power domain indices for R-Car E3.
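[As a hedged illustration of what these indices are for, not code from the patch: each index doubles as a bit number in the SYSC interrupt registers, so a status check against a raw SYSCISR value reduces to a BIT() test.]

~~~~
#include <linux/bitops.h>
#include <linux/types.h>
#include <dt-bindings/power/r8a77990-sysc.h>

/* Sketch: did the CA53 CPU0 power area flag an event in SYSCISR? */
static inline bool ca53_cpu0_event(u32 syscisr)
{
	return syscisr & BIT(R8A77990_PD_CA53_CPU0);
}
~~~~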
Signed-off-by: Takeshi Kihara [shimoda: add commit log and SPDX-License-Identifier] Signed-off-by: Yoshihiro Shimoda Reviewed-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- include/dt-bindings/power/r8a77990-sysc.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/dt-bindings/power/r8a77990-sysc.h (limited to 'include') diff --git a/include/dt-bindings/power/r8a77990-sysc.h b/include/dt-bindings/power/r8a77990-sysc.h new file mode 100644 index 000000000000..944d85beec15 --- /dev/null +++ b/include/dt-bindings/power/r8a77990-sysc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Renesas Electronics Corp. + */ +#ifndef __DT_BINDINGS_POWER_R8A77990_SYSC_H__ +#define __DT_BINDINGS_POWER_R8A77990_SYSC_H__ + +/* + * These power domain indices match the numbers of the interrupt bits + * representing the power areas in the various Interrupt Registers + * (e.g. SYSCISR, Interrupt Status Register) + */ + +#define R8A77990_PD_CA53_CPU0 5 +#define R8A77990_PD_CA53_CPU1 6 +#define R8A77990_PD_CR7 13 +#define R8A77990_PD_A3VC 14 +#define R8A77990_PD_3DG_A 17 +#define R8A77990_PD_3DG_B 18 +#define R8A77990_PD_CA53_SCU 21 +#define R8A77990_PD_A2VC1 26 + +/* Always-on power area */ +#define R8A77990_PD_ALWAYS_ON 32 + +#endif /* __DT_BINDINGS_POWER_R8A77990_SYSC_H__ */ -- cgit v1.2.3 From a941e2fab3207cb0d57dc4ec47b1b12c8ea78b84 Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Wed, 4 Apr 2018 06:19:37 +0200 Subject: ASoC: topology: Fix bclk and fsync inversion in set_link_hw_format() The values of bclk and fsync are inverted WRT the codec. But the existing solution already works for Broadwell, see the alsa-lib config: `alsa-lib/src/conf/topology/broadwell/broadwell.conf` This commit provides a backwards-compatible solution to fix this misuse. Signed-off-by: Kirill Marinushkin Reviewed-by: Pierre-Louis Bossart Tested-by: Pan Xiuli Tested-by: Pierre-Louis Bossart Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Mark Brown Cc: Liam Girdwood Cc: linux-kernel@vger.kernel.org Cc: alsa-devel@alsa-project.org Signed-off-by: Mark Brown --- include/uapi/sound/asoc.h | 16 ++++++++++++++-- sound/soc/soc-topology.c | 12 +++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h index 69c37ecbff7e..f0e5e21efa54 100644 --- a/include/uapi/sound/asoc.h +++ b/include/uapi/sound/asoc.h @@ -160,6 +160,18 @@ #define SND_SOC_TPLG_LNK_FLGBIT_SYMMETRIC_SAMPLEBITS (1 << 2) #define SND_SOC_TPLG_LNK_FLGBIT_VOICE_WAKEUP (1 << 3) +/* DAI topology BCLK parameter + * For backwards compatibility, by default the codec is bclk master + */ +#define SND_SOC_TPLG_BCLK_CM 0 /* codec is bclk master */ +#define SND_SOC_TPLG_BCLK_CS 1 /* codec is bclk slave */ + +/* DAI topology FSYNC parameter + * For backwards compatibility, by default the codec is fsync master + */ +#define SND_SOC_TPLG_FSYNC_CM 0 /* codec is fsync master */ +#define SND_SOC_TPLG_FSYNC_CS 1 /* codec is fsync slave */ + +/* * Block Header. * This header precedes all object and object arrays below.
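Read together with the set_link_hw_format() hunk further below, the new CM/CS constants collapse into the standard SND_SOC_DAIFMT_* clock-master flags; a condensed, illustrative helper (ours, not part of the patch) showing that mapping:

~~~~
/* Illustrative only: the mapping the soc-topology.c hunk below implements. */
static unsigned int tplg_clk_masters_to_dai_fmt(u8 bclk_master, u8 fsync_master)
{
	bool codec_bclk_m = (bclk_master == SND_SOC_TPLG_BCLK_CM);
	bool codec_fsync_m = (fsync_master == SND_SOC_TPLG_FSYNC_CM);

	if (codec_bclk_m && codec_fsync_m)
		return SND_SOC_DAIFMT_CBM_CFM;	/* codec masters both clocks */
	if (!codec_bclk_m && codec_fsync_m)
		return SND_SOC_DAIFMT_CBS_CFM;	/* codec masters fsync only */
	if (codec_bclk_m && !codec_fsync_m)
		return SND_SOC_DAIFMT_CBM_CFS;	/* codec masters bclk only */
	return SND_SOC_DAIFMT_CBS_CFS;		/* codec is slave on both */
}
~~~~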
@@ -315,8 +327,8 @@ struct snd_soc_tplg_hw_config { __u8 clock_gated; /* 1 if clock can be gated to save power */ __u8 invert_bclk; /* 1 for inverted BCLK, 0 for normal */ __u8 invert_fsync; /* 1 for inverted frame clock, 0 for normal */ - __u8 bclk_master; /* 1 for master of BCLK, 0 for slave */ - __u8 fsync_master; /* 1 for master of FSYNC, 0 for slave */ + __u8 bclk_master; /* SND_SOC_TPLG_BCLK_ value */ + __u8 fsync_master; /* SND_SOC_TPLG_FSYNC_ value */ __u8 mclk_direction; /* 0 for input, 1 for output */ __le16 reserved; /* for 32bit alignment */ __le32 mclk_rate; /* MCLK or SYSCLK frequency in Hz */ diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 942c6e5eb4b7..1c55252641f3 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -2019,13 +2019,15 @@ static void set_link_hw_format(struct snd_soc_dai_link *link, link->dai_fmt |= SND_SOC_DAIFMT_IB_IF; /* clock masters */ - bclk_master = hw_config->bclk_master; - fsync_master = hw_config->fsync_master; - if (!bclk_master && !fsync_master) + bclk_master = (hw_config->bclk_master == + SND_SOC_TPLG_BCLK_CM); + fsync_master = (hw_config->fsync_master == + SND_SOC_TPLG_FSYNC_CM); + if (bclk_master && fsync_master) link->dai_fmt |= SND_SOC_DAIFMT_CBM_CFM; - else if (bclk_master && !fsync_master) - link->dai_fmt |= SND_SOC_DAIFMT_CBS_CFM; else if (!bclk_master && fsync_master) + link->dai_fmt |= SND_SOC_DAIFMT_CBS_CFM; + else if (bclk_master && !fsync_master) link->dai_fmt |= SND_SOC_DAIFMT_CBM_CFS; else link->dai_fmt |= SND_SOC_DAIFMT_CBS_CFS; -- cgit v1.2.3 From 933e1c4a667103c4d10ebdc9505a0a6abd8c3fbd Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Wed, 4 Apr 2018 06:19:38 +0200 Subject: ASoC: topology: Add missing clock gating parameter when parsing hw_configs The clock gating parameter is part of `dai_fmt`. It is supported by `alsa-lib` when creating a topology binary file, but ignored by the kernel when loading this topology file. After applying this commit, the clock gating parameter is not ignored any more. This solution is backwards compatible. The existing behaviour is not broken, because by default the parameter value is 0 and is ignored. snd_soc_tplg_hw_config.clock_gated = 0 => no effect snd_soc_tplg_hw_config.clock_gated = 1 => SND_SOC_DAIFMT_GATED snd_soc_tplg_hw_config.clock_gated = 2 => SND_SOC_DAIFMT_CONT For example, the following config, based on alsa-lib/src/conf/topology/broadwell/broadwell.conf, is now supported: ~~~~ SectionHWConfig."CodecHWConfig" { id "1" format "I2S" # physical audio format.
pm_gate_clocks "true" # clock can be gated } SectionLink."Codec" { # used for binding to the physical link id "0" hw_configs [ "CodecHWConfig" ] default_hw_conf_id "1" } ~~~~ Signed-off-by: Kirill Marinushkin Reviewed-by: Pierre-Louis Bossart Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Mark Brown Cc: Pan Xiuli Cc: Liam Girdwood Cc: linux-kernel@vger.kernel.org Cc: alsa-devel@alsa-project.org Signed-off-by: Mark Brown --- include/uapi/sound/asoc.h | 7 ++++++- sound/soc/soc-topology.c | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h index f0e5e21efa54..f3c4b46e39d8 100644 --- a/include/uapi/sound/asoc.h +++ b/include/uapi/sound/asoc.h @@ -139,6 +139,11 @@ #define SND_SOC_TPLG_DAI_FLGBIT_SYMMETRIC_CHANNELS (1 << 1) #define SND_SOC_TPLG_DAI_FLGBIT_SYMMETRIC_SAMPLEBITS (1 << 2) +/* DAI clock gating */ +#define SND_SOC_TPLG_DAI_CLK_GATE_UNDEFINED 0 +#define SND_SOC_TPLG_DAI_CLK_GATE_GATED 1 +#define SND_SOC_TPLG_DAI_CLK_GATE_CONT 2 + /* DAI physical PCM data formats. * Add new formats to the end of the list. */ @@ -324,7 +329,7 @@ struct snd_soc_tplg_hw_config { __le32 size; /* in bytes of this structure */ __le32 id; /* unique ID - - used to match */ __le32 fmt; /* SND_SOC_DAI_FORMAT_ format value */ - __u8 clock_gated; /* 1 if clock can be gated to save power */ + __u8 clock_gated; /* SND_SOC_TPLG_DAI_CLK_GATE_ value */ __u8 invert_bclk; /* 1 for inverted BCLK, 0 for normal */ __u8 invert_fsync; /* 1 for inverted frame clock, 0 for normal */ __u8 bclk_master; /* SND_SOC_TPLG_BCLK_ value */ diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 1c55252641f3..aab31144f683 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -2006,6 +2006,13 @@ static void set_link_hw_format(struct snd_soc_dai_link *link, link->dai_fmt = hw_config->fmt & SND_SOC_DAIFMT_FORMAT_MASK; + /* clock gating */ + if (hw_config->clock_gated == SND_SOC_TPLG_DAI_CLK_GATE_GATED) + link->dai_fmt |= SND_SOC_DAIFMT_GATED; + else if (hw_config->clock_gated == + SND_SOC_TPLG_DAI_CLK_GATE_CONT) + link->dai_fmt |= SND_SOC_DAIFMT_CONT; + /* clock signal polarity */ invert_bclk = hw_config->invert_bclk; invert_fsync = hw_config->invert_fsync; -- cgit v1.2.3 From e590522a06adce8ca2eb47e77d80616cd1542d91 Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Wed, 4 Apr 2018 06:19:39 +0200 Subject: ASoC: topology: Add definitions for mclk_direction values The current comment does not make the direction of mclk clear. Previously, a similar description caused a misunderstanding for bclk_master and fsync_master. This commit resolves the potential confusion the same way it was resolved for bclk_master and fsync_master. Signed-off-by: Kirill Marinushkin Acked-by: Pierre-Louis Bossart Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Mark Brown Cc: Pan Xiuli Cc: Liam Girdwood Cc: linux-kernel@vger.kernel.org Cc: alsa-devel@alsa-project.org Signed-off-by: Mark Brown --- include/uapi/sound/asoc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h index f3c4b46e39d8..b901cdbe532a 100644 --- a/include/uapi/sound/asoc.h +++ b/include/uapi/sound/asoc.h @@ -144,6 +144,10 @@ #define SND_SOC_TPLG_DAI_CLK_GATE_GATED 1 #define SND_SOC_TPLG_DAI_CLK_GATE_CONT 2 +/* DAI mclk_direction */ +#define SND_SOC_TPLG_MCLK_CO 0 /* for codec, mclk is output */ +#define SND_SOC_TPLG_MCLK_CI 1 /* for codec, mclk is input */ + /* DAI physical PCM data formats.
* Add new formats to the end of the list. */ @@ -334,7 +338,7 @@ struct snd_soc_tplg_hw_config { __u8 invert_fsync; /* 1 for inverted frame clock, 0 for normal */ __u8 bclk_master; /* SND_SOC_TPLG_BCLK_ value */ __u8 fsync_master; /* SND_SOC_TPLG_FSYNC_ value */ - __u8 mclk_direction; /* 0 for input, 1 for output */ + __u8 mclk_direction; /* SND_SOC_TPLG_MCLK_ value */ __le16 reserved; /* for 32bit alignment */ __le32 mclk_rate; /* MCLK or SYSCLK frequency in Hz */ __le32 bclk_rate; /* BCLK frequency in Hz */ -- cgit v1.2.3 From 44773ba170a6f969620221a6d87d03feae5e464f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 16 Apr 2018 10:22:01 -0700 Subject: ARM: OMAP2+: Drop unused pm-noop Looks like these functions don't do anything in the mainline kernel so we can just drop them. Note that we must now also remove ir-rx51 pdata as it relies on the dummy platform data that does not do anything. And ir-rx51 calls a pdata callback that does nothing, without first checking that it exists. For configuring device-specific minimal latencies, the interface to use is pm_qos_add_request(). For an example, see what was done in commit 9834ffd1ecc3 ("ASoC: omap-mcbsp: Add PM QoS support for McBSP to prevent glitches"). I've added some comments to ir-rx51 so people using it can add pm_qos support and test it. Cc: Ivaylo Dimitrov Cc: Kevin Hilman Cc: Laurent Pinchart Cc: Tomi Valkeinen Acked-by: Mauro Carvalho Chehab Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/Makefile | 1 - arch/arm/mach-omap2/display.c | 7 -- arch/arm/mach-omap2/hsmmc.c | 1 - arch/arm/mach-omap2/i2c.c | 1 - arch/arm/mach-omap2/io.c | 3 - arch/arm/mach-omap2/omap-pm-noop.c | 176 ---------------------------- arch/arm/mach-omap2/omap-pm.h | 161 ------------------------- arch/arm/mach-omap2/pdata-quirks.c | 15 --- arch/arm/mach-omap2/pm-debug.c | 5 - arch/arm/mach-omap2/pm.c | 10 +- arch/arm/mach-omap2/timer.c | 1 - arch/arm/plat-omap/Kconfig | 10 -- drivers/media/rc/ir-rx51.c | 17 +-- include/linux/platform_data/media/ir-rx51.h | 9 -- 14 files changed, 4 insertions(+), 413 deletions(-) delete mode 100644 arch/arm/mach-omap2/omap-pm-noop.c delete mode 100644 arch/arm/mach-omap2/omap-pm.h delete mode 100644 include/linux/platform_data/media/ir-rx51.h (limited to 'include') diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 4603c30fef73..95b64b2940e9 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -78,7 +78,6 @@ endif omap-4-5-pm-common = omap-mpuss-lowpower.o obj-$(CONFIG_ARCH_OMAP4) += $(omap-4-5-pm-common) obj-$(CONFIG_SOC_OMAP5) += $(omap-4-5-pm-common) -obj-$(CONFIG_OMAP_PM_NOOP) += omap-pm-noop.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_ARCH_OMAP2) += pm24xx.o diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index b3f6eb5d04a2..480a2b989908 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -32,7 +32,6 @@ #include #include "omap_hwmod.h" #include "omap_device.h" -#include "omap-pm.h" #include "common.h" #include "soc.h" @@ -126,11 +125,6 @@ static void omap_dsi_disable_pads(int dsi_id, unsigned lane_mask) omap4_dsi_mux_pads(dsi_id, 0); } -static int omap_dss_set_min_bus_tput(struct device *dev, unsigned long tput) -{ - return omap_pm_set_min_bus_tput(dev, OCP_INITIATOR_AGENT, tput); -} - static enum omapdss_version __init omap_display_get_version(void) { if (cpu_is_omap24xx()) @@ -169,7 +163,6 @@ static int __init omapdss_init_fbdev(void) static struct omap_dss_board_info board_data = { .dsi_enable_pads =
omap_dsi_enable_pads, .dsi_disable_pads = omap_dsi_disable_pads, - .set_min_bus_tput = omap_dss_set_min_bus_tput, }; struct device_node *node; int r; diff --git a/arch/arm/mach-omap2/hsmmc.c b/arch/arm/mach-omap2/hsmmc.c index b064066d431c..0103548b0b15 100644 --- a/arch/arm/mach-omap2/hsmmc.c +++ b/arch/arm/mach-omap2/hsmmc.c @@ -18,7 +18,6 @@ #include "soc.h" #include "omap_device.h" -#include "omap-pm.h" #include "hsmmc.h" #include "control.h" diff --git a/arch/arm/mach-omap2/i2c.c b/arch/arm/mach-omap2/i2c.c index 91a21c3923b2..37ff25ee3d89 100644 --- a/arch/arm/mach-omap2/i2c.c +++ b/arch/arm/mach-omap2/i2c.c @@ -22,7 +22,6 @@ #include "soc.h" #include "omap_hwmod.h" #include "omap_device.h" -#include "omap-pm.h" #include "prm.h" #include "common.h" diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index cf546dfe3b32..6ce60a478409 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -37,7 +37,6 @@ #include "clock.h" #include "clock2xxx.h" #include "clock3xxx.h" -#include "omap-pm.h" #include "sdrc.h" #include "control.h" #include "serial.h" @@ -421,8 +420,6 @@ static void __init __maybe_unused omap_hwmod_init_postsetup(void) postsetup_state = _HWMOD_STATE_ENABLED; #endif omap_hwmod_for_each(_set_hwmod_postsetup_state, &postsetup_state); - - omap_pm_if_early_init(); } static void __init __maybe_unused omap_common_late_init(void) diff --git a/arch/arm/mach-omap2/omap-pm-noop.c b/arch/arm/mach-omap2/omap-pm-noop.c deleted file mode 100644 index 4ead077ea4e7..000000000000 --- a/arch/arm/mach-omap2/omap-pm-noop.c +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * omap-pm-noop.c - OMAP power management interface - dummy version - * - * This code implements the OMAP power management interface to - * drivers, CPUIdle, CPUFreq, and DSP Bridge. It is strictly for - * debug/demonstration use, as it does nothing but printk() whenever a - * function is called (when DEBUG is defined, below) - * - * Copyright (C) 2008-2009 Texas Instruments, Inc. - * Copyright (C) 2008-2009 Nokia Corporation - * Paul Walmsley - * - * Interface developed by (in alphabetical order): - * Karthik Dasu, Tony Lindgren, Rajendra Nayak, Sakari Poussa, Veeramanikandan - * Raju, Anand Sawant, Igor Stoppa, Paul Walmsley, Richard Woodruff - */ - -#undef DEBUG - -#include -#include -#include -#include - -#include "omap_device.h" -#include "omap-pm.h" - -static bool off_mode_enabled; -static int dummy_context_loss_counter; - -/* - * Device-driver-originated constraints (via board-*.c files) - */ - -int omap_pm_set_max_mpu_wakeup_lat(struct device *dev, long t) -{ - if (!dev || t < -1) { - WARN(1, "OMAP PM: %s: invalid parameter(s)", __func__); - return -EINVAL; - } - - if (t == -1) - pr_debug("OMAP PM: remove max MPU wakeup latency constraint: dev %s\n", - dev_name(dev)); - else - pr_debug("OMAP PM: add max MPU wakeup latency constraint: dev %s, t = %ld usec\n", - dev_name(dev), t); - - /* - * For current Linux, this needs to map the MPU to a - * powerdomain, then go through the list of current max lat - * constraints on the MPU and find the smallest. If - * the latency constraint has changed, the code should - * recompute the state to enter for the next powerdomain - * state. - * - * TI CDP code can call constraint_set here. 
- */ - - return 0; -} - -int omap_pm_set_min_bus_tput(struct device *dev, u8 agent_id, unsigned long r) -{ - if (!dev || (agent_id != OCP_INITIATOR_AGENT && - agent_id != OCP_TARGET_AGENT)) { - WARN(1, "OMAP PM: %s: invalid parameter(s)", __func__); - return -EINVAL; - } - - if (r == 0) - pr_debug("OMAP PM: remove min bus tput constraint: dev %s for agent_id %d\n", - dev_name(dev), agent_id); - else - pr_debug("OMAP PM: add min bus tput constraint: dev %s for agent_id %d: rate %ld KiB\n", - dev_name(dev), agent_id, r); - - /* - * This code should model the interconnect and compute the - * required clock frequency, convert that to a VDD2 OPP ID, then - * set the VDD2 OPP appropriately. - * - * TI CDP code can call constraint_set here on the VDD2 OPP. - */ - - return 0; -} - -/* - * DSP Bridge-specific constraints - */ - - -/** - * omap_pm_enable_off_mode - notify OMAP PM that off-mode is enabled - * - * Intended for use only by OMAP PM core code to notify this layer - * that off mode has been enabled. - */ -void omap_pm_enable_off_mode(void) -{ - off_mode_enabled = true; -} - -/** - * omap_pm_disable_off_mode - notify OMAP PM that off-mode is disabled - * - * Intended for use only by OMAP PM core code to notify this layer - * that off mode has been disabled. - */ -void omap_pm_disable_off_mode(void) -{ - off_mode_enabled = false; -} - -/* - * Device context loss tracking - */ - -#ifdef CONFIG_ARCH_OMAP2PLUS - -int omap_pm_get_dev_context_loss_count(struct device *dev) -{ - struct platform_device *pdev = to_platform_device(dev); - int count; - - if (WARN_ON(!dev)) - return -ENODEV; - - if (dev->pm_domain == &omap_device_pm_domain) { - count = omap_device_get_context_loss_count(pdev); - } else { - WARN_ONCE(off_mode_enabled, "omap_pm: using dummy context loss counter; device %s should be converted to omap_device", - dev_name(dev)); - - count = dummy_context_loss_counter; - - if (off_mode_enabled) { - count++; - /* - * Context loss count has to be a non-negative value. - * Clear the sign bit to get a value range from 0 to - * INT_MAX. - */ - count &= INT_MAX; - dummy_context_loss_counter = count; - } - } - - pr_debug("OMAP PM: context loss count for dev %s = %d\n", - dev_name(dev), count); - - return count; -} - -#else - -int omap_pm_get_dev_context_loss_count(struct device *dev) -{ - return dummy_context_loss_counter; -} - -#endif - -/* Should be called before clk framework init */ -int __init omap_pm_if_early_init(void) -{ - return 0; -} - -/* Must be called after clock framework is initialized */ -int __init omap_pm_if_init(void) -{ - return 0; -} diff --git a/arch/arm/mach-omap2/omap-pm.h b/arch/arm/mach-omap2/omap-pm.h deleted file mode 100644 index 5ba5df47f91b..000000000000 --- a/arch/arm/mach-omap2/omap-pm.h +++ /dev/null @@ -1,161 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * omap-pm.h - OMAP power management interface - * - * Copyright (C) 2008-2010 Texas Instruments, Inc. 
- * Copyright (C) 2008-2010 Nokia Corporation - * Paul Walmsley - * - * Interface developed by (in alphabetical order): Karthik Dasu, Jouni - * Högander, Tony Lindgren, Rajendra Nayak, Sakari Poussa, - * Veeramanikandan Raju, Anand Sawant, Igor Stoppa, Paul Walmsley, - * Richard Woodruff - */ - -#ifndef ASM_ARM_ARCH_OMAP_OMAP_PM_H -#define ASM_ARM_ARCH_OMAP_OMAP_PM_H - -#include -#include -#include -#include - -/* - * agent_id values for use with omap_pm_set_min_bus_tput(): - * - * OCP_INITIATOR_AGENT is only valid for devices that can act as - * initiators -- it represents the device's L3 interconnect - * connection. OCP_TARGET_AGENT represents the device's L4 - * interconnect connection. - */ -#define OCP_TARGET_AGENT 1 -#define OCP_INITIATOR_AGENT 2 - -/** - * omap_pm_if_early_init - OMAP PM init code called before clock fw init - * @mpu_opp_table: array ptr to struct omap_opp for MPU - * @dsp_opp_table: array ptr to struct omap_opp for DSP - * @l3_opp_table : array ptr to struct omap_opp for CORE - * - * Initialize anything that must be configured before the clock - * framework starts. The "_if_" is to avoid name collisions with the - * PM idle-loop code. - */ -int __init omap_pm_if_early_init(void); - -/** - * omap_pm_if_init - OMAP PM init code called after clock fw init - * - * The main initialization code. OPP tables are passed in here. The - * "_if_" is to avoid name collisions with the PM idle-loop code. - */ -int __init omap_pm_if_init(void); - -/* - * Device-driver-originated constraints (via board-*.c files, platform_data) - */ - - -/** - * omap_pm_set_max_mpu_wakeup_lat - set the maximum MPU wakeup latency - * @dev: struct device * requesting the constraint - * @t: maximum MPU wakeup latency in microseconds - * - * Request that the maximum interrupt latency for the MPU to be no - * greater than @t microseconds. "Interrupt latency" in this case is - * defined as the elapsed time from the occurrence of a hardware or - * timer interrupt to the time when the device driver's interrupt - * service routine has been entered by the MPU. - * - * It is intended that underlying PM code will use this information to - * determine what power state to put the MPU powerdomain into, and - * possibly the CORE powerdomain as well, since interrupt handling - * code currently runs from SDRAM. Advanced PM or board*.c code may - * also configure interrupt controller priorities, OCP bus priorities, - * CPU speed(s), etc. - * - * This function will not affect device wakeup latency, e.g., time - * elapsed from when a device driver enables a hardware device with - * clk_enable(), to when the device is ready for register access or - * other use. To control this device wakeup latency, use - * omap_pm_set_max_dev_wakeup_lat() - * - * Multiple calls to omap_pm_set_max_mpu_wakeup_lat() will replace the - * previous t value. To remove the latency target for the MPU, call - * with t = -1. - * - * XXX This constraint will be deprecated soon in favor of the more - * general omap_pm_set_max_dev_wakeup_lat() - * - * Returns -EINVAL for an invalid argument, -ERANGE if the constraint - * is not satisfiable, or 0 upon success. 
- */ -int omap_pm_set_max_mpu_wakeup_lat(struct device *dev, long t); - - -/** - * omap_pm_set_min_bus_tput - set minimum bus throughput needed by device - * @dev: struct device * requesting the constraint - * @tbus_id: interconnect to operate on (OCP_{INITIATOR,TARGET}_AGENT) - * @r: minimum throughput (in KiB/s) - * - * Request that the minimum data throughput on the OCP interconnect - * attached to device @dev interconnect agent @tbus_id be no less - * than @r KiB/s. - * - * It is expected that the OMAP PM or bus code will use this - * information to set the interconnect clock to run at the lowest - * possible speed that satisfies all current system users. The PM or - * bus code will adjust the estimate based on its model of the bus, so - * device driver authors should attempt to specify an accurate - * quantity for their device use case, and let the PM or bus code - * overestimate the numbers as necessary to handle request/response - * latency, other competing users on the system, etc. On OMAP2/3, if - * a driver requests a minimum L4 interconnect speed constraint, the - * code will also need to add an minimum L3 interconnect speed - * constraint, - * - * Multiple calls to omap_pm_set_min_bus_tput() will replace the - * previous rate value for this device. To remove the interconnect - * throughput restriction for this device, call with r = 0. - * - * Returns -EINVAL for an invalid argument, -ERANGE if the constraint - * is not satisfiable, or 0 upon success. - */ -int omap_pm_set_min_bus_tput(struct device *dev, u8 agent_id, unsigned long r); - - -/* - * CPUFreq-originated constraint - * - * In the future, this should be handled by custom OPP clocktype - * functions. - */ - - -/* - * Device context loss tracking - */ - -/** - * omap_pm_get_dev_context_loss_count - return count of times dev has lost ctx - * @dev: struct device * - * - * This function returns the number of times that the device @dev has - * lost its internal context. This generally occurs on a powerdomain - * transition to OFF. Drivers use this as an optimization to avoid restoring - * context if the device hasn't lost it. To use, drivers should initially - * call this in their context save functions and store the result. Early in - * the driver's context restore function, the driver should call this function - * again, and compare the result to the stored counter. If they differ, the - * driver must restore device context. If the number of context losses - * exceeds the maximum positive integer, the function will wrap to 0 and - * continue counting. Returns the number of context losses for this device, - * or negative value upon error. 
- */ -int omap_pm_get_dev_context_loss_count(struct device *dev); - -void omap_pm_enable_off_mode(void); -void omap_pm_disable_off_mode(void); - -#endif diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 6459816c2879..7f02743edbe4 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -26,14 +26,12 @@ #include #include #include -#include #include #include "common.h" #include "common-board-devices.h" #include "control.h" #include "omap_device.h" -#include "omap-pm.h" #include "omap-secure.h" #include "soc.h" #include "hsmmc.h" @@ -514,18 +512,6 @@ void omap_auxdata_legacy_init(struct device *dev) dev->platform_data = &twl_gpio_auxdata; } -static struct ir_rx51_platform_data __maybe_unused rx51_ir_data = { - .set_max_mpu_wakeup_lat = omap_pm_set_max_mpu_wakeup_lat, -}; - -static struct platform_device __maybe_unused rx51_ir_device = { - .name = "ir_rx51", - .id = -1, - .dev = { - .platform_data = &rx51_ir_data, - }, -}; - #if IS_ENABLED(CONFIG_SND_OMAP_SOC_MCBSP) static struct omap_mcbsp_platform_data mcbsp_pdata; static void __init omap3_mcbsp_init(void) @@ -569,7 +555,6 @@ static struct of_dev_auxdata omap_auxdata_lookup[] = { "480c9000.smartreflex", &omap_sr_pdata[OMAP_SR_MPU]), OF_DEV_AUXDATA("ti,omap3-hsmmc", 0x4809c000, "4809c000.mmc", &mmc_pdata[0]), OF_DEV_AUXDATA("ti,omap3-hsmmc", 0x480b4000, "480b4000.mmc", &mmc_pdata[1]), - OF_DEV_AUXDATA("nokia,n900-ir", 0, "n900-ir", &rx51_ir_data), /* Only on am3517 */ OF_DEV_AUXDATA("ti,davinci_mdio", 0x5c030000, "davinci_mdio.0", NULL), OF_DEV_AUXDATA("ti,am3517-emac", 0x5c000000, "davinci_emac.0", diff --git a/arch/arm/mach-omap2/pm-debug.c b/arch/arm/mach-omap2/pm-debug.c index 5c46ea6756d7..acb698d5780f 100644 --- a/arch/arm/mach-omap2/pm-debug.c +++ b/arch/arm/mach-omap2/pm-debug.c @@ -31,7 +31,6 @@ #include "clock.h" #include "powerdomain.h" #include "clockdomain.h" -#include "omap-pm.h" #include "soc.h" #include "cm2xxx_3xxx.h" @@ -240,10 +239,6 @@ static int option_set(void *data, u64 val) *option = val; if (option == &enable_off_mode) { - if (val) - omap_pm_enable_off_mode(); - else - omap_pm_disable_off_mode(); if (cpu_is_omap34xx()) omap3_pm_off_mode_enable(val); } diff --git a/arch/arm/mach-omap2/pm.c b/arch/arm/mach-omap2/pm.c index 6f68576e5695..b98c46d7f112 100644 --- a/arch/arm/mach-omap2/pm.c +++ b/arch/arm/mach-omap2/pm.c @@ -16,11 +16,11 @@ #include #include #include +#include #include #include -#include "omap-pm.h" #include "omap_device.h" #include "common.h" @@ -230,14 +230,6 @@ static void __init omap4_init_voltages(void) omap2_set_init_voltage("iva", "dpll_iva_m5x2_ck", "iva"); } -static int __init omap2_common_pm_init(void) -{ - omap_pm_if_init(); - - return 0; -} -omap_postcore_initcall(omap2_common_pm_init); - int __init omap2_common_pm_late_init(void) { /* Init the voltage layer */ diff --git a/arch/arm/mach-omap2/timer.c b/arch/arm/mach-omap2/timer.c index 4fb4dc24e5e9..61dd72df119c 100644 --- a/arch/arm/mach-omap2/timer.c +++ b/arch/arm/mach-omap2/timer.c @@ -50,7 +50,6 @@ #include "omap_device.h" #include #include -#include "omap-pm.h" #include "soc.h" #include "common.h" diff --git a/arch/arm/plat-omap/Kconfig b/arch/arm/plat-omap/Kconfig index afc1a1d4f7a5..c0a242cae79a 100644 --- a/arch/arm/plat-omap/Kconfig +++ b/arch/arm/plat-omap/Kconfig @@ -115,16 +115,6 @@ config OMAP_SERIAL_WAKE to data on the serial RX line. This allows you to wake the system from serial console. 
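The REVISIT comments added to ir-rx51 below deliberately leave the pm_qos conversion to whoever can test it; a minimal sketch of that conversion, assuming the pm_qos_add_request() API of this era and reusing the 50 us bound from the deleted pdata callback:

~~~~
#include <linux/pm_qos.h>

static struct pm_qos_request ir_rx51_qos;	/* name is illustrative */

/* Before TX: bound CPU wakeup latency so deep idle states cannot
 * distort the generated IR pulse train (50 us mirrors the removed
 * set_max_mpu_wakeup_lat() call). */
pm_qos_add_request(&ir_rx51_qos, PM_QOS_CPU_DMA_LATENCY, 50);

/* ... transmit ... */

/* After TX: drop the constraint so the SoC may enter deep sleep again. */
pm_qos_remove_request(&ir_rx51_qos);
~~~~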
-choice - prompt "OMAP PM layer selection" - depends on ARCH_OMAP - default OMAP_PM_NOOP - -config OMAP_PM_NOOP - bool "No-op/debug PM layer" - -endchoice - endmenu endif diff --git a/drivers/media/rc/ir-rx51.c b/drivers/media/rc/ir-rx51.c index 49265f02e772..8a93f7468622 100644 --- a/drivers/media/rc/ir-rx51.c +++ b/drivers/media/rc/ir-rx51.c @@ -22,7 +22,6 @@ #include #include -#include #define WBUF_LEN 256 @@ -31,7 +30,6 @@ struct ir_rx51 { struct pwm_device *pwm; struct hrtimer timer; struct device *dev; - struct ir_rx51_platform_data *pdata; wait_queue_head_t wqueue; unsigned int freq; /* carrier frequency */ @@ -130,10 +128,9 @@ static int ir_rx51_tx(struct rc_dev *dev, unsigned int *buffer, ir_rx51->wbuf[count] = -1; /* Insert termination mark */ /* - * Adjust latency requirements so the device doesn't go in too - * deep sleep states + * REVISIT: Adjust latency requirements so the device doesn't go in too + * deep sleep states with pm_qos_add_request(). */ - ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, 50); ir_rx51_on(ir_rx51); ir_rx51->wbuf_index = 1; @@ -146,8 +143,7 @@ static int ir_rx51_tx(struct rc_dev *dev, unsigned int *buffer, */ wait_event_interruptible(ir_rx51->wqueue, ir_rx51->wbuf_index < 0); - /* We can sleep again */ - ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, -1); + /* REVISIT: Remove pm_qos constraint, we can sleep again */ return count; } @@ -244,13 +240,6 @@ static int ir_rx51_probe(struct platform_device *dev) struct pwm_device *pwm; struct rc_dev *rcdev; - ir_rx51.pdata = dev->dev.platform_data; - - if (!ir_rx51.pdata) { - dev_err(&dev->dev, "Platform Data is missing\n"); - return -ENXIO; - } - pwm = pwm_get(&dev->dev, NULL); if (IS_ERR(pwm)) { int err = PTR_ERR(pwm); diff --git a/include/linux/platform_data/media/ir-rx51.h b/include/linux/platform_data/media/ir-rx51.h deleted file mode 100644 index 9d127aa648e7..000000000000 --- a/include/linux/platform_data/media/ir-rx51.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _IR_RX51_H -#define _IR_RX51_H - -struct ir_rx51_platform_data { - int(*set_max_mpu_wakeup_lat)(struct device *dev, long t); -}; - -#endif -- cgit v1.2.3 From 69c45d57bab00d5777e7686b3965b68b8ab043c7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 16 Apr 2018 14:20:26 -0400 Subject: remove rpc_rmdir() no users since 2014... 
Signed-off-by: Al Viro --- include/linux/sunrpc/rpc_pipe_fs.h | 2 -- net/sunrpc/rpc_pipe.c | 16 ---------------- 2 files changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index a5704daf5df9..e90b9bd99ded 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -122,8 +122,6 @@ extern struct dentry *rpc_create_cache_dir(struct dentry *, struct cache_detail *); extern void rpc_remove_cache_dir(struct dentry *); -extern int rpc_rmdir(struct dentry *dentry); - struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags); void rpc_destroy_pipe_data(struct rpc_pipe *pipe); extern struct dentry *rpc_mkpipe_dentry(struct dentry *, const char *, void *, diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 0f08934b2cea..9ec3bd7369dc 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -609,22 +609,6 @@ static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -int rpc_rmdir(struct dentry *dentry) -{ - struct dentry *parent; - struct inode *dir; - int error; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - error = __rpc_rmdir(dir, dentry); - inode_unlock(dir); - dput(parent); - return error; -} -EXPORT_SYMBOL_GPL(rpc_rmdir); - static int __rpc_unlink(struct inode *dir, struct dentry *dentry) { int ret; -- cgit v1.2.3 From d59fb2856223219ccaa73bd2e96021f02ea5c266 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Thu, 22 Mar 2018 14:12:33 +0800 Subject: ASoC: rt5668: add rt5668B codec driver This is the initial codec driver for rt5668b. Signed-off-by: Bard Liao Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/rt5668.txt | 50 + include/sound/rt5668.h | 40 + sound/soc/codecs/Kconfig | 6 + sound/soc/codecs/Makefile | 2 + sound/soc/codecs/rt5668.c | 2639 ++++++++++++++++++++ sound/soc/codecs/rt5668.h | 1318 ++++++++++ 6 files changed, 4055 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/rt5668.txt create mode 100644 include/sound/rt5668.h create mode 100644 sound/soc/codecs/rt5668.c create mode 100644 sound/soc/codecs/rt5668.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/sound/rt5668.txt b/Documentation/devicetree/bindings/sound/rt5668.txt new file mode 100644 index 000000000000..c88b96e7764b --- /dev/null +++ b/Documentation/devicetree/bindings/sound/rt5668.txt @@ -0,0 +1,50 @@ +RT5668B audio CODEC + +This device supports I2C only. + +Required properties: + +- compatible : "realtek,rt5668b" + +- reg : The I2C address of the device. + +Optional properties: + +- interrupts : The CODEC's interrupt output. + +- realtek,dmic1-data-pin + 0: dmic1 is not used + 1: using GPIO2 pin as dmic1 data pin + 2: using GPIO5 pin as dmic1 data pin + +- realtek,dmic1-clk-pin + 0: using GPIO1 pin as dmic1 clock pin + 1: using GPIO3 pin as dmic1 clock pin + +- realtek,jd-src + 0: No JD is used + 1: using JD1 as JD source + +- realtek,ldo1-en-gpios : The GPIO that controls the CODEC's LDO1_EN pin. 
+ +Pins on the device (for linking into audio routes) for RT5668B: + + * DMIC L1 + * DMIC R1 + * IN1P + * HPOL + * HPOR + +Example: + +rt5668 { + compatible = "realtek,rt5668b"; + reg = <0x1a>; + interrupt-parent = <&gpio>; + interrupts = ; + realtek,ldo1-en-gpios = + <&gpio TEGRA_GPIO(R, 2) GPIO_ACTIVE_HIGH>; + realtek,dmic1-data-pin = <1>; + realtek,dmic1-clk-pin = <1>; + realtek,jd-src = <1>; +}; diff --git a/include/sound/rt5668.h b/include/sound/rt5668.h new file mode 100644 index 000000000000..f907b78696cf --- /dev/null +++ b/include/sound/rt5668.h @@ -0,0 +1,40 @@ +/* + * linux/sound/rt5668.h -- Platform data for RT5668 + * + * Copyright 2018 Realtek Microelectronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_SND_RT5668_H +#define __LINUX_SND_RT5668_H + +enum rt5668_dmic1_data_pin { + RT5668_DMIC1_NULL, + RT5668_DMIC1_DATA_GPIO2, + RT5668_DMIC1_DATA_GPIO5, +}; + +enum rt5668_dmic1_clk_pin { + RT5668_DMIC1_CLK_GPIO1, + RT5668_DMIC1_CLK_GPIO3, +}; + +enum rt5668_jd_src { + RT5668_JD_NULL, + RT5668_JD1, +}; + +struct rt5668_platform_data { + + int ldo1_en; /* GPIO for LDO1_EN */ + + enum rt5668_dmic1_data_pin dmic1_data_pin; + enum rt5668_dmic1_clk_pin dmic1_clk_pin; + enum rt5668_jd_src jd_src; +}; + +#endif + diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 665edb5b77ff..251e67f180fe 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -137,6 +137,7 @@ config SND_SOC_ALL_CODECS select SND_SOC_RT5660 if I2C select SND_SOC_RT5663 if I2C select SND_SOC_RT5665 if I2C + select SND_SOC_RT5668 if I2C select SND_SOC_RT5670 if I2C select SND_SOC_RT5677 if I2C && SPI_MASTER select SND_SOC_SGTL5000 if I2C @@ -771,6 +772,7 @@ config SND_SOC_RL6231 default y if SND_SOC_RT5660=y default y if SND_SOC_RT5663=y default y if SND_SOC_RT5665=y + default y if SND_SOC_RT5668=y default y if SND_SOC_RT5670=y default y if SND_SOC_RT5677=y default y if SND_SOC_RT1305=y @@ -783,6 +785,7 @@ config SND_SOC_RL6231 default m if SND_SOC_RT5660=m default m if SND_SOC_RT5663=m default m if SND_SOC_RT5665=m + default m if SND_SOC_RT5668=m default m if SND_SOC_RT5670=m default m if SND_SOC_RT5677=m default m if SND_SOC_RT1305=m @@ -850,6 +853,9 @@ config SND_SOC_RT5663 config SND_SOC_RT5665 tristate +config SND_SOC_RT5668 + tristate + config SND_SOC_RT5670 tristate diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index cccd7749e319..d3b73021a401 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -141,6 +141,7 @@ snd-soc-rt5659-objs := rt5659.o snd-soc-rt5660-objs := rt5660.o snd-soc-rt5663-objs := rt5663.o snd-soc-rt5665-objs := rt5665.o +snd-soc-rt5668-objs := rt5668.o snd-soc-rt5670-objs := rt5670.o snd-soc-rt5677-objs := rt5677.o snd-soc-rt5677-spi-objs := rt5677-spi.o @@ -396,6 +397,7 @@ obj-$(CONFIG_SND_SOC_RT5659) += snd-soc-rt5659.o obj-$(CONFIG_SND_SOC_RT5660) += snd-soc-rt5660.o obj-$(CONFIG_SND_SOC_RT5663) += snd-soc-rt5663.o obj-$(CONFIG_SND_SOC_RT5665) += snd-soc-rt5665.o +obj-$(CONFIG_SND_SOC_RT5668) += snd-soc-rt5668.o obj-$(CONFIG_SND_SOC_RT5670) += snd-soc-rt5670.o obj-$(CONFIG_SND_SOC_RT5677) += snd-soc-rt5677.o obj-$(CONFIG_SND_SOC_RT5677_SPI) += snd-soc-rt5677-spi.o diff --git a/sound/soc/codecs/rt5668.c b/sound/soc/codecs/rt5668.c new file mode 100644 index 000000000000..52a343f96eb2 --- /dev/null +++ b/sound/soc/codecs/rt5668.c @@ -0,0 +1,2639 @@ 
+/* + * rt5668.c -- RT5668B ALSA SoC audio component driver + * + * Copyright 2018 Realtek Semiconductor Corp. + * Author: Bard Liao + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rl6231.h" +#include "rt5668.h" + +#define RT5668_NUM_SUPPLIES 3 + +static const char *rt5668_supply_names[RT5668_NUM_SUPPLIES] = { + "AVDD", + "MICVDD", + "VBAT", +}; + +struct rt5668_priv { + struct snd_soc_component *component; + struct rt5668_platform_data pdata; + struct regmap *regmap; + struct snd_soc_jack *hs_jack; + struct regulator_bulk_data supplies[RT5668_NUM_SUPPLIES]; + struct delayed_work jack_detect_work; + struct delayed_work jd_check_work; + struct mutex calibrate_mutex; + + int sysclk; + int sysclk_src; + int lrck[RT5668_AIFS]; + int bclk[RT5668_AIFS]; + int master[RT5668_AIFS]; + + int pll_src; + int pll_in; + int pll_out; + + int jack_type; +}; + +static const struct reg_default rt5668_reg[] = { + {0x0002, 0x8080}, + {0x0003, 0x8000}, + {0x0005, 0x0000}, + {0x0006, 0x0000}, + {0x0008, 0x800f}, + {0x000b, 0x0000}, + {0x0010, 0x4040}, + {0x0011, 0x0000}, + {0x0012, 0x1404}, + {0x0013, 0x1000}, + {0x0014, 0xa00a}, + {0x0015, 0x0404}, + {0x0016, 0x0404}, + {0x0019, 0xafaf}, + {0x001c, 0x2f2f}, + {0x001f, 0x0000}, + {0x0022, 0x5757}, + {0x0023, 0x0039}, + {0x0024, 0x000b}, + {0x0026, 0xc0c4}, + {0x0029, 0x8080}, + {0x002a, 0xa0a0}, + {0x002b, 0x0300}, + {0x0030, 0x0000}, + {0x003c, 0x0080}, + {0x0044, 0x0c0c}, + {0x0049, 0x0000}, + {0x0061, 0x0000}, + {0x0062, 0x0000}, + {0x0063, 0x003f}, + {0x0064, 0x0000}, + {0x0065, 0x0000}, + {0x0066, 0x0030}, + {0x0067, 0x0000}, + {0x006b, 0x0000}, + {0x006c, 0x0000}, + {0x006d, 0x2200}, + {0x006e, 0x0a10}, + {0x0070, 0x8000}, + {0x0071, 0x8000}, + {0x0073, 0x0000}, + {0x0074, 0x0000}, + {0x0075, 0x0002}, + {0x0076, 0x0001}, + {0x0079, 0x0000}, + {0x007a, 0x0000}, + {0x007b, 0x0000}, + {0x007c, 0x0100}, + {0x007e, 0x0000}, + {0x0080, 0x0000}, + {0x0081, 0x0000}, + {0x0082, 0x0000}, + {0x0083, 0x0000}, + {0x0084, 0x0000}, + {0x0085, 0x0000}, + {0x0086, 0x0005}, + {0x0087, 0x0000}, + {0x0088, 0x0000}, + {0x008c, 0x0003}, + {0x008d, 0x0000}, + {0x008e, 0x0060}, + {0x008f, 0x1000}, + {0x0091, 0x0c26}, + {0x0092, 0x0073}, + {0x0093, 0x0000}, + {0x0094, 0x0080}, + {0x0098, 0x0000}, + {0x009a, 0x0000}, + {0x009b, 0x0000}, + {0x009c, 0x0000}, + {0x009d, 0x0000}, + {0x009e, 0x100c}, + {0x009f, 0x0000}, + {0x00a0, 0x0000}, + {0x00a3, 0x0002}, + {0x00a4, 0x0001}, + {0x00ae, 0x2040}, + {0x00af, 0x0000}, + {0x00b6, 0x0000}, + {0x00b7, 0x0000}, + {0x00b8, 0x0000}, + {0x00b9, 0x0002}, + {0x00be, 0x0000}, + {0x00c0, 0x0160}, + {0x00c1, 0x82a0}, + {0x00c2, 0x0000}, + {0x00d0, 0x0000}, + {0x00d1, 0x2244}, + {0x00d2, 0x3300}, + {0x00d3, 0x2200}, + {0x00d4, 0x0000}, + {0x00d9, 0x0009}, + {0x00da, 0x0000}, + {0x00db, 0x0000}, + {0x00dc, 0x00c0}, + {0x00dd, 0x2220}, + {0x00de, 0x3131}, + {0x00df, 0x3131}, + {0x00e0, 0x3131}, + {0x00e2, 0x0000}, + {0x00e3, 0x4000}, + {0x00e4, 0x0aa0}, + {0x00e5, 0x3131}, + {0x00e6, 0x3131}, + {0x00e7, 0x3131}, + {0x00e8, 0x3131}, + {0x00ea, 0xb320}, + {0x00eb, 0x0000}, + {0x00f0, 0x0000}, + {0x00f1, 0x00d0}, + {0x00f2, 0x00d0}, + {0x00f6, 0x0000}, + {0x00fa, 0x0000}, + 
{0x00fb, 0x0000}, + {0x00fc, 0x0000}, + {0x00fd, 0x0000}, + {0x00fe, 0x10ec}, + {0x00ff, 0x6530}, + {0x0100, 0xa0a0}, + {0x010b, 0x0000}, + {0x010c, 0xae00}, + {0x010d, 0xaaa0}, + {0x010e, 0x8aa2}, + {0x010f, 0x02a2}, + {0x0110, 0xc000}, + {0x0111, 0x04a2}, + {0x0112, 0x2800}, + {0x0113, 0x0000}, + {0x0117, 0x0100}, + {0x0125, 0x0410}, + {0x0132, 0x6026}, + {0x0136, 0x5555}, + {0x0138, 0x3700}, + {0x013a, 0x2000}, + {0x013b, 0x2000}, + {0x013c, 0x2005}, + {0x013f, 0x0000}, + {0x0142, 0x0000}, + {0x0145, 0x0002}, + {0x0146, 0x0000}, + {0x0147, 0x0000}, + {0x0148, 0x0000}, + {0x0149, 0x0000}, + {0x0150, 0x79a1}, + {0x0151, 0x0000}, + {0x0160, 0x4ec0}, + {0x0161, 0x0080}, + {0x0162, 0x0200}, + {0x0163, 0x0800}, + {0x0164, 0x0000}, + {0x0165, 0x0000}, + {0x0166, 0x0000}, + {0x0167, 0x000f}, + {0x0168, 0x000f}, + {0x0169, 0x0021}, + {0x0190, 0x413d}, + {0x0194, 0x0000}, + {0x0195, 0x0000}, + {0x0197, 0x0022}, + {0x0198, 0x0000}, + {0x0199, 0x0000}, + {0x01af, 0x0000}, + {0x01b0, 0x0400}, + {0x01b1, 0x0000}, + {0x01b2, 0x0000}, + {0x01b3, 0x0000}, + {0x01b4, 0x0000}, + {0x01b5, 0x0000}, + {0x01b6, 0x01c3}, + {0x01b7, 0x02a0}, + {0x01b8, 0x03e9}, + {0x01b9, 0x1389}, + {0x01ba, 0xc351}, + {0x01bb, 0x0009}, + {0x01bc, 0x0018}, + {0x01bd, 0x002a}, + {0x01be, 0x004c}, + {0x01bf, 0x0097}, + {0x01c0, 0x433d}, + {0x01c1, 0x2800}, + {0x01c2, 0x0000}, + {0x01c3, 0x0000}, + {0x01c4, 0x0000}, + {0x01c5, 0x0000}, + {0x01c6, 0x0000}, + {0x01c7, 0x0000}, + {0x01c8, 0x40af}, + {0x01c9, 0x0702}, + {0x01ca, 0x0000}, + {0x01cb, 0x0000}, + {0x01cc, 0x5757}, + {0x01cd, 0x5757}, + {0x01ce, 0x5757}, + {0x01cf, 0x5757}, + {0x01d0, 0x5757}, + {0x01d1, 0x5757}, + {0x01d2, 0x5757}, + {0x01d3, 0x5757}, + {0x01d4, 0x5757}, + {0x01d5, 0x5757}, + {0x01d6, 0x0000}, + {0x01d7, 0x0008}, + {0x01d8, 0x0029}, + {0x01d9, 0x3333}, + {0x01da, 0x0000}, + {0x01db, 0x0004}, + {0x01dc, 0x0000}, + {0x01de, 0x7c00}, + {0x01df, 0x0320}, + {0x01e0, 0x06a1}, + {0x01e1, 0x0000}, + {0x01e2, 0x0000}, + {0x01e3, 0x0000}, + {0x01e4, 0x0000}, + {0x01e6, 0x0001}, + {0x01e7, 0x0000}, + {0x01e8, 0x0000}, + {0x01ea, 0x0000}, + {0x01eb, 0x0000}, + {0x01ec, 0x0000}, + {0x01ed, 0x0000}, + {0x01ee, 0x0000}, + {0x01ef, 0x0000}, + {0x01f0, 0x0000}, + {0x01f1, 0x0000}, + {0x01f2, 0x0000}, + {0x01f3, 0x0000}, + {0x01f4, 0x0000}, + {0x0210, 0x6297}, + {0x0211, 0xa005}, + {0x0212, 0x824c}, + {0x0213, 0xf7ff}, + {0x0214, 0xf24c}, + {0x0215, 0x0102}, + {0x0216, 0x00a3}, + {0x0217, 0x0048}, + {0x0218, 0xa2c0}, + {0x0219, 0x0400}, + {0x021a, 0x00c8}, + {0x021b, 0x00c0}, + {0x021c, 0x0000}, + {0x0250, 0x4500}, + {0x0251, 0x40b3}, + {0x0252, 0x0000}, + {0x0253, 0x0000}, + {0x0254, 0x0000}, + {0x0255, 0x0000}, + {0x0256, 0x0000}, + {0x0257, 0x0000}, + {0x0258, 0x0000}, + {0x0259, 0x0000}, + {0x025a, 0x0005}, + {0x0270, 0x0000}, + {0x02ff, 0x0110}, + {0x0300, 0x001f}, + {0x0301, 0x032c}, + {0x0302, 0x5f21}, + {0x0303, 0x4000}, + {0x0304, 0x4000}, + {0x0305, 0x06d5}, + {0x0306, 0x8000}, + {0x0307, 0x0700}, + {0x0310, 0x4560}, + {0x0311, 0xa4a8}, + {0x0312, 0x7418}, + {0x0313, 0x0000}, + {0x0314, 0x0006}, + {0x0315, 0xffff}, + {0x0316, 0xc400}, + {0x0317, 0x0000}, + {0x03c0, 0x7e00}, + {0x03c1, 0x8000}, + {0x03c2, 0x8000}, + {0x03c3, 0x8000}, + {0x03c4, 0x8000}, + {0x03c5, 0x8000}, + {0x03c6, 0x8000}, + {0x03c7, 0x8000}, + {0x03c8, 0x8000}, + {0x03c9, 0x8000}, + {0x03ca, 0x8000}, + {0x03cb, 0x8000}, + {0x03cc, 0x8000}, + {0x03d0, 0x0000}, + {0x03d1, 0x0000}, + {0x03d2, 0x0000}, + {0x03d3, 0x0000}, + {0x03d4, 0x2000}, + {0x03d5, 0x2000}, + {0x03d6, 0x0000}, + {0x03d7, 
0x0000}, + {0x03d8, 0x2000}, + {0x03d9, 0x2000}, + {0x03da, 0x2000}, + {0x03db, 0x2000}, + {0x03dc, 0x0000}, + {0x03dd, 0x0000}, + {0x03de, 0x0000}, + {0x03df, 0x2000}, + {0x03e0, 0x0000}, + {0x03e1, 0x0000}, + {0x03e2, 0x0000}, + {0x03e3, 0x0000}, + {0x03e4, 0x0000}, + {0x03e5, 0x0000}, + {0x03e6, 0x0000}, + {0x03e7, 0x0000}, + {0x03e8, 0x0000}, + {0x03e9, 0x0000}, + {0x03ea, 0x0000}, + {0x03eb, 0x0000}, + {0x03ec, 0x0000}, + {0x03ed, 0x0000}, + {0x03ee, 0x0000}, + {0x03ef, 0x0000}, + {0x03f0, 0x0800}, + {0x03f1, 0x0800}, + {0x03f2, 0x0800}, + {0x03f3, 0x0800}, +}; + +static bool rt5668_volatile_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case RT5668_RESET: + case RT5668_CBJ_CTRL_2: + case RT5668_INT_ST_1: + case RT5668_4BTN_IL_CMD_1: + case RT5668_AJD1_CTRL: + case RT5668_HP_CALIB_CTRL_1: + case RT5668_DEVICE_ID: + case RT5668_I2C_MODE: + case RT5668_HP_CALIB_CTRL_10: + case RT5668_EFUSE_CTRL_2: + case RT5668_JD_TOP_VC_VTRL: + case RT5668_HP_IMP_SENS_CTRL_19: + case RT5668_IL_CMD_1: + case RT5668_SAR_IL_CMD_2: + case RT5668_SAR_IL_CMD_4: + case RT5668_SAR_IL_CMD_10: + case RT5668_SAR_IL_CMD_11: + case RT5668_EFUSE_CTRL_6...RT5668_EFUSE_CTRL_11: + case RT5668_HP_CALIB_STA_1...RT5668_HP_CALIB_STA_11: + return true; + default: + return false; + } +} + +static bool rt5668_readable_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case RT5668_RESET: + case RT5668_VERSION_ID: + case RT5668_VENDOR_ID: + case RT5668_DEVICE_ID: + case RT5668_HP_CTRL_1: + case RT5668_HP_CTRL_2: + case RT5668_HPL_GAIN: + case RT5668_HPR_GAIN: + case RT5668_I2C_CTRL: + case RT5668_CBJ_BST_CTRL: + case RT5668_CBJ_CTRL_1: + case RT5668_CBJ_CTRL_2: + case RT5668_CBJ_CTRL_3: + case RT5668_CBJ_CTRL_4: + case RT5668_CBJ_CTRL_5: + case RT5668_CBJ_CTRL_6: + case RT5668_CBJ_CTRL_7: + case RT5668_DAC1_DIG_VOL: + case RT5668_STO1_ADC_DIG_VOL: + case RT5668_STO1_ADC_BOOST: + case RT5668_HP_IMP_GAIN_1: + case RT5668_HP_IMP_GAIN_2: + case RT5668_SIDETONE_CTRL: + case RT5668_STO1_ADC_MIXER: + case RT5668_AD_DA_MIXER: + case RT5668_STO1_DAC_MIXER: + case RT5668_A_DAC1_MUX: + case RT5668_DIG_INF2_DATA: + case RT5668_REC_MIXER: + case RT5668_CAL_REC: + case RT5668_ALC_BACK_GAIN: + case RT5668_PWR_DIG_1: + case RT5668_PWR_DIG_2: + case RT5668_PWR_ANLG_1: + case RT5668_PWR_ANLG_2: + case RT5668_PWR_ANLG_3: + case RT5668_PWR_MIXER: + case RT5668_PWR_VOL: + case RT5668_CLK_DET: + case RT5668_RESET_LPF_CTRL: + case RT5668_RESET_HPF_CTRL: + case RT5668_DMIC_CTRL_1: + case RT5668_I2S1_SDP: + case RT5668_I2S2_SDP: + case RT5668_ADDA_CLK_1: + case RT5668_ADDA_CLK_2: + case RT5668_I2S1_F_DIV_CTRL_1: + case RT5668_I2S1_F_DIV_CTRL_2: + case RT5668_TDM_CTRL: + case RT5668_TDM_ADDA_CTRL_1: + case RT5668_TDM_ADDA_CTRL_2: + case RT5668_DATA_SEL_CTRL_1: + case RT5668_TDM_TCON_CTRL: + case RT5668_GLB_CLK: + case RT5668_PLL_CTRL_1: + case RT5668_PLL_CTRL_2: + case RT5668_PLL_TRACK_1: + case RT5668_PLL_TRACK_2: + case RT5668_PLL_TRACK_3: + case RT5668_PLL_TRACK_4: + case RT5668_PLL_TRACK_5: + case RT5668_PLL_TRACK_6: + case RT5668_PLL_TRACK_11: + case RT5668_SDW_REF_CLK: + case RT5668_DEPOP_1: + case RT5668_DEPOP_2: + case RT5668_HP_CHARGE_PUMP_1: + case RT5668_HP_CHARGE_PUMP_2: + case RT5668_MICBIAS_1: + case RT5668_MICBIAS_2: + case RT5668_PLL_TRACK_12: + case RT5668_PLL_TRACK_14: + case RT5668_PLL2_CTRL_1: + case RT5668_PLL2_CTRL_2: + case RT5668_PLL2_CTRL_3: + case RT5668_PLL2_CTRL_4: + case RT5668_RC_CLK_CTRL: + case RT5668_I2S_M_CLK_CTRL_1: + case RT5668_I2S2_F_DIV_CTRL_1: + case 
RT5668_I2S2_F_DIV_CTRL_2: + case RT5668_EQ_CTRL_1: + case RT5668_EQ_CTRL_2: + case RT5668_IRQ_CTRL_1: + case RT5668_IRQ_CTRL_2: + case RT5668_IRQ_CTRL_3: + case RT5668_IRQ_CTRL_4: + case RT5668_INT_ST_1: + case RT5668_GPIO_CTRL_1: + case RT5668_GPIO_CTRL_2: + case RT5668_GPIO_CTRL_3: + case RT5668_HP_AMP_DET_CTRL_1: + case RT5668_HP_AMP_DET_CTRL_2: + case RT5668_MID_HP_AMP_DET: + case RT5668_LOW_HP_AMP_DET: + case RT5668_DELAY_BUF_CTRL: + case RT5668_SV_ZCD_1: + case RT5668_SV_ZCD_2: + case RT5668_IL_CMD_1: + case RT5668_IL_CMD_2: + case RT5668_IL_CMD_3: + case RT5668_IL_CMD_4: + case RT5668_IL_CMD_5: + case RT5668_IL_CMD_6: + case RT5668_4BTN_IL_CMD_1: + case RT5668_4BTN_IL_CMD_2: + case RT5668_4BTN_IL_CMD_3: + case RT5668_4BTN_IL_CMD_4: + case RT5668_4BTN_IL_CMD_5: + case RT5668_4BTN_IL_CMD_6: + case RT5668_4BTN_IL_CMD_7: + case RT5668_ADC_STO1_HP_CTRL_1: + case RT5668_ADC_STO1_HP_CTRL_2: + case RT5668_AJD1_CTRL: + case RT5668_JD1_THD: + case RT5668_JD2_THD: + case RT5668_JD_CTRL_1: + case RT5668_DUMMY_1: + case RT5668_DUMMY_2: + case RT5668_DUMMY_3: + case RT5668_DAC_ADC_DIG_VOL1: + case RT5668_BIAS_CUR_CTRL_2: + case RT5668_BIAS_CUR_CTRL_3: + case RT5668_BIAS_CUR_CTRL_4: + case RT5668_BIAS_CUR_CTRL_5: + case RT5668_BIAS_CUR_CTRL_6: + case RT5668_BIAS_CUR_CTRL_7: + case RT5668_BIAS_CUR_CTRL_8: + case RT5668_BIAS_CUR_CTRL_9: + case RT5668_BIAS_CUR_CTRL_10: + case RT5668_VREF_REC_OP_FB_CAP_CTRL: + case RT5668_CHARGE_PUMP_1: + case RT5668_DIG_IN_CTRL_1: + case RT5668_PAD_DRIVING_CTRL: + case RT5668_SOFT_RAMP_DEPOP: + case RT5668_CHOP_DAC: + case RT5668_CHOP_ADC: + case RT5668_CALIB_ADC_CTRL: + case RT5668_VOL_TEST: + case RT5668_SPKVDD_DET_STA: + case RT5668_TEST_MODE_CTRL_1: + case RT5668_TEST_MODE_CTRL_2: + case RT5668_TEST_MODE_CTRL_3: + case RT5668_TEST_MODE_CTRL_4: + case RT5668_TEST_MODE_CTRL_5: + case RT5668_PLL1_INTERNAL: + case RT5668_PLL2_INTERNAL: + case RT5668_STO_NG2_CTRL_1: + case RT5668_STO_NG2_CTRL_2: + case RT5668_STO_NG2_CTRL_3: + case RT5668_STO_NG2_CTRL_4: + case RT5668_STO_NG2_CTRL_5: + case RT5668_STO_NG2_CTRL_6: + case RT5668_STO_NG2_CTRL_7: + case RT5668_STO_NG2_CTRL_8: + case RT5668_STO_NG2_CTRL_9: + case RT5668_STO_NG2_CTRL_10: + case RT5668_STO1_DAC_SIL_DET: + case RT5668_SIL_PSV_CTRL1: + case RT5668_SIL_PSV_CTRL2: + case RT5668_SIL_PSV_CTRL3: + case RT5668_SIL_PSV_CTRL4: + case RT5668_SIL_PSV_CTRL5: + case RT5668_HP_IMP_SENS_CTRL_01: + case RT5668_HP_IMP_SENS_CTRL_02: + case RT5668_HP_IMP_SENS_CTRL_03: + case RT5668_HP_IMP_SENS_CTRL_04: + case RT5668_HP_IMP_SENS_CTRL_05: + case RT5668_HP_IMP_SENS_CTRL_06: + case RT5668_HP_IMP_SENS_CTRL_07: + case RT5668_HP_IMP_SENS_CTRL_08: + case RT5668_HP_IMP_SENS_CTRL_09: + case RT5668_HP_IMP_SENS_CTRL_10: + case RT5668_HP_IMP_SENS_CTRL_11: + case RT5668_HP_IMP_SENS_CTRL_12: + case RT5668_HP_IMP_SENS_CTRL_13: + case RT5668_HP_IMP_SENS_CTRL_14: + case RT5668_HP_IMP_SENS_CTRL_15: + case RT5668_HP_IMP_SENS_CTRL_16: + case RT5668_HP_IMP_SENS_CTRL_17: + case RT5668_HP_IMP_SENS_CTRL_18: + case RT5668_HP_IMP_SENS_CTRL_19: + case RT5668_HP_IMP_SENS_CTRL_20: + case RT5668_HP_IMP_SENS_CTRL_21: + case RT5668_HP_IMP_SENS_CTRL_22: + case RT5668_HP_IMP_SENS_CTRL_23: + case RT5668_HP_IMP_SENS_CTRL_24: + case RT5668_HP_IMP_SENS_CTRL_25: + case RT5668_HP_IMP_SENS_CTRL_26: + case RT5668_HP_IMP_SENS_CTRL_27: + case RT5668_HP_IMP_SENS_CTRL_28: + case RT5668_HP_IMP_SENS_CTRL_29: + case RT5668_HP_IMP_SENS_CTRL_30: + case RT5668_HP_IMP_SENS_CTRL_31: + case RT5668_HP_IMP_SENS_CTRL_32: + case RT5668_HP_IMP_SENS_CTRL_33: + case 
RT5668_HP_IMP_SENS_CTRL_34: + case RT5668_HP_IMP_SENS_CTRL_35: + case RT5668_HP_IMP_SENS_CTRL_36: + case RT5668_HP_IMP_SENS_CTRL_37: + case RT5668_HP_IMP_SENS_CTRL_38: + case RT5668_HP_IMP_SENS_CTRL_39: + case RT5668_HP_IMP_SENS_CTRL_40: + case RT5668_HP_IMP_SENS_CTRL_41: + case RT5668_HP_IMP_SENS_CTRL_42: + case RT5668_HP_IMP_SENS_CTRL_43: + case RT5668_HP_LOGIC_CTRL_1: + case RT5668_HP_LOGIC_CTRL_2: + case RT5668_HP_LOGIC_CTRL_3: + case RT5668_HP_CALIB_CTRL_1: + case RT5668_HP_CALIB_CTRL_2: + case RT5668_HP_CALIB_CTRL_3: + case RT5668_HP_CALIB_CTRL_4: + case RT5668_HP_CALIB_CTRL_5: + case RT5668_HP_CALIB_CTRL_6: + case RT5668_HP_CALIB_CTRL_7: + case RT5668_HP_CALIB_CTRL_9: + case RT5668_HP_CALIB_CTRL_10: + case RT5668_HP_CALIB_CTRL_11: + case RT5668_HP_CALIB_STA_1: + case RT5668_HP_CALIB_STA_2: + case RT5668_HP_CALIB_STA_3: + case RT5668_HP_CALIB_STA_4: + case RT5668_HP_CALIB_STA_5: + case RT5668_HP_CALIB_STA_6: + case RT5668_HP_CALIB_STA_7: + case RT5668_HP_CALIB_STA_8: + case RT5668_HP_CALIB_STA_9: + case RT5668_HP_CALIB_STA_10: + case RT5668_HP_CALIB_STA_11: + case RT5668_SAR_IL_CMD_1: + case RT5668_SAR_IL_CMD_2: + case RT5668_SAR_IL_CMD_3: + case RT5668_SAR_IL_CMD_4: + case RT5668_SAR_IL_CMD_5: + case RT5668_SAR_IL_CMD_6: + case RT5668_SAR_IL_CMD_7: + case RT5668_SAR_IL_CMD_8: + case RT5668_SAR_IL_CMD_9: + case RT5668_SAR_IL_CMD_10: + case RT5668_SAR_IL_CMD_11: + case RT5668_SAR_IL_CMD_12: + case RT5668_SAR_IL_CMD_13: + case RT5668_EFUSE_CTRL_1: + case RT5668_EFUSE_CTRL_2: + case RT5668_EFUSE_CTRL_3: + case RT5668_EFUSE_CTRL_4: + case RT5668_EFUSE_CTRL_5: + case RT5668_EFUSE_CTRL_6: + case RT5668_EFUSE_CTRL_7: + case RT5668_EFUSE_CTRL_8: + case RT5668_EFUSE_CTRL_9: + case RT5668_EFUSE_CTRL_10: + case RT5668_EFUSE_CTRL_11: + case RT5668_JD_TOP_VC_VTRL: + case RT5668_DRC1_CTRL_0: + case RT5668_DRC1_CTRL_1: + case RT5668_DRC1_CTRL_2: + case RT5668_DRC1_CTRL_3: + case RT5668_DRC1_CTRL_4: + case RT5668_DRC1_CTRL_5: + case RT5668_DRC1_CTRL_6: + case RT5668_DRC1_HARD_LMT_CTRL_1: + case RT5668_DRC1_HARD_LMT_CTRL_2: + case RT5668_DRC1_PRIV_1: + case RT5668_DRC1_PRIV_2: + case RT5668_DRC1_PRIV_3: + case RT5668_DRC1_PRIV_4: + case RT5668_DRC1_PRIV_5: + case RT5668_DRC1_PRIV_6: + case RT5668_DRC1_PRIV_7: + case RT5668_DRC1_PRIV_8: + case RT5668_EQ_AUTO_RCV_CTRL1: + case RT5668_EQ_AUTO_RCV_CTRL2: + case RT5668_EQ_AUTO_RCV_CTRL3: + case RT5668_EQ_AUTO_RCV_CTRL4: + case RT5668_EQ_AUTO_RCV_CTRL5: + case RT5668_EQ_AUTO_RCV_CTRL6: + case RT5668_EQ_AUTO_RCV_CTRL7: + case RT5668_EQ_AUTO_RCV_CTRL8: + case RT5668_EQ_AUTO_RCV_CTRL9: + case RT5668_EQ_AUTO_RCV_CTRL10: + case RT5668_EQ_AUTO_RCV_CTRL11: + case RT5668_EQ_AUTO_RCV_CTRL12: + case RT5668_EQ_AUTO_RCV_CTRL13: + case RT5668_ADC_L_EQ_LPF1_A1: + case RT5668_R_EQ_LPF1_A1: + case RT5668_L_EQ_LPF1_H0: + case RT5668_R_EQ_LPF1_H0: + case RT5668_L_EQ_BPF1_A1: + case RT5668_R_EQ_BPF1_A1: + case RT5668_L_EQ_BPF1_A2: + case RT5668_R_EQ_BPF1_A2: + case RT5668_L_EQ_BPF1_H0: + case RT5668_R_EQ_BPF1_H0: + case RT5668_L_EQ_BPF2_A1: + case RT5668_R_EQ_BPF2_A1: + case RT5668_L_EQ_BPF2_A2: + case RT5668_R_EQ_BPF2_A2: + case RT5668_L_EQ_BPF2_H0: + case RT5668_R_EQ_BPF2_H0: + case RT5668_L_EQ_BPF3_A1: + case RT5668_R_EQ_BPF3_A1: + case RT5668_L_EQ_BPF3_A2: + case RT5668_R_EQ_BPF3_A2: + case RT5668_L_EQ_BPF3_H0: + case RT5668_R_EQ_BPF3_H0: + case RT5668_L_EQ_BPF4_A1: + case RT5668_R_EQ_BPF4_A1: + case RT5668_L_EQ_BPF4_A2: + case RT5668_R_EQ_BPF4_A2: + case RT5668_L_EQ_BPF4_H0: + case RT5668_R_EQ_BPF4_H0: + case RT5668_L_EQ_HPF1_A1: + case RT5668_R_EQ_HPF1_A1: + case 
RT5668_L_EQ_HPF1_H0: + case RT5668_R_EQ_HPF1_H0: + case RT5668_L_EQ_PRE_VOL: + case RT5668_R_EQ_PRE_VOL: + case RT5668_L_EQ_POST_VOL: + case RT5668_R_EQ_POST_VOL: + case RT5668_I2C_MODE: + return true; + default: + return false; + } +} + +static const DECLARE_TLV_DB_SCALE(hp_vol_tlv, -2250, 150, 0); +static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -65625, 375, 0); +static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -17625, 375, 0); +static const DECLARE_TLV_DB_SCALE(adc_bst_tlv, 0, 1200, 0); + +/* {0, +20, +24, +30, +35, +40, +44, +50, +52} dB */ +static const DECLARE_TLV_DB_RANGE(bst_tlv, + 0, 0, TLV_DB_SCALE_ITEM(0, 0, 0), + 1, 1, TLV_DB_SCALE_ITEM(2000, 0, 0), + 2, 2, TLV_DB_SCALE_ITEM(2400, 0, 0), + 3, 5, TLV_DB_SCALE_ITEM(3000, 500, 0), + 6, 6, TLV_DB_SCALE_ITEM(4400, 0, 0), + 7, 7, TLV_DB_SCALE_ITEM(5000, 0, 0), + 8, 8, TLV_DB_SCALE_ITEM(5200, 0, 0) +); + +/* Interface data select */ +static const char * const rt5668_data_select[] = { + "L/R", "R/L", "L/L", "R/R" +}; + +static SOC_ENUM_SINGLE_DECL(rt5668_if2_adc_enum, + RT5668_DIG_INF2_DATA, RT5668_IF2_ADC_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_01_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC1_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_23_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC2_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_45_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC3_SEL_SFT, rt5668_data_select); + +static SOC_ENUM_SINGLE_DECL(rt5668_if1_67_adc_enum, + RT5668_TDM_ADDA_CTRL_1, RT5668_IF1_ADC4_SEL_SFT, rt5668_data_select); + +static const struct snd_kcontrol_new rt5668_if2_adc_swap_mux = + SOC_DAPM_ENUM("IF2 ADC Swap Mux", rt5668_if2_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_01_adc_swap_mux = + SOC_DAPM_ENUM("IF1 01 ADC Swap Mux", rt5668_if1_01_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_23_adc_swap_mux = + SOC_DAPM_ENUM("IF1 23 ADC Swap Mux", rt5668_if1_23_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_45_adc_swap_mux = + SOC_DAPM_ENUM("IF1 45 ADC Swap Mux", rt5668_if1_45_adc_enum); + +static const struct snd_kcontrol_new rt5668_if1_67_adc_swap_mux = + SOC_DAPM_ENUM("IF1 67 ADC Swap Mux", rt5668_if1_67_adc_enum); + +static void rt5668_reset(struct regmap *regmap) +{ + regmap_write(regmap, RT5668_RESET, 0); + regmap_write(regmap, RT5668_I2C_MODE, 1); +} +/** + * rt5668_sel_asrc_clk_src - select ASRC clock source for a set of filters + * @component: SoC audio component device. + * @filter_mask: mask of filters. + * @clk_src: clock source + * + * The ASRC function is for asynchronous MCLK and LRCK. Also, since RT5668 can + * only support standard 32fs or 64fs i2s format, ASRC should be enabled to + * support special i2s clock format such as Intel's 100fs(100 * sampling rate). + * ASRC function will track i2s clock and generate a corresponding system clock + * for codec. This function provides an API to select the clock source for a + * set of filters specified by the mask. And the component driver will turn on + * ASRC for these filters if ASRC is selected as their clock source. 
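+ *
+ * Illustrative usage (not part of this patch): a machine driver that
+ * wants ASRC tracking I2S1 on both stereo1 paths, using the filter and
+ * source masks this function accepts, might call:
+ *
+ *	rt5668_sel_asrc_clk_src(component,
+ *				RT5668_DA_STEREO1_FILTER |
+ *				RT5668_AD_STEREO1_FILTER,
+ *				RT5668_CLK_SEL_I2S1_ASRC);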
+ */
+int rt5668_sel_asrc_clk_src(struct snd_soc_component *component,
+		unsigned int filter_mask, unsigned int clk_src)
+{
+	switch (clk_src) {
+	case RT5668_CLK_SEL_SYS:
+	case RT5668_CLK_SEL_I2S1_ASRC:
+	case RT5668_CLK_SEL_I2S2_ASRC:
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (filter_mask & RT5668_DA_STEREO1_FILTER) {
+		snd_soc_component_update_bits(component, RT5668_PLL_TRACK_2,
+			RT5668_FILTER_CLK_SEL_MASK,
+			clk_src << RT5668_FILTER_CLK_SEL_SFT);
+	}
+
+	if (filter_mask & RT5668_AD_STEREO1_FILTER) {
+		snd_soc_component_update_bits(component, RT5668_PLL_TRACK_3,
+			RT5668_FILTER_CLK_SEL_MASK,
+			clk_src << RT5668_FILTER_CLK_SEL_SFT);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rt5668_sel_asrc_clk_src);
+
+static int rt5668_button_detect(struct snd_soc_component *component)
+{
+	int btn_type, val;
+
+	val = snd_soc_component_read32(component, RT5668_4BTN_IL_CMD_1);
+	btn_type = val & 0xfff0;
+	snd_soc_component_write(component, RT5668_4BTN_IL_CMD_1, val);
+	pr_debug("%s btn_type=%x\n", __func__, btn_type);
+
+	return btn_type;
+}
+
+static void rt5668_enable_push_button_irq(struct snd_soc_component *component,
+		bool enable)
+{
+	if (enable) {
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_1,
+			RT5668_SAR_BUTT_DET_MASK, RT5668_SAR_BUTT_DET_EN);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_13,
+			RT5668_SAR_SOUR_MASK, RT5668_SAR_SOUR_BTN);
+		snd_soc_component_write(component, RT5668_IL_CMD_1, 0x0040);
+		snd_soc_component_update_bits(component, RT5668_4BTN_IL_CMD_2,
+			RT5668_4BTN_IL_MASK | RT5668_4BTN_IL_RST_MASK,
+			RT5668_4BTN_IL_EN | RT5668_4BTN_IL_NOR);
+		snd_soc_component_update_bits(component, RT5668_IRQ_CTRL_3,
+			RT5668_IL_IRQ_MASK, RT5668_IL_IRQ_EN);
+	} else {
+		snd_soc_component_update_bits(component, RT5668_IRQ_CTRL_3,
+			RT5668_IL_IRQ_MASK, RT5668_IL_IRQ_DIS);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_1,
+			RT5668_SAR_BUTT_DET_MASK, RT5668_SAR_BUTT_DET_DIS);
+		snd_soc_component_update_bits(component, RT5668_4BTN_IL_CMD_2,
+			RT5668_4BTN_IL_MASK, RT5668_4BTN_IL_DIS);
+		snd_soc_component_update_bits(component, RT5668_4BTN_IL_CMD_2,
+			RT5668_4BTN_IL_RST_MASK, RT5668_4BTN_IL_RST);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_13,
+			RT5668_SAR_SOUR_MASK, RT5668_SAR_SOUR_TYPE);
+	}
+}
+
+/**
+ * rt5668_headset_detect - Detect headset.
+ * @component: SoC audio component device.
+ * @jack_insert: Jack insert or not.
+ *
+ * Detect whether the plug is a headset (with a microphone) or a plain
+ * headphone after a jack has been inserted.
+ *
+ * Return: the detected jack type.
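+ *
+ * Note: the CBJ jack-type field is polled up to 50 times with a 10-15 ms
+ * sleep per attempt, so detection may take roughly half a second before
+ * an unrecognized type falls back to SND_JACK_HEADPHONE.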
+ */
+static int rt5668_headset_detect(struct snd_soc_component *component,
+		int jack_insert)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_component_get_dapm(component);
+	unsigned int val, count;
+
+	if (jack_insert) {
+		snd_soc_dapm_force_enable_pin(dapm, "CBJ Power");
+		snd_soc_dapm_sync(dapm);
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_1,
+			RT5668_TRIG_JD_MASK, RT5668_TRIG_JD_HIGH);
+
+		count = 0;
+		val = snd_soc_component_read32(component, RT5668_CBJ_CTRL_2)
+			& RT5668_JACK_TYPE_MASK;
+		while (val == 0 && count < 50) {
+			usleep_range(10000, 15000);
+			val = snd_soc_component_read32(component,
+				RT5668_CBJ_CTRL_2) & RT5668_JACK_TYPE_MASK;
+			count++;
+		}
+
+		switch (val) {
+		case 0x1:
+		case 0x2:
+			rt5668->jack_type = SND_JACK_HEADSET;
+			rt5668_enable_push_button_irq(component, true);
+			break;
+		default:
+			rt5668->jack_type = SND_JACK_HEADPHONE;
+		}
+
+	} else {
+		rt5668_enable_push_button_irq(component, false);
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_1,
+			RT5668_TRIG_JD_MASK, RT5668_TRIG_JD_LOW);
+		snd_soc_dapm_disable_pin(dapm, "CBJ Power");
+		snd_soc_dapm_sync(dapm);
+
+		rt5668->jack_type = 0;
+	}
+
+	dev_dbg(component->dev, "jack_type = %d\n", rt5668->jack_type);
+	return rt5668->jack_type;
+}
+
+static irqreturn_t rt5668_irq(int irq, void *data)
+{
+	struct rt5668_priv *rt5668 = data;
+
+	mod_delayed_work(system_power_efficient_wq,
+		&rt5668->jack_detect_work, msecs_to_jiffies(250));
+
+	return IRQ_HANDLED;
+}
+
+static void rt5668_jd_check_handler(struct work_struct *work)
+{
+	struct rt5668_priv *rt5668 = container_of(work, struct rt5668_priv,
+		jd_check_work.work);
+
+	if (snd_soc_component_read32(rt5668->component, RT5668_AJD1_CTRL)
+		& RT5668_JDH_RS_MASK) {
+		/* jack out */
+		rt5668->jack_type = rt5668_headset_detect(rt5668->component, 0);
+
+		snd_soc_jack_report(rt5668->hs_jack, rt5668->jack_type,
+			SND_JACK_HEADSET |
+			SND_JACK_BTN_0 | SND_JACK_BTN_1 |
+			SND_JACK_BTN_2 | SND_JACK_BTN_3);
+	} else {
+		schedule_delayed_work(&rt5668->jd_check_work,
+			msecs_to_jiffies(500));
+	}
+}
+
+static int rt5668_set_jack_detect(struct snd_soc_component *component,
+	struct snd_soc_jack *hs_jack, void *data)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+
+	switch (rt5668->pdata.jd_src) {
+	case RT5668_JD1:
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_2,
+			RT5668_EXT_JD_SRC, RT5668_EXT_JD_SRC_MANUAL);
+		snd_soc_component_write(component, RT5668_CBJ_CTRL_1, 0xd002);
+		snd_soc_component_update_bits(component, RT5668_CBJ_CTRL_3,
+			RT5668_CBJ_IN_BUF_EN, RT5668_CBJ_IN_BUF_EN);
+		snd_soc_component_update_bits(component, RT5668_SAR_IL_CMD_1,
+			RT5668_SAR_POW_MASK, RT5668_SAR_POW_EN);
+		regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+			RT5668_GP1_PIN_MASK, RT5668_GP1_PIN_IRQ);
+		regmap_update_bits(rt5668->regmap, RT5668_RC_CLK_CTRL,
+			RT5668_POW_IRQ | RT5668_POW_JDH |
+			RT5668_POW_ANA, RT5668_POW_IRQ |
+			RT5668_POW_JDH | RT5668_POW_ANA);
+		regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_2,
+			RT5668_PWR_JDH | RT5668_PWR_JDL,
+			RT5668_PWR_JDH | RT5668_PWR_JDL);
+		regmap_update_bits(rt5668->regmap, RT5668_IRQ_CTRL_2,
+			RT5668_JD1_EN_MASK | RT5668_JD1_POL_MASK,
+			RT5668_JD1_EN | RT5668_JD1_POL_NOR);
+		mod_delayed_work(system_power_efficient_wq,
+			&rt5668->jack_detect_work, msecs_to_jiffies(250));
+		break;
+
+	case RT5668_JD_NULL:
+		regmap_update_bits(rt5668->regmap, RT5668_IRQ_CTRL_2,
+			RT5668_JD1_EN_MASK, RT5668_JD1_DIS);
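+		/*
+		 * No JD source is used here, so clear the JD power bits in
+		 * the RC clock control as well.
+		 */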
+		regmap_update_bits(rt5668->regmap, RT5668_RC_CLK_CTRL,
+			RT5668_POW_JDH | RT5668_POW_JDL, 0);
+		break;
+
+	default:
+		dev_warn(component->dev, "Wrong JD source\n");
+		break;
+	}
+
+	rt5668->hs_jack = hs_jack;
+
+	return 0;
+}
+
+static void rt5668_jack_detect_handler(struct work_struct *work)
+{
+	struct rt5668_priv *rt5668 =
+		container_of(work, struct rt5668_priv, jack_detect_work.work);
+	int val, btn_type;
+
+	while (!rt5668->component)
+		usleep_range(10000, 15000);
+
+	while (!rt5668->component->card->instantiated)
+		usleep_range(10000, 15000);
+
+	mutex_lock(&rt5668->calibrate_mutex);
+
+	val = snd_soc_component_read32(rt5668->component, RT5668_AJD1_CTRL)
+		& RT5668_JDH_RS_MASK;
+	if (!val) {
+		/* jack in */
+		if (rt5668->jack_type == 0) {
+			/* jack was out, report jack type */
+			rt5668->jack_type =
+				rt5668_headset_detect(rt5668->component, 1);
+		} else {
+			/* jack is already in, report button event */
+			rt5668->jack_type = SND_JACK_HEADSET;
+			btn_type = rt5668_button_detect(rt5668->component);
+			/*
+			 * rt5668 can report three kinds of button behavior:
+			 * one click, double click and hold. However, we
+			 * currently only report button pressed/released
+			 * events, so all three behaviors are treated as a
+			 * button press.
+			 */
+			switch (btn_type) {
+			case 0x8000:
+			case 0x4000:
+			case 0x2000:
+				rt5668->jack_type |= SND_JACK_BTN_0;
+				break;
+			case 0x1000:
+			case 0x0800:
+			case 0x0400:
+				rt5668->jack_type |= SND_JACK_BTN_1;
+				break;
+			case 0x0200:
+			case 0x0100:
+			case 0x0080:
+				rt5668->jack_type |= SND_JACK_BTN_2;
+				break;
+			case 0x0040:
+			case 0x0020:
+			case 0x0010:
+				rt5668->jack_type |= SND_JACK_BTN_3;
+				break;
+			case 0x0000: /* unpressed */
+				break;
+			default:
+				dev_err(rt5668->component->dev,
+					"Unexpected button code 0x%04x\n",
+					btn_type);
+				btn_type = 0;
+				break;
+			}
+		}
+	} else {
+		/* jack out */
+		rt5668->jack_type = rt5668_headset_detect(rt5668->component, 0);
+	}
+
+	snd_soc_jack_report(rt5668->hs_jack, rt5668->jack_type,
+		SND_JACK_HEADSET |
+		SND_JACK_BTN_0 | SND_JACK_BTN_1 |
+		SND_JACK_BTN_2 | SND_JACK_BTN_3);
+
+	if (rt5668->jack_type & (SND_JACK_BTN_0 | SND_JACK_BTN_1 |
+		SND_JACK_BTN_2 | SND_JACK_BTN_3))
+		schedule_delayed_work(&rt5668->jd_check_work, 0);
+	else
+		cancel_delayed_work_sync(&rt5668->jd_check_work);
+
+	mutex_unlock(&rt5668->calibrate_mutex);
+}
+
+static const struct snd_kcontrol_new rt5668_snd_controls[] = {
+	/* Headphone Output Volume */
+	SOC_DOUBLE_R_TLV("Headphone Playback Volume", RT5668_HPL_GAIN,
+		RT5668_HPR_GAIN, RT5668_G_HP_SFT, 15, 1, hp_vol_tlv),
+
+	/* DAC Digital Volume */
+	SOC_DOUBLE_TLV("DAC1 Playback Volume", RT5668_DAC1_DIG_VOL,
+		RT5668_L_VOL_SFT, RT5668_R_VOL_SFT, 175, 0, dac_vol_tlv),
+
+	/* IN Boost Volume */
+	SOC_SINGLE_TLV("CBJ Boost Volume", RT5668_CBJ_BST_CTRL,
+		RT5668_BST_CBJ_SFT, 8, 0, bst_tlv),
+
+	/* ADC Digital Volume Control */
+	SOC_DOUBLE("STO1 ADC Capture Switch", RT5668_STO1_ADC_DIG_VOL,
+		RT5668_L_MUTE_SFT, RT5668_R_MUTE_SFT, 1, 1),
+	SOC_DOUBLE_TLV("STO1 ADC Capture Volume", RT5668_STO1_ADC_DIG_VOL,
+		RT5668_L_VOL_SFT, RT5668_R_VOL_SFT, 127, 0, adc_vol_tlv),
+
+	/* ADC Boost Volume Control */
+	SOC_DOUBLE_TLV("STO1 ADC Boost Gain Volume", RT5668_STO1_ADC_BOOST,
+		RT5668_STO1_ADC_L_BST_SFT, RT5668_STO1_ADC_R_BST_SFT,
+		3, 0, adc_bst_tlv),
+};
+
+static int rt5668_div_sel(struct rt5668_priv *rt5668,
+			  int target, const int div[], int size)
+{
+	int i;
+
+	if (rt5668->sysclk < target) {
+		pr_err("sysclk rate %d is too low\n",
+			rt5668->sysclk);
+		return 0;
+	}
+
+	for (i = 0; i < size - 1; i++) {
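+		/*
+		 * Use the first divider for which target * div[i] matches
+		 * sysclk exactly; report an error once the next divider
+		 * would already overshoot sysclk.
+		 */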
pr_info("div[%d]=%d\n", i, div[i]); + if (target * div[i] == rt5668->sysclk) + return i; + if (target * div[i + 1] > rt5668->sysclk) { + pr_err("can't find div for sysclk %d\n", + rt5668->sysclk); + return i; + } + } + + if (target * div[i] < rt5668->sysclk) + pr_err("sysclk rate %d is too high\n", + rt5668->sysclk); + + return size - 1; + +} + +/** + * set_dmic_clk - Set parameter of dmic. + * + * @w: DAPM widget. + * @kcontrol: The kcontrol of this widget. + * @event: Event id. + * + * Choose dmic clock between 1MHz and 3MHz. + * It is better for clock to approximate 3MHz. + */ +static int set_dmic_clk(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + int idx = -EINVAL; + static const int div[] = {2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128}; + + idx = rt5668_div_sel(rt5668, 1500000, div, ARRAY_SIZE(div)); + + snd_soc_component_update_bits(component, RT5668_DMIC_CTRL_1, + RT5668_DMIC_CLK_MASK, idx << RT5668_DMIC_CLK_SFT); + + return 0; +} + +static int set_filter_clk(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + int ref, val, reg, idx = -EINVAL; + static const int div[] = {1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48}; + + val = snd_soc_component_read32(component, RT5668_GPIO_CTRL_1) && + RT5668_GP4_PIN_MASK; + if (w->shift == RT5668_PWR_ADC_S1F_BIT && + val == RT5668_GP4_PIN_ADCDAT2) + ref = 256 * rt5668->lrck[RT5668_AIF2]; + else + ref = 256 * rt5668->lrck[RT5668_AIF1]; + + idx = rt5668_div_sel(rt5668, ref, div, ARRAY_SIZE(div)); + + if (w->shift == RT5668_PWR_ADC_S1F_BIT) + reg = RT5668_PLL_TRACK_3; + else + reg = RT5668_PLL_TRACK_2; + + snd_soc_component_update_bits(component, reg, + RT5668_FILTER_CLK_SEL_MASK, idx << RT5668_FILTER_CLK_SEL_SFT); + + return 0; +} + +static int is_sys_clk_from_pll1(struct snd_soc_dapm_widget *w, + struct snd_soc_dapm_widget *sink) +{ + unsigned int val; + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + + val = snd_soc_component_read32(component, RT5668_GLB_CLK); + val &= RT5668_SCLK_SRC_MASK; + if (val == RT5668_SCLK_SRC_PLL1) + return 1; + else + return 0; +} + +static int is_using_asrc(struct snd_soc_dapm_widget *w, + struct snd_soc_dapm_widget *sink) +{ + unsigned int reg, shift, val; + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + + switch (w->shift) { + case RT5668_ADC_STO1_ASRC_SFT: + reg = RT5668_PLL_TRACK_3; + shift = RT5668_FILTER_CLK_SEL_SFT; + break; + case RT5668_DAC_STO1_ASRC_SFT: + reg = RT5668_PLL_TRACK_2; + shift = RT5668_FILTER_CLK_SEL_SFT; + break; + default: + return 0; + } + + val = (snd_soc_component_read32(component, reg) >> shift) & 0xf; + switch (val) { + case RT5668_CLK_SEL_I2S1_ASRC: + case RT5668_CLK_SEL_I2S2_ASRC: + return 1; + default: + return 0; + } + +} + +/* Digital Mixer */ +static const struct snd_kcontrol_new rt5668_sto1_adc_l_mix[] = { + SOC_DAPM_SINGLE("ADC1 Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_L1_SFT, 1, 1), + SOC_DAPM_SINGLE("ADC2 Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_L2_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_sto1_adc_r_mix[] = { + SOC_DAPM_SINGLE("ADC1 Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_R1_SFT, 1, 1), + SOC_DAPM_SINGLE("ADC2 
Switch", RT5668_STO1_ADC_MIXER, + RT5668_M_STO1_ADC_R2_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_dac_l_mix[] = { + SOC_DAPM_SINGLE("Stereo ADC Switch", RT5668_AD_DA_MIXER, + RT5668_M_ADCMIX_L_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC1 Switch", RT5668_AD_DA_MIXER, + RT5668_M_DAC1_L_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_dac_r_mix[] = { + SOC_DAPM_SINGLE("Stereo ADC Switch", RT5668_AD_DA_MIXER, + RT5668_M_ADCMIX_R_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC1 Switch", RT5668_AD_DA_MIXER, + RT5668_M_DAC1_R_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_sto1_dac_l_mix[] = { + SOC_DAPM_SINGLE("DAC L1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_L1_STO_L_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC R1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_R1_STO_L_SFT, 1, 1), +}; + +static const struct snd_kcontrol_new rt5668_sto1_dac_r_mix[] = { + SOC_DAPM_SINGLE("DAC L1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_L1_STO_R_SFT, 1, 1), + SOC_DAPM_SINGLE("DAC R1 Switch", RT5668_STO1_DAC_MIXER, + RT5668_M_DAC_R1_STO_R_SFT, 1, 1), +}; + +/* Analog Input Mixer */ +static const struct snd_kcontrol_new rt5668_rec1_l_mix[] = { + SOC_DAPM_SINGLE("CBJ Switch", RT5668_REC_MIXER, + RT5668_M_CBJ_RM1_L_SFT, 1, 1), +}; + +/* STO1 ADC1 Source */ +/* MX-26 [13] [5] */ +static const char * const rt5668_sto1_adc1_src[] = { + "DAC MIX", "ADC" +}; + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc1l_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC1L_SRC_SFT, rt5668_sto1_adc1_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc1l_mux = + SOC_DAPM_ENUM("Stereo1 ADC1L Source", rt5668_sto1_adc1l_enum); + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc1r_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC1R_SRC_SFT, rt5668_sto1_adc1_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc1r_mux = + SOC_DAPM_ENUM("Stereo1 ADC1L Source", rt5668_sto1_adc1r_enum); + +/* STO1 ADC Source */ +/* MX-26 [11:10] [3:2] */ +static const char * const rt5668_sto1_adc_src[] = { + "ADC1 L", "ADC1 R" +}; + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adcl_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADCL_SRC_SFT, rt5668_sto1_adc_src); + +static const struct snd_kcontrol_new rt5668_sto1_adcl_mux = + SOC_DAPM_ENUM("Stereo1 ADCL Source", rt5668_sto1_adcl_enum); + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adcr_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADCR_SRC_SFT, rt5668_sto1_adc_src); + +static const struct snd_kcontrol_new rt5668_sto1_adcr_mux = + SOC_DAPM_ENUM("Stereo1 ADCR Source", rt5668_sto1_adcr_enum); + +/* STO1 ADC2 Source */ +/* MX-26 [12] [4] */ +static const char * const rt5668_sto1_adc2_src[] = { + "DAC MIX", "DMIC" +}; + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc2l_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC2L_SRC_SFT, rt5668_sto1_adc2_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc2l_mux = + SOC_DAPM_ENUM("Stereo1 ADC2L Source", rt5668_sto1_adc2l_enum); + +static SOC_ENUM_SINGLE_DECL( + rt5668_sto1_adc2r_enum, RT5668_STO1_ADC_MIXER, + RT5668_STO1_ADC2R_SRC_SFT, rt5668_sto1_adc2_src); + +static const struct snd_kcontrol_new rt5668_sto1_adc2r_mux = + SOC_DAPM_ENUM("Stereo1 ADC2R Source", rt5668_sto1_adc2r_enum); + +/* MX-79 [6:4] I2S1 ADC data location */ +static const unsigned int rt5668_if1_adc_slot_values[] = { + 0, + 2, + 4, + 6, +}; + +static const char * const rt5668_if1_adc_slot_src[] = { + "Slot 0", "Slot 2", "Slot 4", "Slot 6" +}; + +static SOC_VALUE_ENUM_SINGLE_DECL(rt5668_if1_adc_slot_enum, + RT5668_TDM_CTRL, RT5668_TDM_ADC_LCA_SFT, RT5668_TDM_ADC_LCA_MASK, 
+	rt5668_if1_adc_slot_src, rt5668_if1_adc_slot_values);
+
+static const struct snd_kcontrol_new rt5668_if1_adc_slot_mux =
+	SOC_DAPM_ENUM("IF1 ADC Slot location", rt5668_if1_adc_slot_enum);
+
+/* Analog DAC L1 Source, Analog DAC R1 Source */
+/* MX-2B [4], MX-2B [0] */
+static const char * const rt5668_alg_dac1_src[] = {
+	"Stereo1 DAC Mixer", "DAC1"
+};
+
+static SOC_ENUM_SINGLE_DECL(
+	rt5668_alg_dac_l1_enum, RT5668_A_DAC1_MUX,
+	RT5668_A_DACL1_SFT, rt5668_alg_dac1_src);
+
+static const struct snd_kcontrol_new rt5668_alg_dac_l1_mux =
+	SOC_DAPM_ENUM("Analog DAC L1 Source", rt5668_alg_dac_l1_enum);
+
+static SOC_ENUM_SINGLE_DECL(
+	rt5668_alg_dac_r1_enum, RT5668_A_DAC1_MUX,
+	RT5668_A_DACR1_SFT, rt5668_alg_dac1_src);
+
+static const struct snd_kcontrol_new rt5668_alg_dac_r1_mux =
+	SOC_DAPM_ENUM("Analog DAC R1 Source", rt5668_alg_dac_r1_enum);
+
+/* Out Switch */
+static const struct snd_kcontrol_new hpol_switch =
+	SOC_DAPM_SINGLE_AUTODISABLE("Switch", RT5668_HP_CTRL_1,
+		RT5668_L_MUTE_SFT, 1, 1);
+static const struct snd_kcontrol_new hpor_switch =
+	SOC_DAPM_SINGLE_AUTODISABLE("Switch", RT5668_HP_CTRL_1,
+		RT5668_R_MUTE_SFT, 1, 1);
+
+static int rt5668_hp_event(struct snd_soc_dapm_widget *w,
+	struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component =
+		snd_soc_dapm_to_component(w->dapm);
+
+	switch (event) {
+	case SND_SOC_DAPM_PRE_PMU:
+		snd_soc_component_write(component,
+			RT5668_HP_LOGIC_CTRL_2, 0x0012);
+		snd_soc_component_write(component,
+			RT5668_HP_CTRL_2, 0x6000);
+		snd_soc_component_update_bits(component, RT5668_STO_NG2_CTRL_1,
+			RT5668_NG2_EN_MASK, RT5668_NG2_EN);
+		snd_soc_component_update_bits(component,
+			RT5668_DEPOP_1, 0x60, 0x60);
+		break;
+
+	case SND_SOC_DAPM_POST_PMD:
+		snd_soc_component_update_bits(component,
+			RT5668_DEPOP_1, 0x60, 0x0);
+		snd_soc_component_write(component,
+			RT5668_HP_CTRL_2, 0x0000);
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static int set_dmic_power(struct snd_soc_dapm_widget *w,
+	struct snd_kcontrol *kcontrol, int event)
+{
+	switch (event) {
+	case SND_SOC_DAPM_POST_PMU:
+		/* Add a delay to avoid pop noise */
+		msleep(150);
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static int rt5668_set_vref(struct snd_soc_dapm_widget *w,
+	struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component =
+		snd_soc_dapm_to_component(w->dapm);
+
+	switch (event) {
+	case SND_SOC_DAPM_PRE_PMU:
+		switch (w->shift) {
+		case RT5668_PWR_VREF1_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV1, 0);
+			break;
+
+		case RT5668_PWR_VREF2_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV2, 0);
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	case SND_SOC_DAPM_POST_PMU:
+		usleep_range(15000, 20000);
+		switch (w->shift) {
+		case RT5668_PWR_VREF1_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV1,
+				RT5668_PWR_FV1);
+			break;
+
+		case RT5668_PWR_VREF2_BIT:
+			snd_soc_component_update_bits(component,
+				RT5668_PWR_ANLG_1, RT5668_PWR_FV2,
+				RT5668_PWR_FV2);
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static const unsigned int rt5668_adcdat_pin_values[] = {
+	1,
+	3,
+};
+
+static const char * const rt5668_adcdat_pin_select[] = {
+	"ADCDAT1",
+	"ADCDAT2",
+};
+
+static SOC_VALUE_ENUM_SINGLE_DECL(rt5668_adcdat_pin_enum,
+	RT5668_GPIO_CTRL_1, RT5668_GP4_PIN_SFT, RT5668_GP4_PIN_MASK,
+	rt5668_adcdat_pin_select, rt5668_adcdat_pin_values);
+
+static const struct snd_kcontrol_new rt5668_adcdat_pin_ctrl =
+	SOC_DAPM_ENUM("ADCDAT", rt5668_adcdat_pin_enum);
+
+static const struct snd_soc_dapm_widget rt5668_dapm_widgets[] = {
+	SND_SOC_DAPM_SUPPLY("LDO2", RT5668_PWR_ANLG_3, RT5668_PWR_LDO2_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("PLL1", RT5668_PWR_ANLG_3, RT5668_PWR_PLL_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("PLL2B", RT5668_PWR_ANLG_3, RT5668_PWR_PLL2B_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("PLL2F", RT5668_PWR_ANLG_3, RT5668_PWR_PLL2F_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("Vref1", RT5668_PWR_ANLG_1, RT5668_PWR_VREF1_BIT, 0,
+		rt5668_set_vref, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU),
+	SND_SOC_DAPM_SUPPLY("Vref2", RT5668_PWR_ANLG_1, RT5668_PWR_VREF2_BIT, 0,
+		rt5668_set_vref, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU),
+
+	/* ASRC */
+	SND_SOC_DAPM_SUPPLY_S("DAC STO1 ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_DAC_STO1_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("ADC STO1 ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_ADC_STO1_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("AD ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_AD_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("DA ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_DA_ASRC_SFT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY_S("DMIC ASRC", 1, RT5668_PLL_TRACK_1,
+		RT5668_DMIC_ASRC_SFT, 0, NULL, 0),
+
+	/* Input Side */
+	SND_SOC_DAPM_SUPPLY("MICBIAS1", RT5668_PWR_ANLG_2, RT5668_PWR_MB1_BIT,
+		0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("MICBIAS2", RT5668_PWR_ANLG_2, RT5668_PWR_MB2_BIT,
+		0, NULL, 0),
+
+	/* Input Lines */
+	SND_SOC_DAPM_INPUT("DMIC L1"),
+	SND_SOC_DAPM_INPUT("DMIC R1"),
+
+	SND_SOC_DAPM_INPUT("IN1P"),
+
+	SND_SOC_DAPM_SUPPLY("DMIC CLK", SND_SOC_NOPM, 0, 0,
+		set_dmic_clk, SND_SOC_DAPM_PRE_PMU),
+	SND_SOC_DAPM_SUPPLY("DMIC1 Power", RT5668_DMIC_CTRL_1,
+		RT5668_DMIC_1_EN_SFT, 0, set_dmic_power, SND_SOC_DAPM_POST_PMU),
+
+	/* Boost */
+	SND_SOC_DAPM_PGA("BST1 CBJ", SND_SOC_NOPM,
+		0, 0, NULL, 0),
+
+	SND_SOC_DAPM_SUPPLY("CBJ Power", RT5668_PWR_ANLG_3,
+		RT5668_PWR_CBJ_BIT, 0, NULL, 0),
+
+	/* REC Mixer */
+	SND_SOC_DAPM_MIXER("RECMIX1L", SND_SOC_NOPM, 0, 0, rt5668_rec1_l_mix,
+		ARRAY_SIZE(rt5668_rec1_l_mix)),
+	SND_SOC_DAPM_SUPPLY("RECMIX1L Power", RT5668_PWR_ANLG_2,
+		RT5668_PWR_RM1_L_BIT, 0, NULL, 0),
+
+	/* ADCs */
+	SND_SOC_DAPM_ADC("ADC1 L", NULL, SND_SOC_NOPM, 0, 0),
+	SND_SOC_DAPM_ADC("ADC1 R", NULL, SND_SOC_NOPM, 0, 0),
+
+	SND_SOC_DAPM_SUPPLY("ADC1 L Power", RT5668_PWR_DIG_1,
+		RT5668_PWR_ADC_L1_BIT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("ADC1 R Power", RT5668_PWR_DIG_1,
+		RT5668_PWR_ADC_R1_BIT, 0, NULL, 0),
+	SND_SOC_DAPM_SUPPLY("ADC1 clock", RT5668_CHOP_ADC,
+		RT5668_CKGEN_ADC1_SFT, 0, NULL, 0),
+
+	/* ADC Mux */
+	SND_SOC_DAPM_MUX("Stereo1 ADC L1 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc1l_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC R1 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc1r_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC L2 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc2l_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC R2 Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adc2r_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC L Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adcl_mux),
+	SND_SOC_DAPM_MUX("Stereo1 ADC R Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_sto1_adcr_mux),
+	SND_SOC_DAPM_MUX("IF1_ADC Mux", SND_SOC_NOPM, 0, 0,
+		&rt5668_if1_adc_slot_mux),
+
+	/* ADC Mixer */
+	SND_SOC_DAPM_SUPPLY("ADC Stereo1 Filter", RT5668_PWR_DIG_2,
+		RT5668_PWR_ADC_S1F_BIT, 0, set_filter_clk,
+		SND_SOC_DAPM_PRE_PMU),
+	SND_SOC_DAPM_MIXER("Stereo1 ADC MIXL", RT5668_STO1_ADC_DIG_VOL,
+		RT5668_L_MUTE_SFT, 1, rt5668_sto1_adc_l_mix, +
ARRAY_SIZE(rt5668_sto1_adc_l_mix)), + SND_SOC_DAPM_MIXER("Stereo1 ADC MIXR", RT5668_STO1_ADC_DIG_VOL, + RT5668_R_MUTE_SFT, 1, rt5668_sto1_adc_r_mix, + ARRAY_SIZE(rt5668_sto1_adc_r_mix)), + + /* ADC PGA */ + SND_SOC_DAPM_PGA("Stereo1 ADC MIX", SND_SOC_NOPM, 0, 0, NULL, 0), + + /* Digital Interface */ + SND_SOC_DAPM_SUPPLY("I2S1", RT5668_PWR_DIG_1, RT5668_PWR_I2S1_BIT, + 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("I2S2", RT5668_PWR_DIG_1, RT5668_PWR_I2S2_BIT, + 0, NULL, 0), + SND_SOC_DAPM_PGA("IF1 DAC1", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF1 DAC1 L", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF1 DAC1 R", SND_SOC_NOPM, 0, 0, NULL, 0), + + /* Digital Interface Select */ + SND_SOC_DAPM_MUX("IF1 01 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_01_adc_swap_mux), + SND_SOC_DAPM_MUX("IF1 23 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_23_adc_swap_mux), + SND_SOC_DAPM_MUX("IF1 45 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_45_adc_swap_mux), + SND_SOC_DAPM_MUX("IF1 67 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if1_67_adc_swap_mux), + SND_SOC_DAPM_MUX("IF2 ADC Swap Mux", SND_SOC_NOPM, 0, 0, + &rt5668_if2_adc_swap_mux), + + SND_SOC_DAPM_MUX("ADCDAT Mux", SND_SOC_NOPM, 0, 0, + &rt5668_adcdat_pin_ctrl), + + /* Audio Interface */ + SND_SOC_DAPM_AIF_OUT("AIF1TX", "AIF1 Capture", 0, + RT5668_I2S1_SDP, RT5668_SEL_ADCDAT_SFT, 1), + SND_SOC_DAPM_AIF_OUT("AIF2TX", "AIF2 Capture", 0, + RT5668_I2S2_SDP, RT5668_I2S2_PIN_CFG_SFT, 1), + SND_SOC_DAPM_AIF_IN("AIF1RX", "AIF1 Playback", 0, SND_SOC_NOPM, 0, 0), + + /* Output Side */ + /* DAC mixer before sound effect */ + SND_SOC_DAPM_MIXER("DAC1 MIXL", SND_SOC_NOPM, 0, 0, + rt5668_dac_l_mix, ARRAY_SIZE(rt5668_dac_l_mix)), + SND_SOC_DAPM_MIXER("DAC1 MIXR", SND_SOC_NOPM, 0, 0, + rt5668_dac_r_mix, ARRAY_SIZE(rt5668_dac_r_mix)), + + /* DAC channel Mux */ + SND_SOC_DAPM_MUX("DAC L1 Source", SND_SOC_NOPM, 0, 0, + &rt5668_alg_dac_l1_mux), + SND_SOC_DAPM_MUX("DAC R1 Source", SND_SOC_NOPM, 0, 0, + &rt5668_alg_dac_r1_mux), + + /* DAC Mixer */ + SND_SOC_DAPM_SUPPLY("DAC Stereo1 Filter", RT5668_PWR_DIG_2, + RT5668_PWR_DAC_S1F_BIT, 0, set_filter_clk, + SND_SOC_DAPM_PRE_PMU), + SND_SOC_DAPM_MIXER("Stereo1 DAC MIXL", SND_SOC_NOPM, 0, 0, + rt5668_sto1_dac_l_mix, ARRAY_SIZE(rt5668_sto1_dac_l_mix)), + SND_SOC_DAPM_MIXER("Stereo1 DAC MIXR", SND_SOC_NOPM, 0, 0, + rt5668_sto1_dac_r_mix, ARRAY_SIZE(rt5668_sto1_dac_r_mix)), + + /* DACs */ + SND_SOC_DAPM_DAC("DAC L1", NULL, RT5668_PWR_DIG_1, + RT5668_PWR_DAC_L1_BIT, 0), + SND_SOC_DAPM_DAC("DAC R1", NULL, RT5668_PWR_DIG_1, + RT5668_PWR_DAC_R1_BIT, 0), + SND_SOC_DAPM_SUPPLY_S("DAC 1 Clock", 3, RT5668_CHOP_DAC, + RT5668_CKGEN_DAC1_SFT, 0, NULL, 0), + + /* HPO */ + SND_SOC_DAPM_PGA_S("HP Amp", 1, SND_SOC_NOPM, 0, 0, rt5668_hp_event, + SND_SOC_DAPM_POST_PMD | SND_SOC_DAPM_PRE_PMU), + + SND_SOC_DAPM_SUPPLY("HP Amp L", RT5668_PWR_ANLG_1, + RT5668_PWR_HA_L_BIT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("HP Amp R", RT5668_PWR_ANLG_1, + RT5668_PWR_HA_R_BIT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY_S("Charge Pump", 1, RT5668_DEPOP_1, + RT5668_PUMP_EN_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY_S("Capless", 2, RT5668_DEPOP_1, + RT5668_CAPLESS_EN_SFT, 0, NULL, 0), + + SND_SOC_DAPM_SWITCH("HPOL Playback", SND_SOC_NOPM, 0, 0, + &hpol_switch), + SND_SOC_DAPM_SWITCH("HPOR Playback", SND_SOC_NOPM, 0, 0, + &hpor_switch), + + /* CLK DET */ + SND_SOC_DAPM_SUPPLY("CLKDET SYS", RT5668_CLK_DET, + RT5668_SYS_CLK_DET_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("CLKDET PLL1", RT5668_CLK_DET, + RT5668_PLL1_CLK_DET_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("CLKDET 
PLL2", RT5668_CLK_DET, + RT5668_PLL2_CLK_DET_SFT, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("CLKDET", RT5668_CLK_DET, + RT5668_POW_CLK_DET_SFT, 0, NULL, 0), + + /* Output Lines */ + SND_SOC_DAPM_OUTPUT("HPOL"), + SND_SOC_DAPM_OUTPUT("HPOR"), + +}; + +static const struct snd_soc_dapm_route rt5668_dapm_routes[] = { + /*PLL*/ + {"ADC Stereo1 Filter", NULL, "PLL1", is_sys_clk_from_pll1}, + {"DAC Stereo1 Filter", NULL, "PLL1", is_sys_clk_from_pll1}, + + /*ASRC*/ + {"ADC Stereo1 Filter", NULL, "ADC STO1 ASRC", is_using_asrc}, + {"DAC Stereo1 Filter", NULL, "DAC STO1 ASRC", is_using_asrc}, + {"ADC STO1 ASRC", NULL, "AD ASRC"}, + {"DAC STO1 ASRC", NULL, "DA ASRC"}, + + /*Vref*/ + {"MICBIAS1", NULL, "Vref1"}, + {"MICBIAS1", NULL, "Vref2"}, + {"MICBIAS2", NULL, "Vref1"}, + {"MICBIAS2", NULL, "Vref2"}, + + {"CLKDET SYS", NULL, "CLKDET"}, + + {"IN1P", NULL, "LDO2"}, + + {"BST1 CBJ", NULL, "IN1P"}, + {"BST1 CBJ", NULL, "CBJ Power"}, + {"CBJ Power", NULL, "Vref2"}, + + {"RECMIX1L", "CBJ Switch", "BST1 CBJ"}, + {"RECMIX1L", NULL, "RECMIX1L Power"}, + + {"ADC1 L", NULL, "RECMIX1L"}, + {"ADC1 L", NULL, "ADC1 L Power"}, + {"ADC1 L", NULL, "ADC1 clock"}, + + {"DMIC L1", NULL, "DMIC CLK"}, + {"DMIC L1", NULL, "DMIC1 Power"}, + {"DMIC R1", NULL, "DMIC CLK"}, + {"DMIC R1", NULL, "DMIC1 Power"}, + {"DMIC CLK", NULL, "DMIC ASRC"}, + + {"Stereo1 ADC L Mux", "ADC1 L", "ADC1 L"}, + {"Stereo1 ADC L Mux", "ADC1 R", "ADC1 R"}, + {"Stereo1 ADC R Mux", "ADC1 L", "ADC1 L"}, + {"Stereo1 ADC R Mux", "ADC1 R", "ADC1 R"}, + + {"Stereo1 ADC L1 Mux", "ADC", "Stereo1 ADC L Mux"}, + {"Stereo1 ADC L1 Mux", "DAC MIX", "Stereo1 DAC MIXL"}, + {"Stereo1 ADC L2 Mux", "DMIC", "DMIC L1"}, + {"Stereo1 ADC L2 Mux", "DAC MIX", "Stereo1 DAC MIXL"}, + + {"Stereo1 ADC R1 Mux", "ADC", "Stereo1 ADC R Mux"}, + {"Stereo1 ADC R1 Mux", "DAC MIX", "Stereo1 DAC MIXR"}, + {"Stereo1 ADC R2 Mux", "DMIC", "DMIC R1"}, + {"Stereo1 ADC R2 Mux", "DAC MIX", "Stereo1 DAC MIXR"}, + + {"Stereo1 ADC MIXL", "ADC1 Switch", "Stereo1 ADC L1 Mux"}, + {"Stereo1 ADC MIXL", "ADC2 Switch", "Stereo1 ADC L2 Mux"}, + {"Stereo1 ADC MIXL", NULL, "ADC Stereo1 Filter"}, + + {"Stereo1 ADC MIXR", "ADC1 Switch", "Stereo1 ADC R1 Mux"}, + {"Stereo1 ADC MIXR", "ADC2 Switch", "Stereo1 ADC R2 Mux"}, + {"Stereo1 ADC MIXR", NULL, "ADC Stereo1 Filter"}, + + {"Stereo1 ADC MIX", NULL, "Stereo1 ADC MIXL"}, + {"Stereo1 ADC MIX", NULL, "Stereo1 ADC MIXR"}, + + {"IF1 01 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 01 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 01 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 01 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 23 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 45 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF1 67 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + + {"IF1_ADC Mux", "Slot 0", "IF1 01 ADC Swap Mux"}, + {"IF1_ADC Mux", "Slot 2", "IF1 23 ADC Swap Mux"}, + {"IF1_ADC Mux", "Slot 4", "IF1 45 ADC Swap Mux"}, + {"IF1_ADC Mux", "Slot 6", "IF1 67 ADC Swap Mux"}, + {"IF1_ADC Mux", NULL, "I2S1"}, + {"ADCDAT Mux", "ADCDAT1", "IF1_ADC Mux"}, + {"AIF1TX", NULL, "ADCDAT 
Mux"}, + {"IF2 ADC Swap Mux", "L/R", "Stereo1 ADC MIX"}, + {"IF2 ADC Swap Mux", "R/L", "Stereo1 ADC MIX"}, + {"IF2 ADC Swap Mux", "L/L", "Stereo1 ADC MIX"}, + {"IF2 ADC Swap Mux", "R/R", "Stereo1 ADC MIX"}, + {"ADCDAT Mux", "ADCDAT2", "IF2 ADC Swap Mux"}, + {"AIF2TX", NULL, "ADCDAT Mux"}, + + {"IF1 DAC1 L", NULL, "AIF1RX"}, + {"IF1 DAC1 L", NULL, "I2S1"}, + {"IF1 DAC1 L", NULL, "DAC Stereo1 Filter"}, + {"IF1 DAC1 R", NULL, "AIF1RX"}, + {"IF1 DAC1 R", NULL, "I2S1"}, + {"IF1 DAC1 R", NULL, "DAC Stereo1 Filter"}, + + {"DAC1 MIXL", "Stereo ADC Switch", "Stereo1 ADC MIXL"}, + {"DAC1 MIXL", "DAC1 Switch", "IF1 DAC1 L"}, + {"DAC1 MIXR", "Stereo ADC Switch", "Stereo1 ADC MIXR"}, + {"DAC1 MIXR", "DAC1 Switch", "IF1 DAC1 R"}, + + {"Stereo1 DAC MIXL", "DAC L1 Switch", "DAC1 MIXL"}, + {"Stereo1 DAC MIXL", "DAC R1 Switch", "DAC1 MIXR"}, + + {"Stereo1 DAC MIXR", "DAC R1 Switch", "DAC1 MIXR"}, + {"Stereo1 DAC MIXR", "DAC L1 Switch", "DAC1 MIXL"}, + + {"DAC L1 Source", "DAC1", "DAC1 MIXL"}, + {"DAC L1 Source", "Stereo1 DAC Mixer", "Stereo1 DAC MIXL"}, + {"DAC R1 Source", "DAC1", "DAC1 MIXR"}, + {"DAC R1 Source", "Stereo1 DAC Mixer", "Stereo1 DAC MIXR"}, + + {"DAC L1", NULL, "DAC L1 Source"}, + {"DAC R1", NULL, "DAC R1 Source"}, + + {"DAC L1", NULL, "DAC 1 Clock"}, + {"DAC R1", NULL, "DAC 1 Clock"}, + + {"HP Amp", NULL, "DAC L1"}, + {"HP Amp", NULL, "DAC R1"}, + {"HP Amp", NULL, "HP Amp L"}, + {"HP Amp", NULL, "HP Amp R"}, + {"HP Amp", NULL, "Capless"}, + {"HP Amp", NULL, "Charge Pump"}, + {"HP Amp", NULL, "CLKDET SYS"}, + {"HP Amp", NULL, "CBJ Power"}, + {"HP Amp", NULL, "Vref2"}, + {"HPOL Playback", "Switch", "HP Amp"}, + {"HPOR Playback", "Switch", "HP Amp"}, + {"HPOL", NULL, "HPOL Playback"}, + {"HPOR", NULL, "HPOR Playback"}, +}; + +static int rt5668_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask, + unsigned int rx_mask, int slots, int slot_width) +{ + struct snd_soc_component *component = dai->component; + unsigned int val = 0; + + switch (slots) { + case 4: + val |= RT5668_TDM_TX_CH_4; + val |= RT5668_TDM_RX_CH_4; + break; + case 6: + val |= RT5668_TDM_TX_CH_6; + val |= RT5668_TDM_RX_CH_6; + break; + case 8: + val |= RT5668_TDM_TX_CH_8; + val |= RT5668_TDM_RX_CH_8; + break; + case 2: + break; + default: + return -EINVAL; + } + + snd_soc_component_update_bits(component, RT5668_TDM_CTRL, + RT5668_TDM_TX_CH_MASK | RT5668_TDM_RX_CH_MASK, val); + + switch (slot_width) { + case 16: + val = RT5668_TDM_CL_16; + break; + case 20: + val = RT5668_TDM_CL_20; + break; + case 24: + val = RT5668_TDM_CL_24; + break; + case 32: + val = RT5668_TDM_CL_32; + break; + default: + return -EINVAL; + } + + snd_soc_component_update_bits(component, RT5668_TDM_TCON_CTRL, + RT5668_TDM_CL_MASK, val); + + return 0; +} + + +static int rt5668_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + unsigned int len_1 = 0, len_2 = 0; + int pre_div, frame_size; + + rt5668->lrck[dai->id] = params_rate(params); + pre_div = rl6231_get_clk_info(rt5668->sysclk, rt5668->lrck[dai->id]); + + frame_size = snd_soc_params_to_frame_size(params); + if (frame_size < 0) { + dev_err(component->dev, "Unsupported frame size: %d\n", + frame_size); + return -EINVAL; + } + + dev_dbg(dai->dev, "lrck is %dHz and pre_div is %d for iis %d\n", + rt5668->lrck[dai->id], pre_div, dai->id); + + switch (params_width(params)) { + case 16: + break; + case 20: + len_1 |= 
RT5668_I2S1_DL_20; + len_2 |= RT5668_I2S2_DL_20; + break; + case 24: + len_1 |= RT5668_I2S1_DL_24; + len_2 |= RT5668_I2S2_DL_24; + break; + case 32: + len_1 |= RT5668_I2S1_DL_32; + len_2 |= RT5668_I2S2_DL_24; + break; + case 8: + len_1 |= RT5668_I2S2_DL_8; + len_2 |= RT5668_I2S2_DL_8; + break; + default: + return -EINVAL; + } + + switch (dai->id) { + case RT5668_AIF1: + snd_soc_component_update_bits(component, RT5668_I2S1_SDP, + RT5668_I2S1_DL_MASK, len_1); + if (rt5668->master[RT5668_AIF1]) { + snd_soc_component_update_bits(component, + RT5668_ADDA_CLK_1, RT5668_I2S_M_DIV_MASK, + pre_div << RT5668_I2S_M_DIV_SFT); + } + if (params_channels(params) == 1) /* mono mode */ + snd_soc_component_update_bits(component, + RT5668_I2S1_SDP, RT5668_I2S1_MONO_MASK, + RT5668_I2S1_MONO_EN); + else + snd_soc_component_update_bits(component, + RT5668_I2S1_SDP, RT5668_I2S1_MONO_MASK, + RT5668_I2S1_MONO_DIS); + break; + case RT5668_AIF2: + snd_soc_component_update_bits(component, RT5668_I2S2_SDP, + RT5668_I2S2_DL_MASK, len_2); + if (rt5668->master[RT5668_AIF2]) { + snd_soc_component_update_bits(component, + RT5668_I2S_M_CLK_CTRL_1, RT5668_I2S2_M_PD_MASK, + pre_div << RT5668_I2S2_M_PD_SFT); + } + if (params_channels(params) == 1) /* mono mode */ + snd_soc_component_update_bits(component, + RT5668_I2S2_SDP, RT5668_I2S2_MONO_MASK, + RT5668_I2S2_MONO_EN); + else + snd_soc_component_update_bits(component, + RT5668_I2S2_SDP, RT5668_I2S2_MONO_MASK, + RT5668_I2S2_MONO_DIS); + break; + default: + dev_err(component->dev, "Invalid dai->id: %d\n", dai->id); + return -EINVAL; + } + + return 0; +} + +static int rt5668_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt) +{ + struct snd_soc_component *component = dai->component; + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + unsigned int reg_val = 0, tdm_ctrl = 0; + + switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { + case SND_SOC_DAIFMT_CBM_CFM: + rt5668->master[dai->id] = 1; + break; + case SND_SOC_DAIFMT_CBS_CFS: + rt5668->master[dai->id] = 0; + break; + default: + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_INV_MASK) { + case SND_SOC_DAIFMT_NB_NF: + break; + case SND_SOC_DAIFMT_IB_NF: + reg_val |= RT5668_I2S_BP_INV; + tdm_ctrl |= RT5668_TDM_S_BP_INV; + break; + case SND_SOC_DAIFMT_NB_IF: + if (dai->id == RT5668_AIF1) + tdm_ctrl |= RT5668_TDM_S_LP_INV | RT5668_TDM_M_BP_INV; + else + return -EINVAL; + break; + case SND_SOC_DAIFMT_IB_IF: + if (dai->id == RT5668_AIF1) + tdm_ctrl |= RT5668_TDM_S_BP_INV | RT5668_TDM_S_LP_INV | + RT5668_TDM_M_BP_INV | RT5668_TDM_M_LP_INV; + else + return -EINVAL; + break; + default: + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_I2S: + break; + case SND_SOC_DAIFMT_LEFT_J: + reg_val |= RT5668_I2S_DF_LEFT; + tdm_ctrl |= RT5668_TDM_DF_LEFT; + break; + case SND_SOC_DAIFMT_DSP_A: + reg_val |= RT5668_I2S_DF_PCM_A; + tdm_ctrl |= RT5668_TDM_DF_PCM_A; + break; + case SND_SOC_DAIFMT_DSP_B: + reg_val |= RT5668_I2S_DF_PCM_B; + tdm_ctrl |= RT5668_TDM_DF_PCM_B; + break; + default: + return -EINVAL; + } + + switch (dai->id) { + case RT5668_AIF1: + snd_soc_component_update_bits(component, RT5668_I2S1_SDP, + RT5668_I2S_DF_MASK, reg_val); + snd_soc_component_update_bits(component, RT5668_TDM_TCON_CTRL, + RT5668_TDM_MS_MASK | RT5668_TDM_S_BP_MASK | + RT5668_TDM_DF_MASK | RT5668_TDM_M_BP_MASK | + RT5668_TDM_M_LP_MASK | RT5668_TDM_S_LP_MASK, + tdm_ctrl | rt5668->master[dai->id]); + break; + case RT5668_AIF2: + if (rt5668->master[dai->id] == 0) + reg_val |= RT5668_I2S2_MS_S; + 
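+		/* I2S2 carries its master/slave selection in the SDP register */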
snd_soc_component_update_bits(component, RT5668_I2S2_SDP,
+			RT5668_I2S2_MS_MASK | RT5668_I2S_BP_MASK |
+			RT5668_I2S_DF_MASK, reg_val);
+		break;
+	default:
+		dev_err(component->dev, "Invalid dai->id: %d\n", dai->id);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int rt5668_set_component_sysclk(struct snd_soc_component *component,
+		int clk_id, int source, unsigned int freq, int dir)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+	unsigned int reg_val = 0, src = 0;
+
+	if (freq == rt5668->sysclk && clk_id == rt5668->sysclk_src)
+		return 0;
+
+	switch (clk_id) {
+	case RT5668_SCLK_S_MCLK:
+		reg_val |= RT5668_SCLK_SRC_MCLK;
+		src = RT5668_CLK_SRC_MCLK;
+		break;
+	case RT5668_SCLK_S_PLL1:
+		reg_val |= RT5668_SCLK_SRC_PLL1;
+		src = RT5668_CLK_SRC_PLL1;
+		break;
+	case RT5668_SCLK_S_PLL2:
+		reg_val |= RT5668_SCLK_SRC_PLL2;
+		src = RT5668_CLK_SRC_PLL2;
+		break;
+	case RT5668_SCLK_S_RCCLK:
+		reg_val |= RT5668_SCLK_SRC_RCCLK;
+		src = RT5668_CLK_SRC_RCCLK;
+		break;
+	default:
+		dev_err(component->dev, "Invalid clock id (%d)\n", clk_id);
+		return -EINVAL;
+	}
+	snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+		RT5668_SCLK_SRC_MASK, reg_val);
+
+	if (rt5668->master[RT5668_AIF2]) {
+		snd_soc_component_update_bits(component,
+			RT5668_I2S_M_CLK_CTRL_1, RT5668_I2S2_SRC_MASK,
+			src << RT5668_I2S2_SRC_SFT);
+	}
+
+	rt5668->sysclk = freq;
+	rt5668->sysclk_src = clk_id;
+
+	dev_dbg(component->dev, "Sysclk is %dHz and clock id is %d\n",
+		freq, clk_id);
+
+	return 0;
+}
+
+static int rt5668_set_component_pll(struct snd_soc_component *component,
+		int pll_id, int source, unsigned int freq_in,
+		unsigned int freq_out)
+{
+	struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component);
+	struct rl6231_pll_code pll_code;
+	int ret;
+
+	if (source == rt5668->pll_src && freq_in == rt5668->pll_in &&
+	    freq_out == rt5668->pll_out)
+		return 0;
+
+	if (!freq_in || !freq_out) {
+		dev_dbg(component->dev, "PLL disabled\n");
+
+		rt5668->pll_in = 0;
+		rt5668->pll_out = 0;
+		snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+			RT5668_SCLK_SRC_MASK, RT5668_SCLK_SRC_MCLK);
+		return 0;
+	}
+
+	switch (source) {
+	case RT5668_PLL1_S_MCLK:
+		snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+			RT5668_PLL1_SRC_MASK, RT5668_PLL1_SRC_MCLK);
+		break;
+	case RT5668_PLL1_S_BCLK1:
+		snd_soc_component_update_bits(component, RT5668_GLB_CLK,
+			RT5668_PLL1_SRC_MASK, RT5668_PLL1_SRC_BCLK1);
+		break;
+	default:
+		dev_err(component->dev, "Unknown PLL Source %d\n", source);
+		return -EINVAL;
+	}
+
+	ret = rl6231_pll_calc(freq_in, freq_out, &pll_code);
+	if (ret < 0) {
+		dev_err(component->dev, "Unsupported input clock %d\n",
+			freq_in);
+		return ret;
+	}
+
+	dev_dbg(component->dev, "bypass=%d m=%d n=%d k=%d\n",
+		pll_code.m_bp, (pll_code.m_bp ? 0 : pll_code.m_code),
+		pll_code.n_code, pll_code.k_code);
+
+	snd_soc_component_write(component, RT5668_PLL_CTRL_1,
+		pll_code.n_code << RT5668_PLL_N_SFT | pll_code.k_code);
+	snd_soc_component_write(component, RT5668_PLL_CTRL_2,
+		(pll_code.m_bp ?
0 : pll_code.m_code) << RT5668_PLL_M_SFT | + pll_code.m_bp << RT5668_PLL_M_BP_SFT); + + rt5668->pll_in = freq_in; + rt5668->pll_out = freq_out; + rt5668->pll_src = source; + + return 0; +} + +static int rt5668_set_bclk_ratio(struct snd_soc_dai *dai, unsigned int ratio) +{ + struct snd_soc_component *component = dai->component; + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + rt5668->bclk[dai->id] = ratio; + + switch (ratio) { + case 64: + snd_soc_component_update_bits(component, RT5668_ADDA_CLK_2, + RT5668_I2S2_BCLK_MS2_MASK, + RT5668_I2S2_BCLK_MS2_64); + break; + case 32: + snd_soc_component_update_bits(component, RT5668_ADDA_CLK_2, + RT5668_I2S2_BCLK_MS2_MASK, + RT5668_I2S2_BCLK_MS2_32); + break; + default: + dev_err(dai->dev, "Invalid bclk ratio %d\n", ratio); + return -EINVAL; + } + + return 0; +} + +static int rt5668_set_bias_level(struct snd_soc_component *component, + enum snd_soc_bias_level level) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + switch (level) { + case SND_SOC_BIAS_PREPARE: + regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1, + RT5668_PWR_MB | RT5668_PWR_BG, + RT5668_PWR_MB | RT5668_PWR_BG); + regmap_update_bits(rt5668->regmap, RT5668_PWR_DIG_1, + RT5668_DIG_GATE_CTRL | RT5668_PWR_LDO, + RT5668_DIG_GATE_CTRL | RT5668_PWR_LDO); + break; + + case SND_SOC_BIAS_STANDBY: + regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1, + RT5668_PWR_MB, RT5668_PWR_MB); + regmap_update_bits(rt5668->regmap, RT5668_PWR_DIG_1, + RT5668_DIG_GATE_CTRL, RT5668_DIG_GATE_CTRL); + break; + case SND_SOC_BIAS_OFF: + regmap_update_bits(rt5668->regmap, RT5668_PWR_DIG_1, + RT5668_DIG_GATE_CTRL | RT5668_PWR_LDO, 0); + regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1, + RT5668_PWR_MB | RT5668_PWR_BG, 0); + break; + + default: + break; + } + + return 0; +} + +static int rt5668_probe(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + rt5668->component = component; + + return 0; +} + +static void rt5668_remove(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + rt5668_reset(rt5668->regmap); +} + +#ifdef CONFIG_PM +static int rt5668_suspend(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + regcache_cache_only(rt5668->regmap, true); + regcache_mark_dirty(rt5668->regmap); + return 0; +} + +static int rt5668_resume(struct snd_soc_component *component) +{ + struct rt5668_priv *rt5668 = snd_soc_component_get_drvdata(component); + + regcache_cache_only(rt5668->regmap, false); + regcache_sync(rt5668->regmap); + + return 0; +} +#else +#define rt5668_suspend NULL +#define rt5668_resume NULL +#endif + +#define RT5668_STEREO_RATES SNDRV_PCM_RATE_8000_192000 +#define RT5668_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S20_3LE | \ + SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S8) + +static const struct snd_soc_dai_ops rt5668_aif1_dai_ops = { + .hw_params = rt5668_hw_params, + .set_fmt = rt5668_set_dai_fmt, + .set_tdm_slot = rt5668_set_tdm_slot, +}; + +static const struct snd_soc_dai_ops rt5668_aif2_dai_ops = { + .hw_params = rt5668_hw_params, + .set_fmt = rt5668_set_dai_fmt, + .set_bclk_ratio = rt5668_set_bclk_ratio, +}; + +static struct snd_soc_dai_driver rt5668_dai[] = { + { + .name = "rt5668-aif1", + .id = RT5668_AIF1, + .playback = { + .stream_name = "AIF1 Playback", + .channels_min = 1, + .channels_max = 2, + .rates = RT5668_STEREO_RATES, + 
.formats = RT5668_FORMATS, + }, + .capture = { + .stream_name = "AIF1 Capture", + .channels_min = 1, + .channels_max = 2, + .rates = RT5668_STEREO_RATES, + .formats = RT5668_FORMATS, + }, + .ops = &rt5668_aif1_dai_ops, + }, + { + .name = "rt5668-aif2", + .id = RT5668_AIF2, + .capture = { + .stream_name = "AIF2 Capture", + .channels_min = 1, + .channels_max = 2, + .rates = RT5668_STEREO_RATES, + .formats = RT5668_FORMATS, + }, + .ops = &rt5668_aif2_dai_ops, + }, +}; + +static const struct snd_soc_component_driver soc_component_dev_rt5668 = { + .probe = rt5668_probe, + .remove = rt5668_remove, + .suspend = rt5668_suspend, + .resume = rt5668_resume, + .set_bias_level = rt5668_set_bias_level, + .controls = rt5668_snd_controls, + .num_controls = ARRAY_SIZE(rt5668_snd_controls), + .dapm_widgets = rt5668_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(rt5668_dapm_widgets), + .dapm_routes = rt5668_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(rt5668_dapm_routes), + .set_sysclk = rt5668_set_component_sysclk, + .set_pll = rt5668_set_component_pll, + .set_jack = rt5668_set_jack_detect, + .use_pmdown_time = 1, + .endianness = 1, + .non_legacy_dai_naming = 1, +}; + +static const struct regmap_config rt5668_regmap = { + .reg_bits = 16, + .val_bits = 16, + .max_register = RT5668_I2C_MODE, + .volatile_reg = rt5668_volatile_register, + .readable_reg = rt5668_readable_register, + .cache_type = REGCACHE_RBTREE, + .reg_defaults = rt5668_reg, + .num_reg_defaults = ARRAY_SIZE(rt5668_reg), + .use_single_rw = true, +}; + +static const struct i2c_device_id rt5668_i2c_id[] = { + {"rt5668b", 0}, + {} +}; +MODULE_DEVICE_TABLE(i2c, rt5668_i2c_id); + +static int rt5668_parse_dt(struct rt5668_priv *rt5668, struct device *dev) +{ + + of_property_read_u32(dev->of_node, "realtek,dmic1-data-pin", + &rt5668->pdata.dmic1_data_pin); + of_property_read_u32(dev->of_node, "realtek,dmic1-clk-pin", + &rt5668->pdata.dmic1_clk_pin); + of_property_read_u32(dev->of_node, "realtek,jd-src", + &rt5668->pdata.jd_src); + + rt5668->pdata.ldo1_en = of_get_named_gpio(dev->of_node, + "realtek,ldo1-en-gpios", 0); + + return 0; +} + +static void rt5668_calibrate(struct rt5668_priv *rt5668) +{ + int value, count; + + mutex_lock(&rt5668->calibrate_mutex); + + rt5668_reset(rt5668->regmap); + regmap_write(rt5668->regmap, RT5668_PWR_ANLG_1, 0xa2bf); + usleep_range(15000, 20000); + regmap_write(rt5668->regmap, RT5668_PWR_ANLG_1, 0xf2bf); + regmap_write(rt5668->regmap, RT5668_MICBIAS_2, 0x0380); + regmap_write(rt5668->regmap, RT5668_PWR_DIG_1, 0x8001); + regmap_write(rt5668->regmap, RT5668_TEST_MODE_CTRL_1, 0x0000); + regmap_write(rt5668->regmap, RT5668_STO1_DAC_MIXER, 0x2080); + regmap_write(rt5668->regmap, RT5668_STO1_ADC_MIXER, 0x4040); + regmap_write(rt5668->regmap, RT5668_DEPOP_1, 0x0069); + regmap_write(rt5668->regmap, RT5668_CHOP_DAC, 0x3000); + regmap_write(rt5668->regmap, RT5668_HP_CTRL_2, 0x6000); + regmap_write(rt5668->regmap, RT5668_HP_CHARGE_PUMP_1, 0x0f26); + regmap_write(rt5668->regmap, RT5668_CALIB_ADC_CTRL, 0x7f05); + regmap_write(rt5668->regmap, RT5668_STO1_ADC_MIXER, 0x686c); + regmap_write(rt5668->regmap, RT5668_CAL_REC, 0x0d0d); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_9, 0x000f); + regmap_write(rt5668->regmap, RT5668_PWR_DIG_1, 0x8d01); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_2, 0x0321); + regmap_write(rt5668->regmap, RT5668_HP_LOGIC_CTRL_2, 0x0004); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_1, 0x7c00); + regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_3, 0x06a1); + regmap_write(rt5668->regmap, 
RT5668_A_DAC1_MUX, 0x0311);
+	regmap_write(rt5668->regmap, RT5668_RESET_HPF_CTRL, 0x0000);
+	regmap_write(rt5668->regmap, RT5668_ADC_STO1_HP_CTRL_1, 0x3320);
+
+	regmap_write(rt5668->regmap, RT5668_HP_CALIB_CTRL_1, 0xfc00);
+
+	for (count = 0; count < 60; count++) {
+		regmap_read(rt5668->regmap, RT5668_HP_CALIB_STA_1, &value);
+		if (!(value & 0x8000))
+			break;
+
+		usleep_range(10000, 10005);
+	}
+
+	if (count >= 60)
+		pr_err("HP Calibration Failure\n");
+
+	/* restore settings */
+	regmap_write(rt5668->regmap, RT5668_STO1_ADC_MIXER, 0xc0c4);
+	regmap_write(rt5668->regmap, RT5668_PWR_DIG_1, 0x0000);
+
+	mutex_unlock(&rt5668->calibrate_mutex);
+}
+
+static int rt5668_i2c_probe(struct i2c_client *i2c,
+		const struct i2c_device_id *id)
+{
+	struct rt5668_platform_data *pdata = dev_get_platdata(&i2c->dev);
+	struct rt5668_priv *rt5668;
+	int i, ret;
+	unsigned int val;
+
+	rt5668 = devm_kzalloc(&i2c->dev, sizeof(struct rt5668_priv),
+		GFP_KERNEL);
+
+	if (rt5668 == NULL)
+		return -ENOMEM;
+
+	i2c_set_clientdata(i2c, rt5668);
+
+	if (pdata)
+		rt5668->pdata = *pdata;
+	else
+		rt5668_parse_dt(rt5668, &i2c->dev);
+
+	rt5668->regmap = devm_regmap_init_i2c(i2c, &rt5668_regmap);
+	if (IS_ERR(rt5668->regmap)) {
+		ret = PTR_ERR(rt5668->regmap);
+		dev_err(&i2c->dev, "Failed to allocate register map: %d\n",
+			ret);
+		return ret;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(rt5668->supplies); i++)
+		rt5668->supplies[i].supply = rt5668_supply_names[i];
+
+	ret = devm_regulator_bulk_get(&i2c->dev, ARRAY_SIZE(rt5668->supplies),
+		rt5668->supplies);
+	if (ret != 0) {
+		dev_err(&i2c->dev, "Failed to request supplies: %d\n", ret);
+		return ret;
+	}
+
+	ret = regulator_bulk_enable(ARRAY_SIZE(rt5668->supplies),
+		rt5668->supplies);
+	if (ret != 0) {
+		dev_err(&i2c->dev, "Failed to enable supplies: %d\n", ret);
+		return ret;
+	}
+
+	if (gpio_is_valid(rt5668->pdata.ldo1_en)) {
+		if (devm_gpio_request_one(&i2c->dev, rt5668->pdata.ldo1_en,
+			GPIOF_OUT_INIT_HIGH, "rt5668"))
+			dev_err(&i2c->dev, "Failed to request ldo1_en GPIO\n");
+	}
+
+	/* Sleep for 300 ms minimum */
+	usleep_range(300000, 350000);
+
+	regmap_write(rt5668->regmap, RT5668_I2C_MODE, 0x1);
+	usleep_range(10000, 15000);
+
+	regmap_read(rt5668->regmap, RT5668_DEVICE_ID, &val);
+	if (val != DEVICE_ID) {
+		pr_err("Device with ID register %x is not rt5668\n", val);
+		return -ENODEV;
+	}
+
+	rt5668_reset(rt5668->regmap);
+
+	rt5668_calibrate(rt5668);
+
+	regmap_write(rt5668->regmap, RT5668_DEPOP_1, 0x0000);
+
+	/* DMIC pin */
+	if (rt5668->pdata.dmic1_data_pin != RT5668_DMIC1_NULL) {
+		switch (rt5668->pdata.dmic1_data_pin) {
+		case RT5668_DMIC1_DATA_GPIO2: /* share with LRCK2 */
+			regmap_update_bits(rt5668->regmap, RT5668_DMIC_CTRL_1,
+				RT5668_DMIC_1_DP_MASK, RT5668_DMIC_1_DP_GPIO2);
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP2_PIN_MASK, RT5668_GP2_PIN_DMIC_SDA);
+			break;
+
+		case RT5668_DMIC1_DATA_GPIO5: /* share with DACDAT1 */
+			regmap_update_bits(rt5668->regmap, RT5668_DMIC_CTRL_1,
+				RT5668_DMIC_1_DP_MASK, RT5668_DMIC_1_DP_GPIO5);
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP5_PIN_MASK, RT5668_GP5_PIN_DMIC_SDA);
+			break;
+
+		default:
+			dev_dbg(&i2c->dev, "invalid DMIC_DAT pin\n");
+			break;
+		}
+
+		switch (rt5668->pdata.dmic1_clk_pin) {
+		case RT5668_DMIC1_CLK_GPIO1: /* share with IRQ */
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP1_PIN_MASK, RT5668_GP1_PIN_DMIC_CLK);
+			break;
+
+		case RT5668_DMIC1_CLK_GPIO3: /* share with BCLK2 */
+			regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+				RT5668_GP3_PIN_MASK, RT5668_GP3_PIN_DMIC_CLK);
+			break;
+
+		default:
+			dev_dbg(&i2c->dev, "invalid DMIC_CLK pin\n");
+			break;
+		}
+	}
+
+	regmap_update_bits(rt5668->regmap, RT5668_PWR_ANLG_1,
+		RT5668_LDO1_DVO_MASK | RT5668_HP_DRIVER_MASK,
+		RT5668_LDO1_DVO_14 | RT5668_HP_DRIVER_5X);
+	regmap_write(rt5668->regmap, RT5668_MICBIAS_2, 0x0380);
+	regmap_update_bits(rt5668->regmap, RT5668_GPIO_CTRL_1,
+		RT5668_GP4_PIN_MASK | RT5668_GP5_PIN_MASK,
+		RT5668_GP4_PIN_ADCDAT1 | RT5668_GP5_PIN_DACDAT1);
+	regmap_write(rt5668->regmap, RT5668_TEST_MODE_CTRL_1, 0x0000);
+
+	INIT_DELAYED_WORK(&rt5668->jack_detect_work,
+		rt5668_jack_detect_handler);
+	INIT_DELAYED_WORK(&rt5668->jd_check_work,
+		rt5668_jd_check_handler);
+
+	mutex_init(&rt5668->calibrate_mutex);
+
+	if (i2c->irq) {
+		ret = devm_request_threaded_irq(&i2c->dev, i2c->irq, NULL,
+			rt5668_irq, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING
+			| IRQF_ONESHOT, "rt5668", rt5668);
+		if (ret)
+			dev_err(&i2c->dev, "Failed to request IRQ: %d\n", ret);
+	}
+
+	return snd_soc_register_component(&i2c->dev, &soc_component_dev_rt5668,
+		rt5668_dai, ARRAY_SIZE(rt5668_dai));
+}
+
+static int rt5668_i2c_remove(struct i2c_client *i2c)
+{
+	snd_soc_unregister_component(&i2c->dev);
+
+	return 0;
+}
+
+static void rt5668_i2c_shutdown(struct i2c_client *client)
+{
+	struct rt5668_priv *rt5668 = i2c_get_clientdata(client);
+
+	rt5668_reset(rt5668->regmap);
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id rt5668_of_match[] = {
+	{.compatible = "realtek,rt5668b"},
+	{},
+};
+MODULE_DEVICE_TABLE(of, rt5668_of_match);
+#endif
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id rt5668_acpi_match[] = {
+	{"10EC5668", 0,},
+	{},
+};
+MODULE_DEVICE_TABLE(acpi, rt5668_acpi_match);
+#endif
+
+static struct i2c_driver rt5668_i2c_driver = {
+	.driver = {
+		.name = "rt5668b",
+		.of_match_table = of_match_ptr(rt5668_of_match),
+		.acpi_match_table = ACPI_PTR(rt5668_acpi_match),
+	},
+	.probe = rt5668_i2c_probe,
+	.remove = rt5668_i2c_remove,
+	.shutdown = rt5668_i2c_shutdown,
+	.id_table = rt5668_i2c_id,
+};
+module_i2c_driver(rt5668_i2c_driver);
+
+MODULE_DESCRIPTION("ASoC RT5668B driver");
+MODULE_AUTHOR("Bard Liao ");
+MODULE_LICENSE("GPL v2");
diff --git a/sound/soc/codecs/rt5668.h b/sound/soc/codecs/rt5668.h
new file mode 100644
index 000000000000..3e7bcfd569ec
--- /dev/null
+++ b/sound/soc/codecs/rt5668.h
@@ -0,0 +1,1318 @@
+/*
+ * rt5668.h -- RT5668/RT5668B ALSA SoC audio driver
+ *
+ * Copyright 2018 Realtek Microelectronics
+ * Author: Bard Liao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#ifndef __RT5668_H__ +#define __RT5668_H__ + +#include + +#define DEVICE_ID 0x6530 + +/* Info */ +#define RT5668_RESET 0x0000 +#define RT5668_VERSION_ID 0x00fd +#define RT5668_VENDOR_ID 0x00fe +#define RT5668_DEVICE_ID 0x00ff +/* I/O - Output */ +#define RT5668_HP_CTRL_1 0x0002 +#define RT5668_HP_CTRL_2 0x0003 +#define RT5668_HPL_GAIN 0x0005 +#define RT5668_HPR_GAIN 0x0006 + +#define RT5668_I2C_CTRL 0x0008 + +/* I/O - Input */ +#define RT5668_CBJ_BST_CTRL 0x000b +#define RT5668_CBJ_CTRL_1 0x0010 +#define RT5668_CBJ_CTRL_2 0x0011 +#define RT5668_CBJ_CTRL_3 0x0012 +#define RT5668_CBJ_CTRL_4 0x0013 +#define RT5668_CBJ_CTRL_5 0x0014 +#define RT5668_CBJ_CTRL_6 0x0015 +#define RT5668_CBJ_CTRL_7 0x0016 +/* I/O - ADC/DAC/DMIC */ +#define RT5668_DAC1_DIG_VOL 0x0019 +#define RT5668_STO1_ADC_DIG_VOL 0x001c +#define RT5668_STO1_ADC_BOOST 0x001f +#define RT5668_HP_IMP_GAIN_1 0x0022 +#define RT5668_HP_IMP_GAIN_2 0x0023 +/* Mixer - D-D */ +#define RT5668_SIDETONE_CTRL 0x0024 +#define RT5668_STO1_ADC_MIXER 0x0026 +#define RT5668_AD_DA_MIXER 0x0029 +#define RT5668_STO1_DAC_MIXER 0x002a +#define RT5668_A_DAC1_MUX 0x002b +#define RT5668_DIG_INF2_DATA 0x0030 +/* Mixer - ADC */ +#define RT5668_REC_MIXER 0x003c +#define RT5668_CAL_REC 0x0044 +#define RT5668_ALC_BACK_GAIN 0x0049 +/* Power */ +#define RT5668_PWR_DIG_1 0x0061 +#define RT5668_PWR_DIG_2 0x0062 +#define RT5668_PWR_ANLG_1 0x0063 +#define RT5668_PWR_ANLG_2 0x0064 +#define RT5668_PWR_ANLG_3 0x0065 +#define RT5668_PWR_MIXER 0x0066 +#define RT5668_PWR_VOL 0x0067 +/* Clock Detect */ +#define RT5668_CLK_DET 0x006b +/* Filter Auto Reset */ +#define RT5668_RESET_LPF_CTRL 0x006c +#define RT5668_RESET_HPF_CTRL 0x006d +/* DMIC */ +#define RT5668_DMIC_CTRL_1 0x006e +/* Format - ADC/DAC */ +#define RT5668_I2S1_SDP 0x0070 +#define RT5668_I2S2_SDP 0x0071 +#define RT5668_ADDA_CLK_1 0x0073 +#define RT5668_ADDA_CLK_2 0x0074 +#define RT5668_I2S1_F_DIV_CTRL_1 0x0075 +#define RT5668_I2S1_F_DIV_CTRL_2 0x0076 +/* Format - TDM Control */ +#define RT5668_TDM_CTRL 0x0079 +#define RT5668_TDM_ADDA_CTRL_1 0x007a +#define RT5668_TDM_ADDA_CTRL_2 0x007b +#define RT5668_DATA_SEL_CTRL_1 0x007c +#define RT5668_TDM_TCON_CTRL 0x007e +/* Function - Analog */ +#define RT5668_GLB_CLK 0x0080 +#define RT5668_PLL_CTRL_1 0x0081 +#define RT5668_PLL_CTRL_2 0x0082 +#define RT5668_PLL_TRACK_1 0x0083 +#define RT5668_PLL_TRACK_2 0x0084 +#define RT5668_PLL_TRACK_3 0x0085 +#define RT5668_PLL_TRACK_4 0x0086 +#define RT5668_PLL_TRACK_5 0x0087 +#define RT5668_PLL_TRACK_6 0x0088 +#define RT5668_PLL_TRACK_11 0x008c +#define RT5668_SDW_REF_CLK 0x008d +#define RT5668_DEPOP_1 0x008e +#define RT5668_DEPOP_2 0x008f +#define RT5668_HP_CHARGE_PUMP_1 0x0091 +#define RT5668_HP_CHARGE_PUMP_2 0x0092 +#define RT5668_MICBIAS_1 0x0093 +#define RT5668_MICBIAS_2 0x0094 +#define RT5668_PLL_TRACK_12 0x0098 +#define RT5668_PLL_TRACK_14 0x009a +#define RT5668_PLL2_CTRL_1 0x009b +#define RT5668_PLL2_CTRL_2 0x009c +#define RT5668_PLL2_CTRL_3 0x009d +#define RT5668_PLL2_CTRL_4 0x009e +#define RT5668_RC_CLK_CTRL 0x009f +#define RT5668_I2S_M_CLK_CTRL_1 0x00a0 +#define RT5668_I2S2_F_DIV_CTRL_1 0x00a3 +#define RT5668_I2S2_F_DIV_CTRL_2 0x00a4 +/* Function - Digital */ +#define RT5668_EQ_CTRL_1 0x00ae +#define RT5668_EQ_CTRL_2 0x00af +#define RT5668_IRQ_CTRL_1 0x00b6 +#define RT5668_IRQ_CTRL_2 0x00b7 +#define RT5668_IRQ_CTRL_3 0x00b8 +#define RT5668_IRQ_CTRL_4 0x00b9 +#define RT5668_INT_ST_1 0x00be +#define RT5668_GPIO_CTRL_1 0x00c0 +#define RT5668_GPIO_CTRL_2 0x00c1 +#define RT5668_GPIO_CTRL_3 0x00c2 +#define 
RT5668_HP_AMP_DET_CTRL_1 0x00d0 +#define RT5668_HP_AMP_DET_CTRL_2 0x00d1 +#define RT5668_MID_HP_AMP_DET 0x00d2 +#define RT5668_LOW_HP_AMP_DET 0x00d3 +#define RT5668_DELAY_BUF_CTRL 0x00d4 +#define RT5668_SV_ZCD_1 0x00d9 +#define RT5668_SV_ZCD_2 0x00da +#define RT5668_IL_CMD_1 0x00db +#define RT5668_IL_CMD_2 0x00dc +#define RT5668_IL_CMD_3 0x00dd +#define RT5668_IL_CMD_4 0x00de +#define RT5668_IL_CMD_5 0x00df +#define RT5668_IL_CMD_6 0x00e0 +#define RT5668_4BTN_IL_CMD_1 0x00e2 +#define RT5668_4BTN_IL_CMD_2 0x00e3 +#define RT5668_4BTN_IL_CMD_3 0x00e4 +#define RT5668_4BTN_IL_CMD_4 0x00e5 +#define RT5668_4BTN_IL_CMD_5 0x00e6 +#define RT5668_4BTN_IL_CMD_6 0x00e7 +#define RT5668_4BTN_IL_CMD_7 0x00e8 + +#define RT5668_ADC_STO1_HP_CTRL_1 0x00ea +#define RT5668_ADC_STO1_HP_CTRL_2 0x00eb +#define RT5668_AJD1_CTRL 0x00f0 +#define RT5668_JD1_THD 0x00f1 +#define RT5668_JD2_THD 0x00f2 +#define RT5668_JD_CTRL_1 0x00f6 +/* General Control */ +#define RT5668_DUMMY_1 0x00fa +#define RT5668_DUMMY_2 0x00fb +#define RT5668_DUMMY_3 0x00fc + +#define RT5668_DAC_ADC_DIG_VOL1 0x0100 +#define RT5668_BIAS_CUR_CTRL_2 0x010b +#define RT5668_BIAS_CUR_CTRL_3 0x010c +#define RT5668_BIAS_CUR_CTRL_4 0x010d +#define RT5668_BIAS_CUR_CTRL_5 0x010e +#define RT5668_BIAS_CUR_CTRL_6 0x010f +#define RT5668_BIAS_CUR_CTRL_7 0x0110 +#define RT5668_BIAS_CUR_CTRL_8 0x0111 +#define RT5668_BIAS_CUR_CTRL_9 0x0112 +#define RT5668_BIAS_CUR_CTRL_10 0x0113 +#define RT5668_VREF_REC_OP_FB_CAP_CTRL 0x0117 +#define RT5668_CHARGE_PUMP_1 0x0125 +#define RT5668_DIG_IN_CTRL_1 0x0132 +#define RT5668_PAD_DRIVING_CTRL 0x0136 +#define RT5668_SOFT_RAMP_DEPOP 0x0138 +#define RT5668_CHOP_DAC 0x013a +#define RT5668_CHOP_ADC 0x013b +#define RT5668_CALIB_ADC_CTRL 0x013c +#define RT5668_VOL_TEST 0x013f +#define RT5668_SPKVDD_DET_STA 0x0142 +#define RT5668_TEST_MODE_CTRL_1 0x0145 +#define RT5668_TEST_MODE_CTRL_2 0x0146 +#define RT5668_TEST_MODE_CTRL_3 0x0147 +#define RT5668_TEST_MODE_CTRL_4 0x0148 +#define RT5668_TEST_MODE_CTRL_5 0x0149 +#define RT5668_PLL1_INTERNAL 0x0150 +#define RT5668_PLL2_INTERNAL 0x0151 +#define RT5668_STO_NG2_CTRL_1 0x0160 +#define RT5668_STO_NG2_CTRL_2 0x0161 +#define RT5668_STO_NG2_CTRL_3 0x0162 +#define RT5668_STO_NG2_CTRL_4 0x0163 +#define RT5668_STO_NG2_CTRL_5 0x0164 +#define RT5668_STO_NG2_CTRL_6 0x0165 +#define RT5668_STO_NG2_CTRL_7 0x0166 +#define RT5668_STO_NG2_CTRL_8 0x0167 +#define RT5668_STO_NG2_CTRL_9 0x0168 +#define RT5668_STO_NG2_CTRL_10 0x0169 +#define RT5668_STO1_DAC_SIL_DET 0x0190 +#define RT5668_SIL_PSV_CTRL1 0x0194 +#define RT5668_SIL_PSV_CTRL2 0x0195 +#define RT5668_SIL_PSV_CTRL3 0x0197 +#define RT5668_SIL_PSV_CTRL4 0x0198 +#define RT5668_SIL_PSV_CTRL5 0x0199 +#define RT5668_HP_IMP_SENS_CTRL_01 0x01af +#define RT5668_HP_IMP_SENS_CTRL_02 0x01b0 +#define RT5668_HP_IMP_SENS_CTRL_03 0x01b1 +#define RT5668_HP_IMP_SENS_CTRL_04 0x01b2 +#define RT5668_HP_IMP_SENS_CTRL_05 0x01b3 +#define RT5668_HP_IMP_SENS_CTRL_06 0x01b4 +#define RT5668_HP_IMP_SENS_CTRL_07 0x01b5 +#define RT5668_HP_IMP_SENS_CTRL_08 0x01b6 +#define RT5668_HP_IMP_SENS_CTRL_09 0x01b7 +#define RT5668_HP_IMP_SENS_CTRL_10 0x01b8 +#define RT5668_HP_IMP_SENS_CTRL_11 0x01b9 +#define RT5668_HP_IMP_SENS_CTRL_12 0x01ba +#define RT5668_HP_IMP_SENS_CTRL_13 0x01bb +#define RT5668_HP_IMP_SENS_CTRL_14 0x01bc +#define RT5668_HP_IMP_SENS_CTRL_15 0x01bd +#define RT5668_HP_IMP_SENS_CTRL_16 0x01be +#define RT5668_HP_IMP_SENS_CTRL_17 0x01bf +#define RT5668_HP_IMP_SENS_CTRL_18 0x01c0 +#define RT5668_HP_IMP_SENS_CTRL_19 0x01c1 +#define RT5668_HP_IMP_SENS_CTRL_20 0x01c2 +#define 
RT5668_HP_IMP_SENS_CTRL_21 0x01c3 +#define RT5668_HP_IMP_SENS_CTRL_22 0x01c4 +#define RT5668_HP_IMP_SENS_CTRL_23 0x01c5 +#define RT5668_HP_IMP_SENS_CTRL_24 0x01c6 +#define RT5668_HP_IMP_SENS_CTRL_25 0x01c7 +#define RT5668_HP_IMP_SENS_CTRL_26 0x01c8 +#define RT5668_HP_IMP_SENS_CTRL_27 0x01c9 +#define RT5668_HP_IMP_SENS_CTRL_28 0x01ca +#define RT5668_HP_IMP_SENS_CTRL_29 0x01cb +#define RT5668_HP_IMP_SENS_CTRL_30 0x01cc +#define RT5668_HP_IMP_SENS_CTRL_31 0x01cd +#define RT5668_HP_IMP_SENS_CTRL_32 0x01ce +#define RT5668_HP_IMP_SENS_CTRL_33 0x01cf +#define RT5668_HP_IMP_SENS_CTRL_34 0x01d0 +#define RT5668_HP_IMP_SENS_CTRL_35 0x01d1 +#define RT5668_HP_IMP_SENS_CTRL_36 0x01d2 +#define RT5668_HP_IMP_SENS_CTRL_37 0x01d3 +#define RT5668_HP_IMP_SENS_CTRL_38 0x01d4 +#define RT5668_HP_IMP_SENS_CTRL_39 0x01d5 +#define RT5668_HP_IMP_SENS_CTRL_40 0x01d6 +#define RT5668_HP_IMP_SENS_CTRL_41 0x01d7 +#define RT5668_HP_IMP_SENS_CTRL_42 0x01d8 +#define RT5668_HP_IMP_SENS_CTRL_43 0x01d9 +#define RT5668_HP_LOGIC_CTRL_1 0x01da +#define RT5668_HP_LOGIC_CTRL_2 0x01db +#define RT5668_HP_LOGIC_CTRL_3 0x01dc +#define RT5668_HP_CALIB_CTRL_1 0x01de +#define RT5668_HP_CALIB_CTRL_2 0x01df +#define RT5668_HP_CALIB_CTRL_3 0x01e0 +#define RT5668_HP_CALIB_CTRL_4 0x01e1 +#define RT5668_HP_CALIB_CTRL_5 0x01e2 +#define RT5668_HP_CALIB_CTRL_6 0x01e3 +#define RT5668_HP_CALIB_CTRL_7 0x01e4 +#define RT5668_HP_CALIB_CTRL_9 0x01e6 +#define RT5668_HP_CALIB_CTRL_10 0x01e7 +#define RT5668_HP_CALIB_CTRL_11 0x01e8 +#define RT5668_HP_CALIB_STA_1 0x01ea +#define RT5668_HP_CALIB_STA_2 0x01eb +#define RT5668_HP_CALIB_STA_3 0x01ec +#define RT5668_HP_CALIB_STA_4 0x01ed +#define RT5668_HP_CALIB_STA_5 0x01ee +#define RT5668_HP_CALIB_STA_6 0x01ef +#define RT5668_HP_CALIB_STA_7 0x01f0 +#define RT5668_HP_CALIB_STA_8 0x01f1 +#define RT5668_HP_CALIB_STA_9 0x01f2 +#define RT5668_HP_CALIB_STA_10 0x01f3 +#define RT5668_HP_CALIB_STA_11 0x01f4 +#define RT5668_SAR_IL_CMD_1 0x0210 +#define RT5668_SAR_IL_CMD_2 0x0211 +#define RT5668_SAR_IL_CMD_3 0x0212 +#define RT5668_SAR_IL_CMD_4 0x0213 +#define RT5668_SAR_IL_CMD_5 0x0214 +#define RT5668_SAR_IL_CMD_6 0x0215 +#define RT5668_SAR_IL_CMD_7 0x0216 +#define RT5668_SAR_IL_CMD_8 0x0217 +#define RT5668_SAR_IL_CMD_9 0x0218 +#define RT5668_SAR_IL_CMD_10 0x0219 +#define RT5668_SAR_IL_CMD_11 0x021a +#define RT5668_SAR_IL_CMD_12 0x021b +#define RT5668_SAR_IL_CMD_13 0x021c +#define RT5668_EFUSE_CTRL_1 0x0250 +#define RT5668_EFUSE_CTRL_2 0x0251 +#define RT5668_EFUSE_CTRL_3 0x0252 +#define RT5668_EFUSE_CTRL_4 0x0253 +#define RT5668_EFUSE_CTRL_5 0x0254 +#define RT5668_EFUSE_CTRL_6 0x0255 +#define RT5668_EFUSE_CTRL_7 0x0256 +#define RT5668_EFUSE_CTRL_8 0x0257 +#define RT5668_EFUSE_CTRL_9 0x0258 +#define RT5668_EFUSE_CTRL_10 0x0259 +#define RT5668_EFUSE_CTRL_11 0x025a +#define RT5668_JD_TOP_VC_VTRL 0x0270 +#define RT5668_DRC1_CTRL_0 0x02ff +#define RT5668_DRC1_CTRL_1 0x0300 +#define RT5668_DRC1_CTRL_2 0x0301 +#define RT5668_DRC1_CTRL_3 0x0302 +#define RT5668_DRC1_CTRL_4 0x0303 +#define RT5668_DRC1_CTRL_5 0x0304 +#define RT5668_DRC1_CTRL_6 0x0305 +#define RT5668_DRC1_HARD_LMT_CTRL_1 0x0306 +#define RT5668_DRC1_HARD_LMT_CTRL_2 0x0307 +#define RT5668_DRC1_PRIV_1 0x0310 +#define RT5668_DRC1_PRIV_2 0x0311 +#define RT5668_DRC1_PRIV_3 0x0312 +#define RT5668_DRC1_PRIV_4 0x0313 +#define RT5668_DRC1_PRIV_5 0x0314 +#define RT5668_DRC1_PRIV_6 0x0315 +#define RT5668_DRC1_PRIV_7 0x0316 +#define RT5668_DRC1_PRIV_8 0x0317 +#define RT5668_EQ_AUTO_RCV_CTRL1 0x03c0 +#define RT5668_EQ_AUTO_RCV_CTRL2 0x03c1 +#define RT5668_EQ_AUTO_RCV_CTRL3 0x03c2 
+#define RT5668_EQ_AUTO_RCV_CTRL4 0x03c3 +#define RT5668_EQ_AUTO_RCV_CTRL5 0x03c4 +#define RT5668_EQ_AUTO_RCV_CTRL6 0x03c5 +#define RT5668_EQ_AUTO_RCV_CTRL7 0x03c6 +#define RT5668_EQ_AUTO_RCV_CTRL8 0x03c7 +#define RT5668_EQ_AUTO_RCV_CTRL9 0x03c8 +#define RT5668_EQ_AUTO_RCV_CTRL10 0x03c9 +#define RT5668_EQ_AUTO_RCV_CTRL11 0x03ca +#define RT5668_EQ_AUTO_RCV_CTRL12 0x03cb +#define RT5668_EQ_AUTO_RCV_CTRL13 0x03cc +#define RT5668_ADC_L_EQ_LPF1_A1 0x03d0 +#define RT5668_R_EQ_LPF1_A1 0x03d1 +#define RT5668_L_EQ_LPF1_H0 0x03d2 +#define RT5668_R_EQ_LPF1_H0 0x03d3 +#define RT5668_L_EQ_BPF1_A1 0x03d4 +#define RT5668_R_EQ_BPF1_A1 0x03d5 +#define RT5668_L_EQ_BPF1_A2 0x03d6 +#define RT5668_R_EQ_BPF1_A2 0x03d7 +#define RT5668_L_EQ_BPF1_H0 0x03d8 +#define RT5668_R_EQ_BPF1_H0 0x03d9 +#define RT5668_L_EQ_BPF2_A1 0x03da +#define RT5668_R_EQ_BPF2_A1 0x03db +#define RT5668_L_EQ_BPF2_A2 0x03dc +#define RT5668_R_EQ_BPF2_A2 0x03dd +#define RT5668_L_EQ_BPF2_H0 0x03de +#define RT5668_R_EQ_BPF2_H0 0x03df +#define RT5668_L_EQ_BPF3_A1 0x03e0 +#define RT5668_R_EQ_BPF3_A1 0x03e1 +#define RT5668_L_EQ_BPF3_A2 0x03e2 +#define RT5668_R_EQ_BPF3_A2 0x03e3 +#define RT5668_L_EQ_BPF3_H0 0x03e4 +#define RT5668_R_EQ_BPF3_H0 0x03e5 +#define RT5668_L_EQ_BPF4_A1 0x03e6 +#define RT5668_R_EQ_BPF4_A1 0x03e7 +#define RT5668_L_EQ_BPF4_A2 0x03e8 +#define RT5668_R_EQ_BPF4_A2 0x03e9 +#define RT5668_L_EQ_BPF4_H0 0x03ea +#define RT5668_R_EQ_BPF4_H0 0x03eb +#define RT5668_L_EQ_HPF1_A1 0x03ec +#define RT5668_R_EQ_HPF1_A1 0x03ed +#define RT5668_L_EQ_HPF1_H0 0x03ee +#define RT5668_R_EQ_HPF1_H0 0x03ef +#define RT5668_L_EQ_PRE_VOL 0x03f0 +#define RT5668_R_EQ_PRE_VOL 0x03f1 +#define RT5668_L_EQ_POST_VOL 0x03f2 +#define RT5668_R_EQ_POST_VOL 0x03f3 +#define RT5668_I2C_MODE 0xffff + + +/* global definition */ +#define RT5668_L_MUTE (0x1 << 15) +#define RT5668_L_MUTE_SFT 15 +#define RT5668_VOL_L_MUTE (0x1 << 14) +#define RT5668_VOL_L_SFT 14 +#define RT5668_R_MUTE (0x1 << 7) +#define RT5668_R_MUTE_SFT 7 +#define RT5668_VOL_R_MUTE (0x1 << 6) +#define RT5668_VOL_R_SFT 6 +#define RT5668_L_VOL_MASK (0x3f << 8) +#define RT5668_L_VOL_SFT 8 +#define RT5668_R_VOL_MASK (0x3f) +#define RT5668_R_VOL_SFT 0 + +/* Headphone Amp L/R Analog Gain and Digital NG2 Gain Control (0x0005 0x0006) */ +#define RT5668_G_HP (0xf << 8) +#define RT5668_G_HP_SFT 8 +#define RT5668_G_STO_DA_DMIX (0xf) +#define RT5668_G_STO_DA_SFT 0 + +/* CBJ Control (0x000b) */ +#define RT5668_BST_CBJ_MASK (0xf << 8) +#define RT5668_BST_CBJ_SFT 8 + +/* Embedded Jack and Type Detection Control 1 (0x0010) */ +#define RT5668_EMB_JD_EN (0x1 << 15) +#define RT5668_EMB_JD_EN_SFT 15 +#define RT5668_EMB_JD_RST (0x1 << 14) +#define RT5668_JD_MODE (0x1 << 13) +#define RT5668_JD_MODE_SFT 13 +#define RT5668_DET_TYPE (0x1 << 12) +#define RT5668_DET_TYPE_SFT 12 +#define RT5668_POLA_EXT_JD_MASK (0x1 << 11) +#define RT5668_POLA_EXT_JD_LOW (0x1 << 11) +#define RT5668_POLA_EXT_JD_HIGH (0x0 << 11) +#define RT5668_EXT_JD_DIG (0x1 << 9) +#define RT5668_POL_FAST_OFF_MASK (0x1 << 8) +#define RT5668_POL_FAST_OFF_HIGH (0x1 << 8) +#define RT5668_POL_FAST_OFF_LOW (0x0 << 8) +#define RT5668_FAST_OFF_MASK (0x1 << 7) +#define RT5668_FAST_OFF_EN (0x1 << 7) +#define RT5668_FAST_OFF_DIS (0x0 << 7) +#define RT5668_VREF_POW_MASK (0x1 << 6) +#define RT5668_VREF_POW_FSM (0x0 << 6) +#define RT5668_VREF_POW_REG (0x1 << 6) +#define RT5668_MB1_PATH_MASK (0x1 << 5) +#define RT5668_CTRL_MB1_REG (0x1 << 5) +#define RT5668_CTRL_MB1_FSM (0x0 << 5) +#define RT5668_MB2_PATH_MASK (0x1 << 4) +#define RT5668_CTRL_MB2_REG (0x1 << 4) +#define 
RT5668_CTRL_MB2_FSM (0x0 << 4) +#define RT5668_TRIG_JD_MASK (0x1 << 3) +#define RT5668_TRIG_JD_HIGH (0x1 << 3) +#define RT5668_TRIG_JD_LOW (0x0 << 3) +#define RT5668_MIC_CAP_MASK (0x1 << 1) +#define RT5668_MIC_CAP_HS (0x1 << 1) +#define RT5668_MIC_CAP_HP (0x0 << 1) +#define RT5668_MIC_CAP_SRC_MASK (0x1) +#define RT5668_MIC_CAP_SRC_REG (0x1) +#define RT5668_MIC_CAP_SRC_ANA (0x0) + +/* Embedded Jack and Type Detection Control 2 (0x0011) */ +#define RT5668_EXT_JD_SRC (0x7 << 4) +#define RT5668_EXT_JD_SRC_SFT 4 +#define RT5668_EXT_JD_SRC_GPIO_JD1 (0x0 << 4) +#define RT5668_EXT_JD_SRC_GPIO_JD2 (0x1 << 4) +#define RT5668_EXT_JD_SRC_JDH (0x2 << 4) +#define RT5668_EXT_JD_SRC_JDL (0x3 << 4) +#define RT5668_EXT_JD_SRC_MANUAL (0x4 << 4) +#define RT5668_JACK_TYPE_MASK (0x3) + +/* Combo Jack and Type Detection Control 3 (0x0012) */ +#define RT5668_CBJ_IN_BUF_EN (0x1 << 7) + +/* Combo Jack and Type Detection Control 4 (0x0013) */ +#define RT5668_SEL_SHT_MID_TON_MASK (0x3 << 12) +#define RT5668_SEL_SHT_MID_TON_2 (0x0 << 12) +#define RT5668_SEL_SHT_MID_TON_3 (0x1 << 12) +#define RT5668_CBJ_JD_TEST_MASK (0x1 << 6) +#define RT5668_CBJ_JD_TEST_NORM (0x0 << 6) +#define RT5668_CBJ_JD_TEST_MODE (0x1 << 6) + +/* DAC1 Digital Volume (0x0019) */ +#define RT5668_DAC_L1_VOL_MASK (0xff << 8) +#define RT5668_DAC_L1_VOL_SFT 8 +#define RT5668_DAC_R1_VOL_MASK (0xff) +#define RT5668_DAC_R1_VOL_SFT 0 + +/* ADC Digital Volume Control (0x001c) */ +#define RT5668_ADC_L_VOL_MASK (0x7f << 8) +#define RT5668_ADC_L_VOL_SFT 8 +#define RT5668_ADC_R_VOL_MASK (0x7f) +#define RT5668_ADC_R_VOL_SFT 0 + +/* Stereo1 ADC Boost Gain Control (0x001f) */ +#define RT5668_STO1_ADC_L_BST_MASK (0x3 << 14) +#define RT5668_STO1_ADC_L_BST_SFT 14 +#define RT5668_STO1_ADC_R_BST_MASK (0x3 << 12) +#define RT5668_STO1_ADC_R_BST_SFT 12 + +/* Sidetone Control (0x0024) */ +#define RT5668_ST_SRC_SEL (0x1 << 8) +#define RT5668_ST_SRC_SFT 8 +#define RT5668_ST_EN_MASK (0x1 << 6) +#define RT5668_ST_DIS (0x0 << 6) +#define RT5668_ST_EN (0x1 << 6) +#define RT5668_ST_EN_SFT 6 + +/* Stereo1 ADC Mixer Control (0x0026) */ +#define RT5668_M_STO1_ADC_L1 (0x1 << 15) +#define RT5668_M_STO1_ADC_L1_SFT 15 +#define RT5668_M_STO1_ADC_L2 (0x1 << 14) +#define RT5668_M_STO1_ADC_L2_SFT 14 +#define RT5668_STO1_ADC1L_SRC_MASK (0x1 << 13) +#define RT5668_STO1_ADC1L_SRC_SFT 13 +#define RT5668_STO1_ADC1_SRC_ADC (0x1 << 13) +#define RT5668_STO1_ADC1_SRC_DACMIX (0x0 << 13) +#define RT5668_STO1_ADC2L_SRC_MASK (0x1 << 12) +#define RT5668_STO1_ADC2L_SRC_SFT 12 +#define RT5668_STO1_ADCL_SRC_MASK (0x3 << 10) +#define RT5668_STO1_ADCL_SRC_SFT 10 +#define RT5668_STO1_DD_L_SRC_MASK (0x1 << 9) +#define RT5668_STO1_DD_L_SRC_SFT 9 +#define RT5668_STO1_DMIC_SRC_MASK (0x1 << 8) +#define RT5668_STO1_DMIC_SRC_SFT 8 +#define RT5668_STO1_DMIC_SRC_DMIC2 (0x1 << 8) +#define RT5668_STO1_DMIC_SRC_DMIC1 (0x0 << 8) +#define RT5668_M_STO1_ADC_R1 (0x1 << 7) +#define RT5668_M_STO1_ADC_R1_SFT 7 +#define RT5668_M_STO1_ADC_R2 (0x1 << 6) +#define RT5668_M_STO1_ADC_R2_SFT 6 +#define RT5668_STO1_ADC1R_SRC_MASK (0x1 << 5) +#define RT5668_STO1_ADC1R_SRC_SFT 5 +#define RT5668_STO1_ADC2R_SRC_MASK (0x1 << 4) +#define RT5668_STO1_ADC2R_SRC_SFT 4 +#define RT5668_STO1_ADCR_SRC_MASK (0x3 << 2) +#define RT5668_STO1_ADCR_SRC_SFT 2 + +/* ADC Mixer to DAC Mixer Control (0x0029) */ +#define RT5668_M_ADCMIX_L (0x1 << 15) +#define RT5668_M_ADCMIX_L_SFT 15 +#define RT5668_M_DAC1_L (0x1 << 14) +#define RT5668_M_DAC1_L_SFT 14 +#define RT5668_DAC1_R_SEL_MASK (0x1 << 10) +#define RT5668_DAC1_R_SEL_SFT 10 +#define RT5668_DAC1_L_SEL_MASK 
(0x1 << 8) +#define RT5668_DAC1_L_SEL_SFT 8 +#define RT5668_M_ADCMIX_R (0x1 << 7) +#define RT5668_M_ADCMIX_R_SFT 7 +#define RT5668_M_DAC1_R (0x1 << 6) +#define RT5668_M_DAC1_R_SFT 6 + +/* Stereo1 DAC Mixer Control (0x002a) */ +#define RT5668_M_DAC_L1_STO_L (0x1 << 15) +#define RT5668_M_DAC_L1_STO_L_SFT 15 +#define RT5668_G_DAC_L1_STO_L_MASK (0x1 << 14) +#define RT5668_G_DAC_L1_STO_L_SFT 14 +#define RT5668_M_DAC_R1_STO_L (0x1 << 13) +#define RT5668_M_DAC_R1_STO_L_SFT 13 +#define RT5668_G_DAC_R1_STO_L_MASK (0x1 << 12) +#define RT5668_G_DAC_R1_STO_L_SFT 12 +#define RT5668_M_DAC_L1_STO_R (0x1 << 7) +#define RT5668_M_DAC_L1_STO_R_SFT 7 +#define RT5668_G_DAC_L1_STO_R_MASK (0x1 << 6) +#define RT5668_G_DAC_L1_STO_R_SFT 6 +#define RT5668_M_DAC_R1_STO_R (0x1 << 5) +#define RT5668_M_DAC_R1_STO_R_SFT 5 +#define RT5668_G_DAC_R1_STO_R_MASK (0x1 << 4) +#define RT5668_G_DAC_R1_STO_R_SFT 4 + +/* Analog DAC1 Input Source Control (0x002b) */ +#define RT5668_M_ST_STO_L (0x1 << 9) +#define RT5668_M_ST_STO_L_SFT 9 +#define RT5668_M_ST_STO_R (0x1 << 8) +#define RT5668_M_ST_STO_R_SFT 8 +#define RT5668_DAC_L1_SRC_MASK (0x3 << 4) +#define RT5668_A_DACL1_SFT 4 +#define RT5668_DAC_R1_SRC_MASK (0x3) +#define RT5668_A_DACR1_SFT 0 + +/* Digital Interface Data Control (0x0030) */ +#define RT5668_IF2_ADC_SEL_MASK (0x3 << 0) +#define RT5668_IF2_ADC_SEL_SFT 0 + +/* REC Left Mixer Control 2 (0x003c) */ +#define RT5668_G_CBJ_RM1_L (0x7 << 10) +#define RT5668_G_CBJ_RM1_L_SFT 10 +#define RT5668_M_CBJ_RM1_L (0x1 << 7) +#define RT5668_M_CBJ_RM1_L_SFT 7 + +/* Power Management for Digital 1 (0x0061) */ +#define RT5668_PWR_I2S1 (0x1 << 15) +#define RT5668_PWR_I2S1_BIT 15 +#define RT5668_PWR_I2S2 (0x1 << 14) +#define RT5668_PWR_I2S2_BIT 14 +#define RT5668_PWR_DAC_L1 (0x1 << 11) +#define RT5668_PWR_DAC_L1_BIT 11 +#define RT5668_PWR_DAC_R1 (0x1 << 10) +#define RT5668_PWR_DAC_R1_BIT 10 +#define RT5668_PWR_LDO (0x1 << 8) +#define RT5668_PWR_LDO_BIT 8 +#define RT5668_PWR_ADC_L1 (0x1 << 4) +#define RT5668_PWR_ADC_L1_BIT 4 +#define RT5668_PWR_ADC_R1 (0x1 << 3) +#define RT5668_PWR_ADC_R1_BIT 3 +#define RT5668_DIG_GATE_CTRL (0x1 << 0) +#define RT5668_DIG_GATE_CTRL_SFT 0 + + +/* Power Management for Digital 2 (0x0062) */ +#define RT5668_PWR_ADC_S1F (0x1 << 15) +#define RT5668_PWR_ADC_S1F_BIT 15 +#define RT5668_PWR_DAC_S1F (0x1 << 10) +#define RT5668_PWR_DAC_S1F_BIT 10 + +/* Power Management for Analog 1 (0x0063) */ +#define RT5668_PWR_VREF1 (0x1 << 15) +#define RT5668_PWR_VREF1_BIT 15 +#define RT5668_PWR_FV1 (0x1 << 14) +#define RT5668_PWR_FV1_BIT 14 +#define RT5668_PWR_VREF2 (0x1 << 13) +#define RT5668_PWR_VREF2_BIT 13 +#define RT5668_PWR_FV2 (0x1 << 12) +#define RT5668_PWR_FV2_BIT 12 +#define RT5668_LDO1_DBG_MASK (0x3 << 10) +#define RT5668_PWR_MB (0x1 << 9) +#define RT5668_PWR_MB_BIT 9 +#define RT5668_PWR_BG (0x1 << 7) +#define RT5668_PWR_BG_BIT 7 +#define RT5668_LDO1_BYPASS_MASK (0x1 << 6) +#define RT5668_LDO1_BYPASS (0x1 << 6) +#define RT5668_LDO1_NOT_BYPASS (0x0 << 6) +#define RT5668_PWR_MA_BIT 6 +#define RT5668_LDO1_DVO_MASK (0x3 << 4) +#define RT5668_LDO1_DVO_09 (0x0 << 4) +#define RT5668_LDO1_DVO_10 (0x1 << 4) +#define RT5668_LDO1_DVO_12 (0x2 << 4) +#define RT5668_LDO1_DVO_14 (0x3 << 4) +#define RT5668_HP_DRIVER_MASK (0x3 << 2) +#define RT5668_HP_DRIVER_1X (0x0 << 2) +#define RT5668_HP_DRIVER_3X (0x1 << 2) +#define RT5668_HP_DRIVER_5X (0x3 << 2) +#define RT5668_PWR_HA_L (0x1 << 1) +#define RT5668_PWR_HA_L_BIT 1 +#define RT5668_PWR_HA_R (0x1 << 0) +#define RT5668_PWR_HA_R_BIT 0 + +/* Power Management for Analog 2 (0x0064) */ +#define 
RT5668_PWR_MB1 (0x1 << 11) +#define RT5668_PWR_MB1_PWR_DOWN (0x0 << 11) +#define RT5668_PWR_MB1_BIT 11 +#define RT5668_PWR_MB2 (0x1 << 10) +#define RT5668_PWR_MB2_PWR_DOWN (0x0 << 10) +#define RT5668_PWR_MB2_BIT 10 +#define RT5668_PWR_JDH (0x1 << 3) +#define RT5668_PWR_JDH_BIT 3 +#define RT5668_PWR_JDL (0x1 << 2) +#define RT5668_PWR_JDL_BIT 2 +#define RT5668_PWR_RM1_L (0x1 << 1) +#define RT5668_PWR_RM1_L_BIT 1 + +/* Power Management for Analog 3 (0x0065) */ +#define RT5668_PWR_CBJ (0x1 << 9) +#define RT5668_PWR_CBJ_BIT 9 +#define RT5668_PWR_PLL (0x1 << 6) +#define RT5668_PWR_PLL_BIT 6 +#define RT5668_PWR_PLL2B (0x1 << 5) +#define RT5668_PWR_PLL2B_BIT 5 +#define RT5668_PWR_PLL2F (0x1 << 4) +#define RT5668_PWR_PLL2F_BIT 4 +#define RT5668_PWR_LDO2 (0x1 << 2) +#define RT5668_PWR_LDO2_BIT 2 +#define RT5668_PWR_DET_SPKVDD (0x1 << 1) +#define RT5668_PWR_DET_SPKVDD_BIT 1 + +/* Power Management for Mixer (0x0066) */ +#define RT5668_PWR_STO1_DAC_L (0x1 << 5) +#define RT5668_PWR_STO1_DAC_L_BIT 5 +#define RT5668_PWR_STO1_DAC_R (0x1 << 4) +#define RT5668_PWR_STO1_DAC_R_BIT 4 + +/* MCLK and System Clock Detection Control (0x006b) */ +#define RT5668_SYS_CLK_DET (0x1 << 15) +#define RT5668_SYS_CLK_DET_SFT 15 +#define RT5668_PLL1_CLK_DET (0x1 << 14) +#define RT5668_PLL1_CLK_DET_SFT 14 +#define RT5668_PLL2_CLK_DET (0x1 << 13) +#define RT5668_PLL2_CLK_DET_SFT 13 +#define RT5668_POW_CLK_DET2_SFT 8 +#define RT5668_POW_CLK_DET_SFT 0 + +/* Digital Microphone Control 1 (0x006e) */ +#define RT5668_DMIC_1_EN_MASK (0x1 << 15) +#define RT5668_DMIC_1_EN_SFT 15 +#define RT5668_DMIC_1_DIS (0x0 << 15) +#define RT5668_DMIC_1_EN (0x1 << 15) +#define RT5668_DMIC_1_DP_MASK (0x3 << 4) +#define RT5668_DMIC_1_DP_SFT 4 +#define RT5668_DMIC_1_DP_GPIO2 (0x0 << 4) +#define RT5668_DMIC_1_DP_GPIO5 (0x1 << 4) +#define RT5668_DMIC_CLK_MASK (0xf << 0) +#define RT5668_DMIC_CLK_SFT 0 + +/* I2S1 Audio Serial Data Port Control (0x0070) */ +#define RT5668_SEL_ADCDAT_MASK (0x1 << 15) +#define RT5668_SEL_ADCDAT_OUT (0x0 << 15) +#define RT5668_SEL_ADCDAT_IN (0x1 << 15) +#define RT5668_SEL_ADCDAT_SFT 15 +#define RT5668_I2S1_TX_CHL_MASK (0x7 << 12) +#define RT5668_I2S1_TX_CHL_SFT 12 +#define RT5668_I2S1_TX_CHL_16 (0x0 << 12) +#define RT5668_I2S1_TX_CHL_20 (0x1 << 12) +#define RT5668_I2S1_TX_CHL_24 (0x2 << 12) +#define RT5668_I2S1_TX_CHL_32 (0x3 << 12) +#define RT5668_I2S1_TX_CHL_8 (0x4 << 12) +#define RT5668_I2S1_RX_CHL_MASK (0x7 << 8) +#define RT5668_I2S1_RX_CHL_SFT 8 +#define RT5668_I2S1_RX_CHL_16 (0x0 << 8) +#define RT5668_I2S1_RX_CHL_20 (0x1 << 8) +#define RT5668_I2S1_RX_CHL_24 (0x2 << 8) +#define RT5668_I2S1_RX_CHL_32 (0x3 << 8) +#define RT5668_I2S1_RX_CHL_8 (0x4 << 8) +#define RT5668_I2S1_MONO_MASK (0x1 << 7) +#define RT5668_I2S1_MONO_EN (0x1 << 7) +#define RT5668_I2S1_MONO_DIS (0x0 << 7) +#define RT5668_I2S2_MONO_MASK (0x1 << 6) +#define RT5668_I2S2_MONO_EN (0x1 << 6) +#define RT5668_I2S2_MONO_DIS (0x0 << 6) +#define RT5668_I2S1_DL_MASK (0x7 << 4) +#define RT5668_I2S1_DL_SFT 4 +#define RT5668_I2S1_DL_16 (0x0 << 4) +#define RT5668_I2S1_DL_20 (0x1 << 4) +#define RT5668_I2S1_DL_24 (0x2 << 4) +#define RT5668_I2S1_DL_32 (0x3 << 4) +#define RT5668_I2S1_DL_8 (0x4 << 4) + +/* I2S1/2 Audio Serial Data Port Control (0x0070)(0x0071) */ +#define RT5668_I2S2_MS_MASK (0x1 << 15) +#define RT5668_I2S2_MS_SFT 15 +#define RT5668_I2S2_MS_M (0x0 << 15) +#define RT5668_I2S2_MS_S (0x1 << 15) +#define RT5668_I2S2_PIN_CFG_MASK (0x1 << 14) +#define RT5668_I2S2_PIN_CFG_SFT 14 +#define RT5668_I2S2_CLK_SEL_MASK (0x1 << 11) +#define RT5668_I2S2_CLK_SEL_SFT 11 +#define 
RT5668_I2S2_OUT_MASK (0x1 << 9) +#define RT5668_I2S2_OUT_SFT 9 +#define RT5668_I2S2_OUT_UM (0x0 << 9) +#define RT5668_I2S2_OUT_M (0x1 << 9) +#define RT5668_I2S_BP_MASK (0x1 << 8) +#define RT5668_I2S_BP_SFT 8 +#define RT5668_I2S_BP_NOR (0x0 << 8) +#define RT5668_I2S_BP_INV (0x1 << 8) +#define RT5668_I2S2_MONO_EN (0x1 << 6) +#define RT5668_I2S2_MONO_DIS (0x0 << 6) +#define RT5668_I2S2_DL_MASK (0x3 << 4) +#define RT5668_I2S2_DL_SFT 4 +#define RT5668_I2S2_DL_16 (0x0 << 4) +#define RT5668_I2S2_DL_20 (0x1 << 4) +#define RT5668_I2S2_DL_24 (0x2 << 4) +#define RT5668_I2S2_DL_8 (0x3 << 4) +#define RT5668_I2S_DF_MASK (0x7) +#define RT5668_I2S_DF_SFT 0 +#define RT5668_I2S_DF_I2S (0x0) +#define RT5668_I2S_DF_LEFT (0x1) +#define RT5668_I2S_DF_PCM_A (0x2) +#define RT5668_I2S_DF_PCM_B (0x3) +#define RT5668_I2S_DF_PCM_A_N (0x6) +#define RT5668_I2S_DF_PCM_B_N (0x7) + +/* ADC/DAC Clock Control 1 (0x0073) */ +#define RT5668_ADC_OSR_MASK (0xf << 12) +#define RT5668_ADC_OSR_SFT 12 +#define RT5668_ADC_OSR_D_1 (0x0 << 12) +#define RT5668_ADC_OSR_D_2 (0x1 << 12) +#define RT5668_ADC_OSR_D_4 (0x2 << 12) +#define RT5668_ADC_OSR_D_6 (0x3 << 12) +#define RT5668_ADC_OSR_D_8 (0x4 << 12) +#define RT5668_ADC_OSR_D_12 (0x5 << 12) +#define RT5668_ADC_OSR_D_16 (0x6 << 12) +#define RT5668_ADC_OSR_D_24 (0x7 << 12) +#define RT5668_ADC_OSR_D_32 (0x8 << 12) +#define RT5668_ADC_OSR_D_48 (0x9 << 12) +#define RT5668_I2S_M_DIV_MASK (0xf << 8) +#define RT5668_I2S_M_DIV_SFT 8 +#define RT5668_I2S_M_D_1 (0x0 << 8) +#define RT5668_I2S_M_D_2 (0x1 << 8) +#define RT5668_I2S_M_D_3 (0x2 << 8) +#define RT5668_I2S_M_D_4 (0x3 << 8) +#define RT5668_I2S_M_D_6 (0x4 << 8) +#define RT5668_I2S_M_D_8 (0x5 << 8) +#define RT5668_I2S_M_D_12 (0x6 << 8) +#define RT5668_I2S_M_D_16 (0x7 << 8) +#define RT5668_I2S_M_D_24 (0x8 << 8) +#define RT5668_I2S_M_D_32 (0x9 << 8) +#define RT5668_I2S_M_D_48 (0xa << 8) +#define RT5668_I2S_CLK_SRC_MASK (0x7 << 4) +#define RT5668_I2S_CLK_SRC_SFT 4 +#define RT5668_I2S_CLK_SRC_MCLK (0x0 << 4) +#define RT5668_I2S_CLK_SRC_PLL1 (0x1 << 4) +#define RT5668_I2S_CLK_SRC_PLL2 (0x2 << 4) +#define RT5668_I2S_CLK_SRC_SDW (0x3 << 4) +#define RT5668_I2S_CLK_SRC_RCCLK (0x4 << 4) /* 25M */ +#define RT5668_DAC_OSR_MASK (0xf << 0) +#define RT5668_DAC_OSR_SFT 0 +#define RT5668_DAC_OSR_D_1 (0x0 << 0) +#define RT5668_DAC_OSR_D_2 (0x1 << 0) +#define RT5668_DAC_OSR_D_4 (0x2 << 0) +#define RT5668_DAC_OSR_D_6 (0x3 << 0) +#define RT5668_DAC_OSR_D_8 (0x4 << 0) +#define RT5668_DAC_OSR_D_12 (0x5 << 0) +#define RT5668_DAC_OSR_D_16 (0x6 << 0) +#define RT5668_DAC_OSR_D_24 (0x7 << 0) +#define RT5668_DAC_OSR_D_32 (0x8 << 0) +#define RT5668_DAC_OSR_D_48 (0x9 << 0) + +/* ADC/DAC Clock Control 2 (0x0074) */ +#define RT5668_I2S2_BCLK_MS2_MASK (0x1 << 11) +#define RT5668_I2S2_BCLK_MS2_SFT 11 +#define RT5668_I2S2_BCLK_MS2_32 (0x0 << 11) +#define RT5668_I2S2_BCLK_MS2_64 (0x1 << 11) + + +/* TDM control 1 (0x0079) */ +#define RT5668_TDM_TX_CH_MASK (0x3 << 12) +#define RT5668_TDM_TX_CH_2 (0x0 << 12) +#define RT5668_TDM_TX_CH_4 (0x1 << 12) +#define RT5668_TDM_TX_CH_6 (0x2 << 12) +#define RT5668_TDM_TX_CH_8 (0x3 << 12) +#define RT5668_TDM_RX_CH_MASK (0x3 << 8) +#define RT5668_TDM_RX_CH_2 (0x0 << 8) +#define RT5668_TDM_RX_CH_4 (0x1 << 8) +#define RT5668_TDM_RX_CH_6 (0x2 << 8) +#define RT5668_TDM_RX_CH_8 (0x3 << 8) +#define RT5668_TDM_ADC_LCA_MASK (0xf << 4) +#define RT5668_TDM_ADC_LCA_SFT 4 +#define RT5668_TDM_ADC_DL_SFT 0 + +/* TDM control 3 (0x007a) */ +#define RT5668_IF1_ADC1_SEL_SFT 14 +#define RT5668_IF1_ADC2_SEL_SFT 12 +#define RT5668_IF1_ADC3_SEL_SFT 10 +#define 
RT5668_IF1_ADC4_SEL_SFT 8 +#define RT5668_TDM_ADC_SEL_SFT 4 + +/* TDM/I2S control (0x007e) */ +#define RT5668_TDM_S_BP_MASK (0x1 << 15) +#define RT5668_TDM_S_BP_SFT 15 +#define RT5668_TDM_S_BP_NOR (0x0 << 15) +#define RT5668_TDM_S_BP_INV (0x1 << 15) +#define RT5668_TDM_S_LP_MASK (0x1 << 14) +#define RT5668_TDM_S_LP_SFT 14 +#define RT5668_TDM_S_LP_NOR (0x0 << 14) +#define RT5668_TDM_S_LP_INV (0x1 << 14) +#define RT5668_TDM_DF_MASK (0x7 << 11) +#define RT5668_TDM_DF_SFT 11 +#define RT5668_TDM_DF_I2S (0x0 << 11) +#define RT5668_TDM_DF_LEFT (0x1 << 11) +#define RT5668_TDM_DF_PCM_A (0x2 << 11) +#define RT5668_TDM_DF_PCM_B (0x3 << 11) +#define RT5668_TDM_DF_PCM_A_N (0x6 << 11) +#define RT5668_TDM_DF_PCM_B_N (0x7 << 11) +#define RT5668_TDM_CL_MASK (0x3 << 4) +#define RT5668_TDM_CL_16 (0x0 << 4) +#define RT5668_TDM_CL_20 (0x1 << 4) +#define RT5668_TDM_CL_24 (0x2 << 4) +#define RT5668_TDM_CL_32 (0x3 << 4) +#define RT5668_TDM_M_BP_MASK (0x1 << 2) +#define RT5668_TDM_M_BP_SFT 2 +#define RT5668_TDM_M_BP_NOR (0x0 << 2) +#define RT5668_TDM_M_BP_INV (0x1 << 2) +#define RT5668_TDM_M_LP_MASK (0x1 << 1) +#define RT5668_TDM_M_LP_SFT 1 +#define RT5668_TDM_M_LP_NOR (0x0 << 1) +#define RT5668_TDM_M_LP_INV (0x1 << 1) +#define RT5668_TDM_MS_MASK (0x1 << 0) +#define RT5668_TDM_MS_SFT 0 +#define RT5668_TDM_MS_M (0x0 << 0) +#define RT5668_TDM_MS_S (0x1 << 0) + +/* Global Clock Control (0x0080) */ +#define RT5668_SCLK_SRC_MASK (0x7 << 13) +#define RT5668_SCLK_SRC_SFT 13 +#define RT5668_SCLK_SRC_MCLK (0x0 << 13) +#define RT5668_SCLK_SRC_PLL1 (0x1 << 13) +#define RT5668_SCLK_SRC_PLL2 (0x2 << 13) +#define RT5668_SCLK_SRC_SDW (0x3 << 13) +#define RT5668_SCLK_SRC_RCCLK (0x4 << 13) +#define RT5668_PLL1_SRC_MASK (0x3 << 10) +#define RT5668_PLL1_SRC_SFT 10 +#define RT5668_PLL1_SRC_MCLK (0x0 << 10) +#define RT5668_PLL1_SRC_BCLK1 (0x1 << 10) +#define RT5668_PLL1_SRC_SDW (0x2 << 10) +#define RT5668_PLL1_SRC_RC (0x3 << 10) +#define RT5668_PLL2_SRC_MASK (0x3 << 8) +#define RT5668_PLL2_SRC_SFT 8 +#define RT5668_PLL2_SRC_MCLK (0x0 << 8) +#define RT5668_PLL2_SRC_BCLK1 (0x1 << 8) +#define RT5668_PLL2_SRC_SDW (0x2 << 8) +#define RT5668_PLL2_SRC_RC (0x3 << 8) + + + +#define RT5668_PLL_INP_MAX 40000000 +#define RT5668_PLL_INP_MIN 256000 +/* PLL M/N/K Code Control 1 (0x0081) */ +#define RT5668_PLL_N_MAX 0x001ff +#define RT5668_PLL_N_MASK (RT5668_PLL_N_MAX << 7) +#define RT5668_PLL_N_SFT 7 +#define RT5668_PLL_K_MAX 0x001f +#define RT5668_PLL_K_MASK (RT5668_PLL_K_MAX) +#define RT5668_PLL_K_SFT 0 + +/* PLL M/N/K Code Control 2 (0x0082) */ +#define RT5668_PLL_M_MAX 0x00f +#define RT5668_PLL_M_MASK (RT5668_PLL_M_MAX << 12) +#define RT5668_PLL_M_SFT 12 +#define RT5668_PLL_M_BP (0x1 << 11) +#define RT5668_PLL_M_BP_SFT 11 +#define RT5668_PLL_K_BP (0x1 << 10) +#define RT5668_PLL_K_BP_SFT 10 + +/* PLL tracking mode 1 (0x0083) */ +#define RT5668_DA_ASRC_MASK (0x1 << 13) +#define RT5668_DA_ASRC_SFT 13 +#define RT5668_DAC_STO1_ASRC_MASK (0x1 << 12) +#define RT5668_DAC_STO1_ASRC_SFT 12 +#define RT5668_AD_ASRC_MASK (0x1 << 8) +#define RT5668_AD_ASRC_SFT 8 +#define RT5668_AD_ASRC_SEL_MASK (0x1 << 4) +#define RT5668_AD_ASRC_SEL_SFT 4 +#define RT5668_DMIC_ASRC_MASK (0x1 << 3) +#define RT5668_DMIC_ASRC_SFT 3 +#define RT5668_ADC_STO1_ASRC_MASK (0x1 << 2) +#define RT5668_ADC_STO1_ASRC_SFT 2 +#define RT5668_DA_ASRC_SEL_MASK (0x1 << 0) +#define RT5668_DA_ASRC_SEL_SFT 0 + +/* PLL tracking mode 2 3 (0x0084)(0x0085)*/ +#define RT5668_FILTER_CLK_SEL_MASK (0x7 << 12) +#define RT5668_FILTER_CLK_SEL_SFT 12 + +/* ASRC Control 4 (0x0086) */ +#define 
RT5668_ASRCIN_FTK_N1_MASK (0x3 << 14) +#define RT5668_ASRCIN_FTK_N1_SFT 14 +#define RT5668_ASRCIN_FTK_N2_MASK (0x3 << 12) +#define RT5668_ASRCIN_FTK_N2_SFT 12 +#define RT5668_ASRCIN_FTK_M1_MASK (0x7 << 8) +#define RT5668_ASRCIN_FTK_M1_SFT 8 +#define RT5668_ASRCIN_FTK_M2_MASK (0x7 << 4) +#define RT5668_ASRCIN_FTK_M2_SFT 4 + +/* SoundWire reference clk (0x008d) */ +#define RT5668_PLL2_OUT_MASK (0x1 << 8) +#define RT5668_PLL2_OUT_98M (0x0 << 8) +#define RT5668_PLL2_OUT_49M (0x1 << 8) +#define RT5668_SDW_REF_2_MASK (0xf << 4) +#define RT5668_SDW_REF_2_SFT 4 +#define RT5668_SDW_REF_2_48K (0x0 << 4) +#define RT5668_SDW_REF_2_96K (0x1 << 4) +#define RT5668_SDW_REF_2_192K (0x2 << 4) +#define RT5668_SDW_REF_2_32K (0x3 << 4) +#define RT5668_SDW_REF_2_24K (0x4 << 4) +#define RT5668_SDW_REF_2_16K (0x5 << 4) +#define RT5668_SDW_REF_2_12K (0x6 << 4) +#define RT5668_SDW_REF_2_8K (0x7 << 4) +#define RT5668_SDW_REF_2_44K (0x8 << 4) +#define RT5668_SDW_REF_2_88K (0x9 << 4) +#define RT5668_SDW_REF_2_176K (0xa << 4) +#define RT5668_SDW_REF_2_353K (0xb << 4) +#define RT5668_SDW_REF_2_22K (0xc << 4) +#define RT5668_SDW_REF_2_384K (0xd << 4) +#define RT5668_SDW_REF_2_11K (0xe << 4) +#define RT5668_SDW_REF_1_MASK (0xf << 0) +#define RT5668_SDW_REF_1_SFT 0 +#define RT5668_SDW_REF_1_48K (0x0 << 0) +#define RT5668_SDW_REF_1_96K (0x1 << 0) +#define RT5668_SDW_REF_1_192K (0x2 << 0) +#define RT5668_SDW_REF_1_32K (0x3 << 0) +#define RT5668_SDW_REF_1_24K (0x4 << 0) +#define RT5668_SDW_REF_1_16K (0x5 << 0) +#define RT5668_SDW_REF_1_12K (0x6 << 0) +#define RT5668_SDW_REF_1_8K (0x7 << 0) +#define RT5668_SDW_REF_1_44K (0x8 << 0) +#define RT5668_SDW_REF_1_88K (0x9 << 0) +#define RT5668_SDW_REF_1_176K (0xa << 0) +#define RT5668_SDW_REF_1_353K (0xb << 0) +#define RT5668_SDW_REF_1_22K (0xc << 0) +#define RT5668_SDW_REF_1_384K (0xd << 0) +#define RT5668_SDW_REF_1_11K (0xe << 0) + +/* Depop Mode Control 1 (0x008e) */ +#define RT5668_PUMP_EN (0x1 << 3) +#define RT5668_PUMP_EN_SFT 3 +#define RT5668_CAPLESS_EN (0x1 << 0) +#define RT5668_CAPLESS_EN_SFT 0 + +/* Depop Mode Control 2 (0x8f) */ +#define RT5668_RAMP_MASK (0x1 << 12) +#define RT5668_RAMP_SFT 12 +#define RT5668_RAMP_DIS (0x0 << 12) +#define RT5668_RAMP_EN (0x1 << 12) +#define RT5668_BPS_MASK (0x1 << 11) +#define RT5668_BPS_SFT 11 +#define RT5668_BPS_DIS (0x0 << 11) +#define RT5668_BPS_EN (0x1 << 11) +#define RT5668_FAST_UPDN_MASK (0x1 << 10) +#define RT5668_FAST_UPDN_SFT 10 +#define RT5668_FAST_UPDN_DIS (0x0 << 10) +#define RT5668_FAST_UPDN_EN (0x1 << 10) +#define RT5668_VLO_MASK (0x1 << 7) +#define RT5668_VLO_SFT 7 +#define RT5668_VLO_3V (0x0 << 7) +#define RT5668_VLO_33V (0x1 << 7) + +/* HPOUT charge pump 1 (0x0091) */ +#define RT5668_OSW_L_MASK (0x1 << 11) +#define RT5668_OSW_L_SFT 11 +#define RT5668_OSW_L_DIS (0x0 << 11) +#define RT5668_OSW_L_EN (0x1 << 11) +#define RT5668_OSW_R_MASK (0x1 << 10) +#define RT5668_OSW_R_SFT 10 +#define RT5668_OSW_R_DIS (0x0 << 10) +#define RT5668_OSW_R_EN (0x1 << 10) +#define RT5668_PM_HP_MASK (0x3 << 8) +#define RT5668_PM_HP_SFT 8 +#define RT5668_PM_HP_LV (0x0 << 8) +#define RT5668_PM_HP_MV (0x1 << 8) +#define RT5668_PM_HP_HV (0x2 << 8) +#define RT5668_IB_HP_MASK (0x3 << 6) +#define RT5668_IB_HP_SFT 6 +#define RT5668_IB_HP_125IL (0x0 << 6) +#define RT5668_IB_HP_25IL (0x1 << 6) +#define RT5668_IB_HP_5IL (0x2 << 6) +#define RT5668_IB_HP_1IL (0x3 << 6) + +/* Micbias Control1 (0x93) */ +#define RT5668_MIC1_OV_MASK (0x3 << 14) +#define RT5668_MIC1_OV_SFT 14 +#define RT5668_MIC1_OV_2V7 (0x0 << 14) +#define RT5668_MIC1_OV_2V4 (0x1 << 14) +#define 
RT5668_MIC1_OV_2V25 (0x3 << 14) +#define RT5668_MIC1_OV_1V8 (0x4 << 14) +#define RT5668_MIC1_CLK_MASK (0x1 << 13) +#define RT5668_MIC1_CLK_SFT 13 +#define RT5668_MIC1_CLK_DIS (0x0 << 13) +#define RT5668_MIC1_CLK_EN (0x1 << 13) +#define RT5668_MIC1_OVCD_MASK (0x1 << 12) +#define RT5668_MIC1_OVCD_SFT 12 +#define RT5668_MIC1_OVCD_DIS (0x0 << 12) +#define RT5668_MIC1_OVCD_EN (0x1 << 12) +#define RT5668_MIC1_OVTH_MASK (0x3 << 10) +#define RT5668_MIC1_OVTH_SFT 10 +#define RT5668_MIC1_OVTH_768UA (0x0 << 10) +#define RT5668_MIC1_OVTH_960UA (0x1 << 10) +#define RT5668_MIC1_OVTH_1152UA (0x2 << 10) +#define RT5668_MIC1_OVTH_1960UA (0x3 << 10) +#define RT5668_MIC2_OV_MASK (0x3 << 8) +#define RT5668_MIC2_OV_SFT 8 +#define RT5668_MIC2_OV_2V7 (0x0 << 8) +#define RT5668_MIC2_OV_2V4 (0x1 << 8) +#define RT5668_MIC2_OV_2V25 (0x3 << 8) +#define RT5668_MIC2_OV_1V8 (0x4 << 8) +#define RT5668_MIC2_CLK_MASK (0x1 << 7) +#define RT5668_MIC2_CLK_SFT 7 +#define RT5668_MIC2_CLK_DIS (0x0 << 7) +#define RT5668_MIC2_CLK_EN (0x1 << 7) +#define RT5668_MIC2_OVTH_MASK (0x3 << 4) +#define RT5668_MIC2_OVTH_SFT 4 +#define RT5668_MIC2_OVTH_768UA (0x0 << 4) +#define RT5668_MIC2_OVTH_960UA (0x1 << 4) +#define RT5668_MIC2_OVTH_1152UA (0x2 << 4) +#define RT5668_MIC2_OVTH_1960UA (0x3 << 4) +#define RT5668_PWR_MB_MASK (0x1 << 3) +#define RT5668_PWR_MB_SFT 3 +#define RT5668_PWR_MB_PD (0x0 << 3) +#define RT5668_PWR_MB_PU (0x1 << 3) + +/* Micbias Control2 (0x0094) */ +#define RT5668_PWR_CLK25M_MASK (0x1 << 9) +#define RT5668_PWR_CLK25M_SFT 9 +#define RT5668_PWR_CLK25M_PD (0x0 << 9) +#define RT5668_PWR_CLK25M_PU (0x1 << 9) +#define RT5668_PWR_CLK1M_MASK (0x1 << 8) +#define RT5668_PWR_CLK1M_SFT 8 +#define RT5668_PWR_CLK1M_PD (0x0 << 8) +#define RT5668_PWR_CLK1M_PU (0x1 << 8) + +/* RC Clock Control (0x009f) */ +#define RT5668_POW_IRQ (0x1 << 15) +#define RT5668_POW_JDH (0x1 << 14) +#define RT5668_POW_JDL (0x1 << 13) +#define RT5668_POW_ANA (0x1 << 12) + +/* I2S Master Mode Clock Control 1 (0x00a0) */ +#define RT5668_CLK_SRC_MCLK (0x0) +#define RT5668_CLK_SRC_PLL1 (0x1) +#define RT5668_CLK_SRC_PLL2 (0x2) +#define RT5668_CLK_SRC_SDW (0x3) +#define RT5668_CLK_SRC_RCCLK (0x4) +#define RT5668_I2S_PD_1 (0x0) +#define RT5668_I2S_PD_2 (0x1) +#define RT5668_I2S_PD_3 (0x2) +#define RT5668_I2S_PD_4 (0x3) +#define RT5668_I2S_PD_6 (0x4) +#define RT5668_I2S_PD_8 (0x5) +#define RT5668_I2S_PD_12 (0x6) +#define RT5668_I2S_PD_16 (0x7) +#define RT5668_I2S_PD_24 (0x8) +#define RT5668_I2S_PD_32 (0x9) +#define RT5668_I2S_PD_48 (0xa) +#define RT5668_I2S2_SRC_MASK (0x3 << 4) +#define RT5668_I2S2_SRC_SFT 4 +#define RT5668_I2S2_M_PD_MASK (0xf << 0) +#define RT5668_I2S2_M_PD_SFT 0 + +/* IRQ Control 1 (0x00b6) */ +#define RT5668_JD1_PULSE_EN_MASK (0x1 << 10) +#define RT5668_JD1_PULSE_EN_SFT 10 +#define RT5668_JD1_PULSE_DIS (0x0 << 10) +#define RT5668_JD1_PULSE_EN (0x1 << 10) + +/* IRQ Control 2 (0x00b7) */ +#define RT5668_JD1_EN_MASK (0x1 << 15) +#define RT5668_JD1_EN_SFT 15 +#define RT5668_JD1_DIS (0x0 << 15) +#define RT5668_JD1_EN (0x1 << 15) +#define RT5668_JD1_POL_MASK (0x1 << 13) +#define RT5668_JD1_POL_NOR (0x0 << 13) +#define RT5668_JD1_POL_INV (0x1 << 13) + +/* IRQ Control 3 (0x00b8) */ +#define RT5668_IL_IRQ_MASK (0x1 << 7) +#define RT5668_IL_IRQ_DIS (0x0 << 7) +#define RT5668_IL_IRQ_EN (0x1 << 7) + +/* GPIO Control 1 (0x00c0) */ +#define RT5668_GP1_PIN_MASK (0x3 << 14) +#define RT5668_GP1_PIN_SFT 14 +#define RT5668_GP1_PIN_GPIO1 (0x0 << 14) +#define RT5668_GP1_PIN_IRQ (0x1 << 14) +#define RT5668_GP1_PIN_DMIC_CLK (0x2 << 14) +#define RT5668_GP2_PIN_MASK (0x3 
<< 12) +#define RT5668_GP2_PIN_SFT 12 +#define RT5668_GP2_PIN_GPIO2 (0x0 << 12) +#define RT5668_GP2_PIN_LRCK2 (0x1 << 12) +#define RT5668_GP2_PIN_DMIC_SDA (0x2 << 12) +#define RT5668_GP3_PIN_MASK (0x3 << 10) +#define RT5668_GP3_PIN_SFT 10 +#define RT5668_GP3_PIN_GPIO3 (0x0 << 10) +#define RT5668_GP3_PIN_BCLK2 (0x1 << 10) +#define RT5668_GP3_PIN_DMIC_CLK (0x2 << 10) +#define RT5668_GP4_PIN_MASK (0x3 << 8) +#define RT5668_GP4_PIN_SFT 8 +#define RT5668_GP4_PIN_GPIO4 (0x0 << 8) +#define RT5668_GP4_PIN_ADCDAT1 (0x1 << 8) +#define RT5668_GP4_PIN_DMIC_CLK (0x2 << 8) +#define RT5668_GP4_PIN_ADCDAT2 (0x3 << 8) +#define RT5668_GP5_PIN_MASK (0x3 << 6) +#define RT5668_GP5_PIN_SFT 6 +#define RT5668_GP5_PIN_GPIO5 (0x0 << 6) +#define RT5668_GP5_PIN_DACDAT1 (0x1 << 6) +#define RT5668_GP5_PIN_DMIC_SDA (0x2 << 6) +#define RT5668_GP6_PIN_MASK (0x1 << 5) +#define RT5668_GP6_PIN_SFT 5 +#define RT5668_GP6_PIN_GPIO6 (0x0 << 5) +#define RT5668_GP6_PIN_LRCK1 (0x1 << 5) + +/* GPIO Control 2 (0x00c1)*/ +#define RT5668_GP1_PF_MASK (0x1 << 15) +#define RT5668_GP1_PF_IN (0x0 << 15) +#define RT5668_GP1_PF_OUT (0x1 << 15) +#define RT5668_GP1_OUT_MASK (0x1 << 14) +#define RT5668_GP1_OUT_L (0x0 << 14) +#define RT5668_GP1_OUT_H (0x1 << 14) +#define RT5668_GP2_PF_MASK (0x1 << 13) +#define RT5668_GP2_PF_IN (0x0 << 13) +#define RT5668_GP2_PF_OUT (0x1 << 13) +#define RT5668_GP2_OUT_MASK (0x1 << 12) +#define RT5668_GP2_OUT_L (0x0 << 12) +#define RT5668_GP2_OUT_H (0x1 << 12) +#define RT5668_GP3_PF_MASK (0x1 << 11) +#define RT5668_GP3_PF_IN (0x0 << 11) +#define RT5668_GP3_PF_OUT (0x1 << 11) +#define RT5668_GP3_OUT_MASK (0x1 << 10) +#define RT5668_GP3_OUT_L (0x0 << 10) +#define RT5668_GP3_OUT_H (0x1 << 10) +#define RT5668_GP4_PF_MASK (0x1 << 9) +#define RT5668_GP4_PF_IN (0x0 << 9) +#define RT5668_GP4_PF_OUT (0x1 << 9) +#define RT5668_GP4_OUT_MASK (0x1 << 8) +#define RT5668_GP4_OUT_L (0x0 << 8) +#define RT5668_GP4_OUT_H (0x1 << 8) +#define RT5668_GP5_PF_MASK (0x1 << 7) +#define RT5668_GP5_PF_IN (0x0 << 7) +#define RT5668_GP5_PF_OUT (0x1 << 7) +#define RT5668_GP5_OUT_MASK (0x1 << 6) +#define RT5668_GP5_OUT_L (0x0 << 6) +#define RT5668_GP5_OUT_H (0x1 << 6) +#define RT5668_GP6_PF_MASK (0x1 << 5) +#define RT5668_GP6_PF_IN (0x0 << 5) +#define RT5668_GP6_PF_OUT (0x1 << 5) +#define RT5668_GP6_OUT_MASK (0x1 << 4) +#define RT5668_GP6_OUT_L (0x0 << 4) +#define RT5668_GP6_OUT_H (0x1 << 4) + + +/* GPIO Status (0x00c2) */ +#define RT5668_GP6_STA (0x1 << 6) +#define RT5668_GP5_STA (0x1 << 5) +#define RT5668_GP4_STA (0x1 << 4) +#define RT5668_GP3_STA (0x1 << 3) +#define RT5668_GP2_STA (0x1 << 2) +#define RT5668_GP1_STA (0x1 << 1) + +/* Soft volume and zero cross control 1 (0x00d9) */ +#define RT5668_SV_MASK (0x1 << 15) +#define RT5668_SV_SFT 15 +#define RT5668_SV_DIS (0x0 << 15) +#define RT5668_SV_EN (0x1 << 15) +#define RT5668_ZCD_MASK (0x1 << 10) +#define RT5668_ZCD_SFT 10 +#define RT5668_ZCD_PD (0x0 << 10) +#define RT5668_ZCD_PU (0x1 << 10) +#define RT5668_SV_DLY_MASK (0xf) +#define RT5668_SV_DLY_SFT 0 + +/* Soft volume and zero cross control 2 (0x00da) */ +#define RT5668_ZCD_BST1_CBJ_MASK (0x1 << 7) +#define RT5668_ZCD_BST1_CBJ_SFT 7 +#define RT5668_ZCD_BST1_CBJ_DIS (0x0 << 7) +#define RT5668_ZCD_BST1_CBJ_EN (0x1 << 7) +#define RT5668_ZCD_RECMIX_MASK (0x1) +#define RT5668_ZCD_RECMIX_SFT 0 +#define RT5668_ZCD_RECMIX_DIS (0x0) +#define RT5668_ZCD_RECMIX_EN (0x1) + +/* 4 Button Inline Command Control 2 (0x00e3) */ +#define RT5668_4BTN_IL_MASK (0x1 << 15) +#define RT5668_4BTN_IL_EN (0x1 << 15) +#define RT5668_4BTN_IL_DIS (0x0 << 15) +#define 
RT5668_4BTN_IL_RST_MASK (0x1 << 14) +#define RT5668_4BTN_IL_NOR (0x1 << 14) +#define RT5668_4BTN_IL_RST (0x0 << 14) + +/* Analog JD Control (0x00f0) */ +#define RT5668_JDH_RS_MASK (0x1 << 4) +#define RT5668_JDH_NO_PLUG (0x1 << 4) +#define RT5668_JDH_PLUG (0x0 << 4) + +/* Chopper and Clock control for DAC (0x013a)*/ +#define RT5668_CKXEN_DAC1_MASK (0x1 << 13) +#define RT5668_CKXEN_DAC1_SFT 13 +#define RT5668_CKGEN_DAC1_MASK (0x1 << 12) +#define RT5668_CKGEN_DAC1_SFT 12 + +/* Chopper and Clock control for ADC (0x013b)*/ +#define RT5668_CKXEN_ADC1_MASK (0x1 << 13) +#define RT5668_CKXEN_ADC1_SFT 13 +#define RT5668_CKGEN_ADC1_MASK (0x1 << 12) +#define RT5668_CKGEN_ADC1_SFT 12 + +/* Volume test (0x013f)*/ +#define RT5668_SEL_CLK_VOL_MASK (0x1 << 15) +#define RT5668_SEL_CLK_VOL_EN (0x1 << 15) +#define RT5668_SEL_CLK_VOL_DIS (0x0 << 15) + +/* Test Mode Control 1 (0x0145) */ +#define RT5668_AD2DA_LB_MASK (0x1 << 10) +#define RT5668_AD2DA_LB_SFT 10 + +/* Stereo Noise Gate Control 1 (0x0160) */ +#define RT5668_NG2_EN_MASK (0x1 << 15) +#define RT5668_NG2_EN (0x1 << 15) +#define RT5668_NG2_DIS (0x0 << 15) + +/* Stereo1 DAC Silence Detection Control (0x0190) */ +#define RT5668_DEB_STO_DAC_MASK (0x7 << 4) +#define RT5668_DEB_80_MS (0x0 << 4) + +/* SAR ADC Inline Command Control 1 (0x0210) */ +#define RT5668_SAR_BUTT_DET_MASK (0x1 << 15) +#define RT5668_SAR_BUTT_DET_EN (0x1 << 15) +#define RT5668_SAR_BUTT_DET_DIS (0x0 << 15) +#define RT5668_SAR_BUTDET_MODE_MASK (0x1 << 14) +#define RT5668_SAR_BUTDET_POW_SAV (0x1 << 14) +#define RT5668_SAR_BUTDET_POW_NORM (0x0 << 14) +#define RT5668_SAR_BUTDET_RST_MASK (0x1 << 13) +#define RT5668_SAR_BUTDET_RST_NORMAL (0x1 << 13) +#define RT5668_SAR_BUTDET_RST (0x0 << 13) +#define RT5668_SAR_POW_MASK (0x1 << 12) +#define RT5668_SAR_POW_EN (0x1 << 12) +#define RT5668_SAR_POW_DIS (0x0 << 12) +#define RT5668_SAR_RST_MASK (0x1 << 11) +#define RT5668_SAR_RST_NORMAL (0x1 << 11) +#define RT5668_SAR_RST (0x0 << 11) +#define RT5668_SAR_BYPASS_MASK (0x1 << 10) +#define RT5668_SAR_BYPASS_EN (0x1 << 10) +#define RT5668_SAR_BYPASS_DIS (0x0 << 10) +#define RT5668_SAR_SEL_MB1_MASK (0x1 << 9) +#define RT5668_SAR_SEL_MB1_SEL (0x1 << 9) +#define RT5668_SAR_SEL_MB1_NOSEL (0x0 << 9) +#define RT5668_SAR_SEL_MB2_MASK (0x1 << 8) +#define RT5668_SAR_SEL_MB2_SEL (0x1 << 8) +#define RT5668_SAR_SEL_MB2_NOSEL (0x0 << 8) +#define RT5668_SAR_SEL_MODE_MASK (0x1 << 7) +#define RT5668_SAR_SEL_MODE_CMP (0x1 << 7) +#define RT5668_SAR_SEL_MODE_ADC (0x0 << 7) +#define RT5668_SAR_SEL_MB1_MB2_MASK (0x1 << 5) +#define RT5668_SAR_SEL_MB1_MB2_AUTO (0x1 << 5) +#define RT5668_SAR_SEL_MB1_MB2_MANU (0x0 << 5) +#define RT5668_SAR_SEL_SIGNAL_MASK (0x1 << 4) +#define RT5668_SAR_SEL_SIGNAL_AUTO (0x1 << 4) +#define RT5668_SAR_SEL_SIGNAL_MANU (0x0 << 4) + +/* SAR ADC Inline Command Control 13 (0x021c) */ +#define RT5668_SAR_SOUR_MASK (0x3f) +#define RT5668_SAR_SOUR_BTN (0x3f) +#define RT5668_SAR_SOUR_TYPE (0x0) + + +/* System Clock Source */ +enum { + RT5668_SCLK_S_MCLK, + RT5668_SCLK_S_PLL1, + RT5668_SCLK_S_PLL2, + RT5668_SCLK_S_RCCLK, +}; + +/* PLL Source */ +enum { + RT5668_PLL1_S_MCLK, + RT5668_PLL1_S_BCLK1, + RT5668_PLL1_S_RCCLK, +}; + +enum { + RT5668_AIF1, + RT5668_AIF2, + RT5668_AIFS +}; + +/* filter mask */ +enum { + RT5668_DA_STEREO1_FILTER = 0x1, + RT5668_AD_STEREO1_FILTER = (0x1 << 1), +}; + +enum { + RT5668_CLK_SEL_SYS, + RT5668_CLK_SEL_I2S1_ASRC, + RT5668_CLK_SEL_I2S2_ASRC, +}; + +int rt5668_sel_asrc_clk_src(struct snd_soc_component *component, + unsigned int filter_mask, unsigned int clk_src); + +#endif /* 
__RT5668_H__ */ -- cgit v1.2.3 From ae0e28265e216dad11d4cbde42fc15e92919af78 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 11 Apr 2018 09:39:25 +0200 Subject: drm/blend: Add a generic alpha property Some drivers duplicate the logic to create a property to store a per-plane alpha. This is especially useful if we ever want to support extra protocols for Wayland like: https://lists.freedesktop.org/archives/wayland-devel/2017-August/034741.html Let's create a helper in order to move that to the core. Acked-by: Maarten Lankhorst Acked-by: Sean Paul Reviewed-by: Boris Brezillon Reviewed-by: Eric Anholt Reviewed-by: Laurent Pinchart Reviewed-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/6e1ce0db78fcfc407e94913c64819e65109d034d.1523432341.git-series.maxime.ripard@bootlin.com --- drivers/gpu/drm/drm_atomic.c | 4 ++++ drivers/gpu/drm/drm_atomic_helper.c | 4 ++++ drivers/gpu/drm/drm_blend.c | 39 +++++++++++++++++++++++++++++++++++++ include/drm/drm_blend.h | 3 +++ include/drm/drm_plane.h | 6 ++++++ 5 files changed, 56 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 7d25c42f22db..3d9ae057a6cd 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -783,6 +783,8 @@ static int drm_atomic_plane_set_property(struct drm_plane *plane, state->src_w = val; } else if (property == config->prop_src_h) { state->src_h = val; + } else if (property == plane->alpha_property) { + state->alpha = val; } else if (property == plane->rotation_property) { if (!is_power_of_2(val & DRM_MODE_ROTATE_MASK)) return -EINVAL; @@ -848,6 +850,8 @@ drm_atomic_plane_get_property(struct drm_plane *plane, *val = state->src_w; } else if (property == config->prop_src_h) { *val = state->src_h; + } else if (property == plane->alpha_property) { + *val = state->alpha; } else if (property == plane->rotation_property) { *val = state->rotation; } else if (property == plane->zpos_property) { diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index ee03c1ed2521..0587a0a2f3aa 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -3500,6 +3500,10 @@ void drm_atomic_helper_plane_reset(struct drm_plane *plane) if (plane->state) { plane->state->plane = plane; plane->state->rotation = DRM_MODE_ROTATE_0; + + /* Reset the alpha value to fully opaque if it matters */ + if (plane->alpha_property) + plane->state->alpha = plane->alpha_property->values[1]; } } EXPORT_SYMBOL(drm_atomic_helper_plane_reset); diff --git a/drivers/gpu/drm/drm_blend.c b/drivers/gpu/drm/drm_blend.c index 5a81e1b4c076..a16a74d7e15e 100644 --- a/drivers/gpu/drm/drm_blend.c +++ b/drivers/gpu/drm/drm_blend.c @@ -88,6 +88,13 @@ * On top of this basic transformation additional properties can be exposed by * the driver: * + * alpha: + * Alpha is set up with drm_plane_create_alpha_property(). It controls the + * plane-wide opacity, from transparent (0) to opaque (0xffff). It can be + * combined with pixel alpha. + * The pixel values in the framebuffers are expected not to be + * pre-multiplied by the global alpha associated with the plane. + * * rotation: * Rotation is set up with drm_plane_create_rotation_property(). It adds a * rotation and reflection step between the source and destination rectangles. @@ -105,6 +112,38 @@ * exposed and assumed to be black). 
*/ +/** + * drm_plane_create_alpha_property - create a new alpha property + * @plane: drm plane + * + * This function creates a generic, mutable alpha property and enables support + * for it in the DRM core. It is attached to @plane. + * + * The alpha property will be allowed to be within the bounds of 0 + * (transparent) to 0xffff (opaque). + * + * Returns: + * 0 on success, negative error code on failure. + */ +int drm_plane_create_alpha_property(struct drm_plane *plane) +{ + struct drm_property *prop; + + prop = drm_property_create_range(plane->dev, 0, "alpha", + 0, DRM_BLEND_ALPHA_OPAQUE); + if (!prop) + return -ENOMEM; + + drm_object_attach_property(&plane->base, prop, DRM_BLEND_ALPHA_OPAQUE); + plane->alpha_property = prop; + + if (plane->state) + plane->state->alpha = DRM_BLEND_ALPHA_OPAQUE; + + return 0; +} +EXPORT_SYMBOL(drm_plane_create_alpha_property); + /** * drm_plane_create_rotation_property - create a new rotation property * @plane: drm plane diff --git a/include/drm/drm_blend.h b/include/drm/drm_blend.h index 17606026590b..330c561c4c11 100644 --- a/include/drm/drm_blend.h +++ b/include/drm/drm_blend.h @@ -36,6 +36,9 @@ static inline bool drm_rotation_90_or_270(unsigned int rotation) return rotation & (DRM_MODE_ROTATE_90 | DRM_MODE_ROTATE_270); } +#define DRM_BLEND_ALPHA_OPAQUE 0xffff + +int drm_plane_create_alpha_property(struct drm_plane *plane); int drm_plane_create_rotation_property(struct drm_plane *plane, unsigned int rotation, unsigned int supported_rotations); diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h index d6da26d66a4b..9563bd25f19b 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -43,6 +43,7 @@ struct drm_modeset_acquire_ctx; * plane (in 16.16) * @src_w: width of visible portion of plane (in 16.16) * @src_h: height of visible portion of plane (in 16.16) + * @alpha: opacity of the plane * @rotation: rotation of the plane * @zpos: priority of the given plane on crtc (optional) * Note that multiple active planes on the same crtc can have an identical @@ -106,6 +107,9 @@ struct drm_plane_state { uint32_t src_x, src_y; uint32_t src_h, src_w; + /* Plane opacity */ + u16 alpha; + /* Plane rotation */ unsigned int rotation; @@ -496,6 +500,7 @@ enum drm_plane_type { * @funcs: helper functions * @properties: property tracking for this plane * @type: type of plane (overlay, primary, cursor) + * @alpha_property: alpha property for this plane * @zpos_property: zpos property for this plane * @rotation_property: rotation property for this plane * @helper_private: mid-layer private data @@ -571,6 +576,7 @@ struct drm_plane { */ struct drm_plane_state *state; + struct drm_property *alpha_property; struct drm_property *zpos_property; struct drm_property *rotation_property; -- cgit v1.2.3 From ad56b738c5dd223a2f66685830f82194025a6138 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 21 Mar 2018 21:22:47 +0200 Subject: docs/vm: rename documentation files to .rst Signed-off-by: Mike Rapoport Signed-off-by: Jonathan Corbet --- Documentation/ABI/stable/sysfs-devices-node | 2 +- .../ABI/testing/sysfs-kernel-mm-hugepages | 2 +- Documentation/ABI/testing/sysfs-kernel-mm-ksm | 2 +- Documentation/ABI/testing/sysfs-kernel-slab | 4 +- Documentation/admin-guide/kernel-parameters.txt | 12 +- Documentation/dev-tools/kasan.rst | 2 +- Documentation/filesystems/proc.txt | 4 +- Documentation/filesystems/tmpfs.txt | 2 +- Documentation/sysctl/vm.txt | 6 +- Documentation/vm/00-INDEX | 58 +- Documentation/vm/active_mm.rst | 91 +++ 
Documentation/vm/active_mm.txt | 91 --- Documentation/vm/balance | 102 ---- Documentation/vm/balance.rst | 102 ++++ Documentation/vm/cleancache.rst | 296 ++++++++++ Documentation/vm/cleancache.txt | 296 ---------- Documentation/vm/frontswap.rst | 293 ++++++++++ Documentation/vm/frontswap.txt | 293 ---------- Documentation/vm/highmem.rst | 147 +++++ Documentation/vm/highmem.txt | 147 ----- Documentation/vm/hmm.rst | 374 +++++++++++++ Documentation/vm/hmm.txt | 374 ------------- Documentation/vm/hugetlbfs_reserv.rst | 587 ++++++++++++++++++++ Documentation/vm/hugetlbfs_reserv.txt | 587 -------------------- Documentation/vm/hugetlbpage.rst | 386 +++++++++++++ Documentation/vm/hugetlbpage.txt | 386 ------------- Documentation/vm/hwpoison.rst | 186 +++++++ Documentation/vm/hwpoison.txt | 186 ------- Documentation/vm/idle_page_tracking.rst | 115 ++++ Documentation/vm/idle_page_tracking.txt | 115 ---- Documentation/vm/ksm.rst | 183 ++++++ Documentation/vm/ksm.txt | 183 ------ Documentation/vm/mmu_notifier.rst | 99 ++++ Documentation/vm/mmu_notifier.txt | 99 ---- Documentation/vm/numa | 150 ----- Documentation/vm/numa.rst | 150 +++++ Documentation/vm/numa_memory_policy.rst | 485 ++++++++++++++++ Documentation/vm/numa_memory_policy.txt | 485 ---------------- Documentation/vm/overcommit-accounting | 87 --- Documentation/vm/overcommit-accounting.rst | 87 +++ Documentation/vm/page_frags | 45 -- Documentation/vm/page_frags.rst | 45 ++ Documentation/vm/page_migration | 257 --------- Documentation/vm/page_migration.rst | 257 +++++++++ Documentation/vm/page_owner.rst | 90 +++ Documentation/vm/page_owner.txt | 90 --- Documentation/vm/pagemap.rst | 197 +++++++ Documentation/vm/pagemap.txt | 197 ------- Documentation/vm/remap_file_pages.rst | 33 ++ Documentation/vm/remap_file_pages.txt | 33 -- Documentation/vm/slub.rst | 361 ++++++++++++ Documentation/vm/slub.txt | 361 ------------ Documentation/vm/soft-dirty.rst | 47 ++ Documentation/vm/soft-dirty.txt | 47 -- Documentation/vm/split_page_table_lock | 100 ---- Documentation/vm/split_page_table_lock.rst | 100 ++++ Documentation/vm/swap_numa.rst | 80 +++ Documentation/vm/swap_numa.txt | 80 --- Documentation/vm/transhuge.rst | 573 +++++++++++++++++++ Documentation/vm/transhuge.txt | 573 ------------------- Documentation/vm/unevictable-lru.rst | 614 +++++++++++++++++++++ Documentation/vm/unevictable-lru.txt | 614 --------------------- Documentation/vm/userfaultfd.rst | 241 ++++++++ Documentation/vm/userfaultfd.txt | 241 -------- Documentation/vm/z3fold.rst | 30 + Documentation/vm/z3fold.txt | 30 - Documentation/vm/zsmalloc.rst | 82 +++ Documentation/vm/zsmalloc.txt | 82 --- Documentation/vm/zswap.rst | 135 +++++ Documentation/vm/zswap.txt | 135 ----- MAINTAINERS | 2 +- arch/alpha/Kconfig | 2 +- arch/ia64/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- fs/Kconfig | 2 +- fs/dax.c | 2 +- fs/proc/task_mmu.c | 4 +- include/linux/hmm.h | 2 +- include/linux/memremap.h | 4 +- include/linux/mmu_notifier.h | 2 +- include/linux/sched/mm.h | 4 +- include/linux/swap.h | 2 +- mm/Kconfig | 6 +- mm/cleancache.c | 2 +- mm/frontswap.c | 2 +- mm/hmm.c | 2 +- mm/huge_memory.c | 4 +- mm/hugetlb.c | 4 +- mm/ksm.c | 4 +- mm/mmap.c | 2 +- mm/rmap.c | 6 +- mm/util.c | 2 +- 93 files changed, 6546 insertions(+), 6546 deletions(-) create mode 100644 Documentation/vm/active_mm.rst delete mode 100644 Documentation/vm/active_mm.txt delete mode 100644 Documentation/vm/balance create mode 100644 Documentation/vm/balance.rst create mode 100644 Documentation/vm/cleancache.rst 
delete mode 100644 Documentation/vm/cleancache.txt create mode 100644 Documentation/vm/frontswap.rst delete mode 100644 Documentation/vm/frontswap.txt create mode 100644 Documentation/vm/highmem.rst delete mode 100644 Documentation/vm/highmem.txt create mode 100644 Documentation/vm/hmm.rst delete mode 100644 Documentation/vm/hmm.txt create mode 100644 Documentation/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/vm/hugetlbfs_reserv.txt create mode 100644 Documentation/vm/hugetlbpage.rst delete mode 100644 Documentation/vm/hugetlbpage.txt create mode 100644 Documentation/vm/hwpoison.rst delete mode 100644 Documentation/vm/hwpoison.txt create mode 100644 Documentation/vm/idle_page_tracking.rst delete mode 100644 Documentation/vm/idle_page_tracking.txt create mode 100644 Documentation/vm/ksm.rst delete mode 100644 Documentation/vm/ksm.txt create mode 100644 Documentation/vm/mmu_notifier.rst delete mode 100644 Documentation/vm/mmu_notifier.txt delete mode 100644 Documentation/vm/numa create mode 100644 Documentation/vm/numa.rst create mode 100644 Documentation/vm/numa_memory_policy.rst delete mode 100644 Documentation/vm/numa_memory_policy.txt delete mode 100644 Documentation/vm/overcommit-accounting create mode 100644 Documentation/vm/overcommit-accounting.rst delete mode 100644 Documentation/vm/page_frags create mode 100644 Documentation/vm/page_frags.rst delete mode 100644 Documentation/vm/page_migration create mode 100644 Documentation/vm/page_migration.rst create mode 100644 Documentation/vm/page_owner.rst delete mode 100644 Documentation/vm/page_owner.txt create mode 100644 Documentation/vm/pagemap.rst delete mode 100644 Documentation/vm/pagemap.txt create mode 100644 Documentation/vm/remap_file_pages.rst delete mode 100644 Documentation/vm/remap_file_pages.txt create mode 100644 Documentation/vm/slub.rst delete mode 100644 Documentation/vm/slub.txt create mode 100644 Documentation/vm/soft-dirty.rst delete mode 100644 Documentation/vm/soft-dirty.txt delete mode 100644 Documentation/vm/split_page_table_lock create mode 100644 Documentation/vm/split_page_table_lock.rst create mode 100644 Documentation/vm/swap_numa.rst delete mode 100644 Documentation/vm/swap_numa.txt create mode 100644 Documentation/vm/transhuge.rst delete mode 100644 Documentation/vm/transhuge.txt create mode 100644 Documentation/vm/unevictable-lru.rst delete mode 100644 Documentation/vm/unevictable-lru.txt create mode 100644 Documentation/vm/userfaultfd.rst delete mode 100644 Documentation/vm/userfaultfd.txt create mode 100644 Documentation/vm/z3fold.rst delete mode 100644 Documentation/vm/z3fold.txt create mode 100644 Documentation/vm/zsmalloc.rst delete mode 100644 Documentation/vm/zsmalloc.txt create mode 100644 Documentation/vm/zswap.rst delete mode 100644 Documentation/vm/zswap.txt (limited to 'include') diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 5b2d0f08867c..b38f4b734567 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -90,4 +90,4 @@ Date: December 2009 Contact: Lee Schermerhorn Description: The node's huge page size control/query attributes. 
- See Documentation/vm/hugetlbpage.txt \ No newline at end of file + See Documentation/vm/hugetlbpage.rst \ No newline at end of file diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages index e21c00571cf4..5140b233356c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages +++ b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages @@ -12,4 +12,4 @@ Description: free_hugepages surplus_hugepages resv_hugepages - See Documentation/vm/hugetlbpage.txt for details. + See Documentation/vm/hugetlbpage.rst for details. diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm index 73e653ee2481..dfc13244cda3 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm @@ -40,7 +40,7 @@ Description: Kernel Samepage Merging daemon sysfs interface sleep_millisecs: how many milliseconds ksm should sleep between scans. - See Documentation/vm/ksm.txt for more information. + See Documentation/vm/ksm.rst for more information. What: /sys/kernel/mm/ksm/merge_across_nodes Date: January 2013 diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 2cc0a72b64be..29601d93a1c2 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -37,7 +37,7 @@ Description: The alloc_calls file is read-only and lists the kernel code locations from which allocations for this cache were performed. The alloc_calls file only contains information if debugging is - enabled for that cache (see Documentation/vm/slub.txt). + enabled for that cache (see Documentation/vm/slub.rst). What: /sys/kernel/slab/cache/alloc_fastpath Date: February 2008 @@ -219,7 +219,7 @@ Contact: Pekka Enberg , Description: The free_calls file is read-only and lists the locations of object frees if slab debugging is enabled (see - Documentation/vm/slub.txt). + Documentation/vm/slub.rst). What: /sys/kernel/slab/cache/free_fastpath Date: February 2008 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1d1d53f85ddd..5d6e5509c049 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3887,7 +3887,7 @@ cache (risks via metadata attacks are mostly unchanged). Debug options disable merging on their own. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. slab_max_order= [MM, SLAB] Determines the maximum allowed order for slabs. @@ -3901,7 +3901,7 @@ slub_debug can create guard zones around objects and may poison objects when not in use. Also tracks the last alloc / free. For more information see - Documentation/vm/slub.txt. + Documentation/vm/slub.rst. slub_memcg_sysfs= [MM, SLUB] Determines whether to enable sysfs directories for @@ -3915,7 +3915,7 @@ Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory fragmentation. For more information see - Documentation/vm/slub.txt. + Documentation/vm/slub.rst. slub_min_objects= [MM, SLUB] The minimum number of objects per slab. SLUB will @@ -3924,12 +3924,12 @@ the number of objects indicated. The higher the number of objects the smaller the overhead of tracking slabs and the less frequently locks need to be acquired. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. 
slub_min_order= [MM, SLUB] Determines the minimum page order for slabs. Must be lower than slub_max_order. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. slub_nomerge [MM, SLUB] Same with slab_nomerge. This is supported for legacy. @@ -4285,7 +4285,7 @@ Format: [always|madvise|never] Can be used to control the default behavior of the system with respect to transparent hugepages. - See Documentation/vm/transhuge.txt for more details. + See Documentation/vm/transhuge.rst for more details. tsc= Disable clocksource stability checks for TSC. Format: diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index f7a18f274357..aabc8738b3d8 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -120,7 +120,7 @@ A typical out of bounds access report looks like this:: The header of the report discribe what kind of bug happened and what kind of access caused it. It's followed by the description of the accessed slub object -(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and +(see 'SLUB Debug output' section in Documentation/vm/slub.rst for details) and the description of the accessed memory page. In the last section the report shows memory state around the accessed address. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 2a84bb334894..2d3984c70feb 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -515,7 +515,7 @@ guarantees: The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG bits on both physical and virtual pages associated with a process, and the -soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details). +soft-dirty bit on pte (see Documentation/vm/soft-dirty.rst for details). To clear the bits for all the pages associated with the process > echo 1 > /proc/PID/clear_refs @@ -536,7 +536,7 @@ Any other value written to /proc/PID/clear_refs will have no effect. The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags using /proc/kpageflags and number of times a page is mapped using -/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt. +/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.rst. The /proc/pid/numa_maps is an extension based on maps, showing the memory locality and binding policy, as well as the memory usage (in pages) of diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index a85355cf85f4..627389a34f77 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -105,7 +105,7 @@ policy for the file will revert to "default" policy. NUMA memory allocation policies have optional flags that can be used in conjunction with their modes. These optional flags can be specified when tmpfs is mounted by appending them to the mode before the NodeList. -See Documentation/vm/numa_memory_policy.txt for a list of all available +See Documentation/vm/numa_memory_policy.rst for a list of all available memory allocation policy mode flags and their effect on memory policy. =static is equivalent to MPOL_F_STATIC_NODES diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index ff234d229cbb..ef581a940439 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -516,7 +516,7 @@ nr_hugepages Change the minimum size of the hugepage pool. 
-See Documentation/vm/hugetlbpage.txt +See Documentation/vm/hugetlbpage.rst ============================================================== @@ -525,7 +525,7 @@ nr_overcommit_hugepages Change the maximum size of the hugepage pool. The maximum is nr_hugepages + nr_overcommit_hugepages. -See Documentation/vm/hugetlbpage.txt +See Documentation/vm/hugetlbpage.rst ============================================================== @@ -668,7 +668,7 @@ and don't use much of it. The default value is 0. -See Documentation/vm/overcommit-accounting and +See Documentation/vm/overcommit-accounting.rst and mm/mmap.c::__vm_enough_memory() for more information. ============================================================== diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 0278f2c85efb..cda564d55b3c 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -1,62 +1,62 @@ 00-INDEX - this file. -active_mm.txt +active_mm.rst - An explanation from Linus about tsk->active_mm vs tsk->mm. -balance +balance.rst - various information on memory balancing. -cleancache.txt +cleancache.rst - Intro to cleancache and page-granularity victim cache. -frontswap.txt +frontswap.rst - Outline frontswap, part of the transcendent memory frontend. -highmem.txt +highmem.rst - Outline of highmem and common issues. -hmm.txt +hmm.rst - Documentation of heterogeneous memory management -hugetlbpage.txt +hugetlbpage.rst - a brief summary of hugetlbpage support in the Linux kernel. -hugetlbfs_reserv.txt +hugetlbfs_reserv.rst - A brief overview of hugetlbfs reservation design/implementation. -hwpoison.txt +hwpoison.rst - explains what hwpoison is -idle_page_tracking.txt +idle_page_tracking.rst - description of the idle page tracking feature. -ksm.txt +ksm.rst - how to use the Kernel Samepage Merging feature. -mmu_notifier.txt +mmu_notifier.rst - a note about clearing pte/pmd and mmu notifications -numa +numa.rst - information about NUMA specific code in the Linux vm. -numa_memory_policy.txt +numa_memory_policy.rst - documentation of concepts and APIs of the 2.6 memory policy support. -overcommit-accounting +overcommit-accounting.rst - description of the Linux kernels overcommit handling modes. -page_frags +page_frags.rst - description of page fragments allocator -page_migration +page_migration.rst - description of page migration in NUMA systems. -pagemap.txt +pagemap.rst - pagemap, from the userspace perspective -page_owner.txt +page_owner.rst - tracking about who allocated each page -remap_file_pages.txt +remap_file_pages.rst - a note about remap_file_pages() system call -slub.txt +slub.rst - a short users guide for SLUB. -soft-dirty.txt +soft-dirty.rst - short explanation for soft-dirty PTEs -split_page_table_lock +split_page_table_lock.rst - Separate per-table lock to improve scalability of the old page_table_lock. -swap_numa.txt +swap_numa.rst - automatic binding of swap device to numa node -transhuge.txt +transhuge.rst - Transparent Hugepage Support, alternative way of using hugepages. 
-unevictable-lru.txt +unevictable-lru.rst - Unevictable LRU infrastructure -userfaultfd.txt +userfaultfd.rst - description of userfaultfd system call z3fold.txt - outline of z3fold allocator for storing compressed pages -zsmalloc.txt +zsmalloc.rst - outline of zsmalloc allocator for storing compressed pages -zswap.txt +zswap.rst - Intro to compressed cache for swap pages diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst new file mode 100644 index 000000000000..c84471b180f8 --- /dev/null +++ b/Documentation/vm/active_mm.rst @@ -0,0 +1,91 @@ +.. _active_mm: + +========= +Active MM +========= + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + Cc'd to linux-kernel, because I don't write explanations all that often, + and when I do I feel better about more people reading them. + + On Fri, 30 Jul 1999, David Mosberger wrote: + > + > Is there a brief description someplace on how "mm" vs. "active_mm" in + > the task_struct are supposed to be used? (My apologies if this was + > discussed on the mailing lists---I just returned from vacation and + > wasn't able to follow linux-kernel for a while). + + Basically, the new setup is: + + - we have "real address spaces" and "anonymous address spaces". The + difference is that an anonymous address space doesn't care about the + user-level page tables at all, so when we do a context switch into an + anonymous address space we just leave the previous address space + active. + + The obvious use for a "anonymous address space" is any thread that + doesn't need any user mappings - all kernel threads basically fall into + this category, but even "real" threads can temporarily say that for + some amount of time they are not going to be interested in user space, + and that the scheduler might as well try to avoid wasting time on + switching the VM state around. Currently only the old-style bdflush + sync does that. + + - "tsk->mm" points to the "real address space". For an anonymous process, + tsk->mm will be NULL, for the logical reason that an anonymous process + really doesn't _have_ a real address space at all. + + - however, we obviously need to keep track of which address space we + "stole" for such an anonymous user. For that, we have "tsk->active_mm", + which shows what the currently active address space is. + + The rule is that for a process with a real address space (ie tsk->mm is + non-NULL) the active_mm obviously always has to be the same as the real + one. + + For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the + "borrowed" mm while the anonymous process is running. When the + anonymous process gets scheduled away, the borrowed address space is + returned and cleared. + + To support all that, the "struct mm_struct" now has two counters: a + "mm_users" counter that is how many "real address space users" there are, + and a "mm_count" counter that is the number of "lazy" users (ie anonymous + users) plus one if there are any real users. + + Usually there is at least one real user, but it could be that the real + user exited on another CPU while a lazy user was still active, so you do + actually get cases where you have a address space that is _only_ used by + lazy users. That is often a short-lived state, because once that thread + gets scheduled away in favour of a real thread, the "zombie" mm gets + released because "mm_users" becomes zero. + + Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any + more. 
"init_mm" should be considered just a "lazy context when no other + context is available", and in fact it is mainly used just at bootup when + no real VM has yet been created. So code that used to check + + if (current->mm == &init_mm) + + should generally just do + + if (!current->mm) + + instead (which makes more sense anyway - the test is basically one of "do + we have a user context", and is generally done by the page fault handler + and things like that). + + Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, + because it slightly changes the interfaces to accommodate the alpha (who + would have thought it, but the alpha actually ends up having one of the + ugliest context switch codes - unlike the other architectures where the MM + and register state is separate, the alpha PALcode joins the two, and you + need to switch both together). + + (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt deleted file mode 100644 index c84471b180f8..000000000000 --- a/Documentation/vm/active_mm.txt +++ /dev/null @@ -1,91 +0,0 @@ -.. _active_mm: - -========= -Active MM -========= - -:: - - List: linux-kernel - Subject: Re: active_mm - From: Linus Torvalds - Date: 1999-07-30 21:36:24 - - Cc'd to linux-kernel, because I don't write explanations all that often, - and when I do I feel better about more people reading them. - - On Fri, 30 Jul 1999, David Mosberger wrote: - > - > Is there a brief description someplace on how "mm" vs. "active_mm" in - > the task_struct are supposed to be used? (My apologies if this was - > discussed on the mailing lists---I just returned from vacation and - > wasn't able to follow linux-kernel for a while). - - Basically, the new setup is: - - - we have "real address spaces" and "anonymous address spaces". The - difference is that an anonymous address space doesn't care about the - user-level page tables at all, so when we do a context switch into an - anonymous address space we just leave the previous address space - active. - - The obvious use for a "anonymous address space" is any thread that - doesn't need any user mappings - all kernel threads basically fall into - this category, but even "real" threads can temporarily say that for - some amount of time they are not going to be interested in user space, - and that the scheduler might as well try to avoid wasting time on - switching the VM state around. Currently only the old-style bdflush - sync does that. - - - "tsk->mm" points to the "real address space". For an anonymous process, - tsk->mm will be NULL, for the logical reason that an anonymous process - really doesn't _have_ a real address space at all. - - - however, we obviously need to keep track of which address space we - "stole" for such an anonymous user. For that, we have "tsk->active_mm", - which shows what the currently active address space is. - - The rule is that for a process with a real address space (ie tsk->mm is - non-NULL) the active_mm obviously always has to be the same as the real - one. - - For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the - "borrowed" mm while the anonymous process is running. When the - anonymous process gets scheduled away, the borrowed address space is - returned and cleared. 
- - To support all that, the "struct mm_struct" now has two counters: a - "mm_users" counter that is how many "real address space users" there are, - and a "mm_count" counter that is the number of "lazy" users (ie anonymous - users) plus one if there are any real users. - - Usually there is at least one real user, but it could be that the real - user exited on another CPU while a lazy user was still active, so you do - actually get cases where you have a address space that is _only_ used by - lazy users. That is often a short-lived state, because once that thread - gets scheduled away in favour of a real thread, the "zombie" mm gets - released because "mm_users" becomes zero. - - Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any - more. "init_mm" should be considered just a "lazy context when no other - context is available", and in fact it is mainly used just at bootup when - no real VM has yet been created. So code that used to check - - if (current->mm == &init_mm) - - should generally just do - - if (!current->mm) - - instead (which makes more sense anyway - the test is basically one of "do - we have a user context", and is generally done by the page fault handler - and things like that). - - Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, - because it slightly changes the interfaces to accommodate the alpha (who - would have thought it, but the alpha actually ends up having one of the - ugliest context switch codes - unlike the other architectures where the MM - and register state is separate, the alpha PALcode joins the two, and you - need to switch both together). - - (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/balance b/Documentation/vm/balance deleted file mode 100644 index 6a1fadf3e173..000000000000 --- a/Documentation/vm/balance +++ /dev/null @@ -1,102 +0,0 @@ -.. _balance: - -================ -Memory Balancing -================ - -Started Jan 2000 by Kanoj Sarcar - -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as -well as for non __GFP_IO allocations. - -The first reason why a caller may avoid reclaim is that the caller can not -sleep due to holding a spinlock or is in interrupt context. The second may -be that the caller is willing to fail the allocation without incurring the -overhead of page reclaim. This may happen for opportunistic high-order -allocation requests that have order-0 fallback options. In such cases, -the caller may also wish to avoid waking kswapd. - -__GFP_IO allocation requests are made to prevent file system deadlocks. - -In the absence of non sleepable allocation requests, it seems detrimental -to be doing balancing. Page reclamation can be kicked off lazily, that -is, only when needed (aka zone free memory is 0), instead of making it -a proactive process. - -That being said, the kernel should try to fulfill requests for direct -mapped pages from the direct mapped pool, instead of falling back on -the dma pool, so as to keep the dma pool filled for dma requests (atomic -or not). A similar argument applies to highmem and direct mapped pages. -OTOH, if there is a lot of free dma pages, it is preferable to satisfy -regular memory requests by allocating one from the dma pool, instead -of incurring the overhead of regular zone balancing. - -In 2.2, memory balancing/page reclamation would kick off only when the -_total_ number of free pages fell below 1/64 th of total memory. 
With the -right ratio of dma and regular memory, it is quite possible that balancing -would not be done even when the dma zone was completely empty. 2.2 has -been running production machines of varying memory sizes, and seems to be -doing fine even with the presence of this problem. In 2.3, due to -HIGHMEM, this problem is aggravated. - -In 2.3, zone balancing can be done in one of two ways: depending on the -zone size (and possibly of the size of lower class zones), we can decide -at init time how many free pages we should aim for while balancing any -zone. The good part is, while balancing, we do not need to look at sizes -of lower class zones, the bad part is, we might do too frequent balancing -due to ignoring possibly lower usage in the lower class zones. Also, -with a slight change in the allocation routine, it is possible to reduce -the memclass() macro to be a simple equality. - -Another possible solution is that we balance only when the free memory -of a zone _and_ all its lower class zones falls below 1/64th of the -total memory in the zone and its lower class zones. This fixes the 2.2 -balancing problem, and stays as close to 2.2 behavior as possible. Also, -the balancing algorithm works the same way on the various architectures, -which have different numbers and types of zones. If we wanted to get -fancy, we could assign different weights to free pages in different -zones in the future. - -Note that if the size of the regular zone is huge compared to dma zone, -it becomes less significant to consider the free dma pages while -deciding whether to balance the regular zone. The first solution -becomes more attractive then. - -The appended patch implements the second solution. It also "fixes" two -problems: first, kswapd is woken up as in 2.2 on low memory conditions -for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, -so as to give a fighting chance for replace_with_highmem() to get a -HIGHMEM page, as well as to ensure that HIGHMEM allocations do not -fall back into regular zone. This also makes sure that HIGHMEM pages -are not leaked (for example, in situations where a HIGHMEM page is in -the swapcache but is not being used by anyone) - -kswapd also needs to know about the zones it should balance. kswapd is -primarily needed in a situation where balancing can not be done, -probably because all allocation requests are coming from intr context -and all process contexts are sleeping. For 2.3, kswapd does not really -need to balance the highmem zone, since intr context does not request -highmem pages. kswapd looks at the zone_wake_kswapd field in the zone -structure to decide whether a zone needs balancing. - -Page stealing from process memory and shm is done if stealing the page would -alleviate memory pressure on any zone in the page's node that has fallen below -its watermark. - -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These -are per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below watermark[WMARK_MIN], the hysteric field -low_on_memory gets set. This stays set till the number of free pages becomes -watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will -try to free some pages in the zone (providing GFP_WAIT is set in the request). -Orthogonal to this, is the decision to poke kswapd to free some zone pages. 
-That decision is not hysteresis based, and is done when the number of free -pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. - - -(Good) Ideas that I have heard: - -1. Dynamic experience should influence balancing: number of failed requests - for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) -2. Implement a replace_with_highmem()-like replace_with_regular() to preserve - dma pages. (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst new file mode 100644 index 000000000000..6a1fadf3e173 --- /dev/null +++ b/Documentation/vm/balance.rst @@ -0,0 +1,102 @@ +.. _balance: + +================ +Memory Balancing +================ + +Started Jan 2000 by Kanoj Sarcar + +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. + +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. + +__GFP_IO allocation requests are made to prevent file system deadlocks. + +In the absence of non sleepable allocation requests, it seems detrimental +to be doing balancing. Page reclamation can be kicked off lazily, that +is, only when needed (aka zone free memory is 0), instead of making it +a proactive process. + +That being said, the kernel should try to fulfill requests for direct +mapped pages from the direct mapped pool, instead of falling back on +the dma pool, so as to keep the dma pool filled for dma requests (atomic +or not). A similar argument applies to highmem and direct mapped pages. +OTOH, if there is a lot of free dma pages, it is preferable to satisfy +regular memory requests by allocating one from the dma pool, instead +of incurring the overhead of regular zone balancing. + +In 2.2, memory balancing/page reclamation would kick off only when the +_total_ number of free pages fell below 1/64 th of total memory. With the +right ratio of dma and regular memory, it is quite possible that balancing +would not be done even when the dma zone was completely empty. 2.2 has +been running production machines of varying memory sizes, and seems to be +doing fine even with the presence of this problem. In 2.3, due to +HIGHMEM, this problem is aggravated. + +In 2.3, zone balancing can be done in one of two ways: depending on the +zone size (and possibly of the size of lower class zones), we can decide +at init time how many free pages we should aim for while balancing any +zone. The good part is, while balancing, we do not need to look at sizes +of lower class zones, the bad part is, we might do too frequent balancing +due to ignoring possibly lower usage in the lower class zones. Also, +with a slight change in the allocation routine, it is possible to reduce +the memclass() macro to be a simple equality. + +Another possible solution is that we balance only when the free memory +of a zone _and_ all its lower class zones falls below 1/64th of the +total memory in the zone and its lower class zones. This fixes the 2.2 +balancing problem, and stays as close to 2.2 behavior as possible. 
Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in
+the swapcache but is not being used by anyone)
+
+kswapd also needs to know about the zones it should balance. kswapd is
+primarily needed in a situation where balancing can not be done,
+probably because all allocation requests are coming from intr context
+and all process contexts are sleeping. For 2.3, kswapd does not really
+need to balance the highmem zone, since intr context does not request
+highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
+structure to decide whether a zone needs balancing.
+
+Page stealing from process memory and shm is done if stealing the page would
+alleviate memory pressure on any zone in the page's node that has fallen below
+its watermark.
+
+watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
+are per-zone fields, used to determine when a zone needs to be balanced. When
+the number of pages falls below watermark[WMARK_MIN], the hysteresis field
+low_on_memory gets set. This stays set till the number of free pages becomes
+watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
+try to free some pages in the zone (providing GFP_WAIT is set in the request).
+Orthogonal to this is the decision to poke kswapd to free some zone pages.
+That decision is not hysteresis based, and is done when the number of free
+pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
+
+
+(Good) Ideas that I have heard:
+
+1. Dynamic experience should influence balancing: number of failed requests
+   for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
+   dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/vm/cleancache.rst b/Documentation/vm/cleancache.rst
new file mode 100644
index 000000000000..68cba9131c31
--- /dev/null
+++ b/Documentation/vm/cleancache.rst
@@ -0,0 +1,296 @@
+.. _cleancache:
+
+==========
+Cleancache
+==========
+
+Motivation
+==========
+
+Cleancache is a new optional feature provided by the VFS layer that
+potentially dramatically increases page cache effectiveness for
+many workloads in many environments at a negligible cost.
+
+Cleancache can be thought of as a page-granularity victim cache for clean
+pages that the kernel's pageframe replacement algorithm (PFRA) would like
+to keep around, but can't since there isn't enough memory.
So when the +PFRA "evicts" a page, it first attempts to use cleancache code to +put the data contained in that page into "transcendent memory", memory +that is not directly accessible or addressable by the kernel and is +of unknown and possibly time-varying size. + +Later, when a cleancache-enabled filesystem wishes to access a page +in a file on disk, it first checks cleancache to see if it already +contains it; if it does, the page of data is copied into the kernel +and a disk access is avoided. + +Transcendent memory "drivers" for cleancache are currently implemented +in Xen (using hypervisor memory) and zcache (using in-kernel compressed +memory) and other implementations are in development. + +:ref:`FAQs ` are included below. + +Implementation Overview +======================= + +A cleancache "backend" that provides transcendent memory registers itself +to the kernel's cleancache "frontend" by calling cleancache_register_ops, +passing a pointer to a cleancache_ops structure with funcs set appropriately. +The functions provided must conform to certain semantics as follows: + +Most important, cleancache is "ephemeral". Pages which are copied into +cleancache have an indefinite lifetime which is completely unknowable +by the kernel and so may or may not still be in cleancache at any later time. +Thus, as its name implies, cleancache is not suitable for dirty pages. +Cleancache has complete discretion over what pages to preserve and what +pages to discard and when. + +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a +pool id which, if positive, must be saved in the filesystem's superblock; +a negative return value indicates failure. A "put_page" will copy a +(presumably about-to-be-evicted) page into cleancache and associate it with +the pool id, a file key, and a page index into the file. (The combination +of a pool id, a file key, and an index is sometimes called a "handle".) +A "get_page" will copy the page, if found, from cleancache into kernel memory. +An "invalidate_page" will ensure the page no longer is present in cleancache; +an "invalidate_inode" will invalidate all pages associated with the specified +file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate +all pages in all files specified by the given pool id and also surrender +the pool id. + +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache +to treat the pool as shared using a 128-bit UUID as a key. On systems +that may run multiple kernels (such as hard partitioned or virtualized +systems) that may share a clustered filesystem, and where cleancache +may be shared among those kernels, calls to init_shared_fs that specify the +same UUID will receive the same pool id, thus allowing the pages to +be shared. Note that any security requirements must be imposed outside +of the kernel (e.g. by "tools" that control cleancache). Or a +cleancache implementation can simply disable shared_init by always +returning a negative value. + +If a get_page is successful on a non-shared pool, the page is invalidated +(thus making cleancache an "exclusive" cache). On a shared pool, the page +is NOT invalidated on a successful get_page so that it remains accessible to +other sharers. The kernel is responsible for ensuring coherency between +cleancache (shared or not), the page cache, and the filesystem, using +cleancache invalidate operations as required. + +Note that cleancache must enforce put-put-get coherency and get-get +coherency. 
For the former, if two puts are made to the same handle but +with different data, say AAA by the first put and BBB by the second, a +subsequent get can never return the stale data (AAA). For get-get coherency, +if a get for a given handle fails, subsequent gets for that handle will +never succeed unless preceded by a successful put with that handle. + +Last, cleancache provides no SMP serialization guarantees; if two +different Linux threads are simultaneously putting and invalidating a page +with the same handle, the results are indeterminate. Callers must +lock the page to ensure serial behavior. + +Cleancache Performance Metrics +============================== + +If properly configured, monitoring of cleancache is done via debugfs in +the `/sys/kernel/debug/cleancache` directory. The effectiveness of cleancache +can be measured (across all filesystems) with: + +``succ_gets`` + number of gets that were successful + +``failed_gets`` + number of gets that failed + +``puts`` + number of puts attempted (all "succeed") + +``invalidates`` + number of invalidates attempted + +A backend implementation may provide additional metrics. + +.. _faq: + +FAQ +=== + +* Where's the value? (Andrew Morton) + +Cleancache provides a significant performance benefit to many workloads +in many environments with negligible overhead by improving the +effectiveness of the pagecache. Clean pagecache pages are +saved in transcendent memory (RAM that is otherwise not directly +addressable to the kernel); fetching those pages later avoids "refaults" +and thus disk reads. + +Cleancache (and its sister code "frontswap") provide interfaces for +this transcendent memory (aka "tmem"), which conceptually lies between +fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. +Disallowing direct kernel or userland reads/writes to tmem +is ideal when data is transformed to a different form and size (such +as with compression) or secretly moved (as might be useful for write- +balancing for some RAM-like devices). Evicted page-cache pages (and +swap pages) are a great use for this kind of slower-than-RAM-but-much- +faster-than-disk transcendent memory, and the cleancache (and frontswap) +"page-object-oriented" specification provides a nice way to read and +write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to +do it well with no kernel change have essentially failed (except in some +well-publicized special-case workloads). Cleancache -- and frontswap -- +with a fairly small impact on the kernel, provide a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), page cache pages +are the first to go, and cleancache allows those pages to be +saved and reclaimed if overall host system memory conditions allow. + +And the identical interface used for cleancache can be used in +physical systems as well. The zcache driver acts as a memory-hungry +device that stores pages of data in a compressed state. 
And +the proposed "RAMster" driver shares RAM across multiple physical +systems. + +* Why does cleancache have its sticky fingers so deep inside the + filesystems and VFS? (Andrew Morton and Christoph Hellwig) + +The core hooks for cleancache in VFS are in most cases a single line +and the minimum set are placed precisely where needed to maintain +coherency (via cleancache_invalidate operations) between cleancache, +the page cache, and disk. All hooks compile into nothingness if +cleancache is config'ed off and turn into a function-pointer- +compare-to-NULL if config'ed on but no backend claims the ops +functions, or to a compare-struct-element-to-negative if a +backend claims the ops functions but a filesystem doesn't enable +cleancache. + +Some filesystems are built entirely on top of VFS and the hooks +in VFS are sufficient, so don't require an "init_fs" hook; the +initial implementation of cleancache didn't provide this hook. +But for some filesystems (such as btrfs), the VFS hooks are +incomplete and one or more hooks in fs-specific code are required. +And for some other filesystems, such as tmpfs, cleancache may +be counterproductive. So it seemed prudent to require a filesystem +to "opt in" to use cleancache, which requires adding a hook in +each filesystem. Not all filesystems are supported by cleancache +only because they haven't been tested. The existing set should +be sufficient to validate the concept, the opt-in approach means +that untested filesystems are not affected, and the hooks in the +existing filesystems should make it very easy to add more +filesystems in the future. + +The total impact of the hooks to existing fs and mm files is only +about 40 lines added (not counting comments and blank lines). + +* Why not make cleancache asynchronous and batched so it can more + easily interface with real devices with DMA instead of copying each + individual page? (Minchan Kim) + +The one-page-at-a-time copy semantics simplifies the implementation +on both the frontend and backend and also allows the backend to +do fancy things on-the-fly like page compression and +page deduplication. And since the data is "gone" (copied into/out +of the pageframe) before the cleancache get/put call returns, +a great deal of race conditions and potential coherency issues +are avoided. While the interface seems odd for a "real device" +or for real kernel-addressable RAM, it makes perfect sense for +transcendent memory. + +* Why is non-shared cleancache "exclusive"? And where is the + page "invalidated" after a "get"? (Minchan Kim) + +The main reason is to free up space in transcendent memory and +to avoid unnecessary cleancache_invalidate calls. If you want inclusive, +the page can be "put" immediately following the "get". If +put-after-get for inclusive becomes common, the interface could +be easily extended to add a "get_no_invalidate" call. + +The invalidate is done by the cleancache backend implementation. + +* What's the performance impact? + +Performance analysis has been presented at OLS'09 and LCA'10. +Briefly, performance gains can be significant on most workloads, +especially when memory pressure is high (e.g. when RAM is +overcommitted in a virtual workload); and because the hooks are +invoked primarily in place of or in addition to a disk read/write, +overhead is negligible even in worst case workloads. 
Basically
+cleancache replaces I/O with memory-copy-CPU-overhead; on older
+single-core systems with slow memory-copy speeds, cleancache
+has little value, but in newer multicore machines, especially
+consolidated/virtualized machines, it has great value.
+
+* How do I add cleancache support for filesystem X? (Boaz Harrash)
+
+Filesystems that are well-behaved and conform to certain
+restrictions can utilize cleancache simply by making a call to
+cleancache_init_fs at mount time. Unusual, misbehaving, or
+poorly layered filesystems must either add additional hooks
+and/or undergo extensive additional testing... or should just
+not enable the optional cleancache.
+
+Some points for a filesystem to consider:
+
+  - The FS should be block-device-based (e.g. a ram-based FS such
+    as tmpfs should not enable cleancache)
+  - To ensure coherency/correctness, the FS must ensure that all
+    file removal or truncation operations either go through VFS or
+    add hooks to do the equivalent cleancache "invalidate" operations
+  - To ensure coherency/correctness, either inode numbers must
+    be unique across the lifetime of the on-disk file OR the
+    FS must provide an "encode_fh" function.
+  - The FS must call the VFS superblock alloc and deactivate routines
+    or add hooks to do the equivalent cleancache calls done there.
+  - To maximize performance, all pages fetched from the FS should
+    go through the do_mpage_readpage routine or the FS should add
+    hooks to do the equivalent (cf. btrfs)
+  - Currently, the FS blocksize must be the same as PAGESIZE. This
+    is not an architectural restriction, but no backends currently
+    support anything different.
+  - A clustered FS should invoke the "shared_init_fs" cleancache
+    hook to get best performance for some backends.
+
+* Why not use the KVA of the inode as the key? (Christoph Hellwig)
+
+If cleancache would use the inode virtual address instead of
+inode/filehandle, the pool id could be eliminated. But, this
+won't work because cleancache retains pagecache data pages
+persistently even when the inode has been pruned from the
+inode unused list, and only invalidates the data page if the file
+gets removed/truncated. So if cleancache used the inode kva,
+there would be potential coherency issues if/when the inode
+kva is reused for a different file. Alternately, if cleancache
+invalidated the pages when the inode kva was freed, much of the value
+of cleancache would be lost because the cache of pages in cleancache
+is potentially much larger than the kernel pagecache and is most
+useful if the pages survive inode cache removal.
+
+* Why is a global variable required?
+
+The cleancache_enabled flag is checked in all of the frequently-used
+cleancache hooks. The alternative is a function call to check a static
+variable. Since cleancache is enabled dynamically at runtime, systems
+that don't enable cleancache would suffer thousands (possibly
+tens-of-thousands) of unnecessary function calls per second. So the
+global variable allows cleancache to be enabled by default at compile
+time, but have insignificant performance impact when cleancache remains
+disabled at runtime.
+
+* Does cleancache work with KVM?
+
+The memory model of KVM is sufficiently different that a cleancache
+backend may have less value for KVM. This remains to be tested,
+especially in an overcommitted system.
+
+* Does cleancache work in userspace? It sounds useful for
+  memory hungry caches like web browsers.
(Jamie Lokier) + +No plans yet, though we agree it sounds useful, at least for +apps that bypass the page cache (e.g. O_DIRECT). + +Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt deleted file mode 100644 index 68cba9131c31..000000000000 --- a/Documentation/vm/cleancache.txt +++ /dev/null @@ -1,296 +0,0 @@ -.. _cleancache: - -========== -Cleancache -========== - -Motivation -========== - -Cleancache is a new optional feature provided by the VFS layer that -potentially dramatically increases page cache effectiveness for -many workloads in many environments at a negligible cost. - -Cleancache can be thought of as a page-granularity victim cache for clean -pages that the kernel's pageframe replacement algorithm (PFRA) would like -to keep around, but can't since there isn't enough memory. So when the -PFRA "evicts" a page, it first attempts to use cleancache code to -put the data contained in that page into "transcendent memory", memory -that is not directly accessible or addressable by the kernel and is -of unknown and possibly time-varying size. - -Later, when a cleancache-enabled filesystem wishes to access a page -in a file on disk, it first checks cleancache to see if it already -contains it; if it does, the page of data is copied into the kernel -and a disk access is avoided. - -Transcendent memory "drivers" for cleancache are currently implemented -in Xen (using hypervisor memory) and zcache (using in-kernel compressed -memory) and other implementations are in development. - -:ref:`FAQs ` are included below. - -Implementation Overview -======================= - -A cleancache "backend" that provides transcendent memory registers itself -to the kernel's cleancache "frontend" by calling cleancache_register_ops, -passing a pointer to a cleancache_ops structure with funcs set appropriately. -The functions provided must conform to certain semantics as follows: - -Most important, cleancache is "ephemeral". Pages which are copied into -cleancache have an indefinite lifetime which is completely unknowable -by the kernel and so may or may not still be in cleancache at any later time. -Thus, as its name implies, cleancache is not suitable for dirty pages. -Cleancache has complete discretion over what pages to preserve and what -pages to discard and when. - -Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a -pool id which, if positive, must be saved in the filesystem's superblock; -a negative return value indicates failure. A "put_page" will copy a -(presumably about-to-be-evicted) page into cleancache and associate it with -the pool id, a file key, and a page index into the file. (The combination -of a pool id, a file key, and an index is sometimes called a "handle".) -A "get_page" will copy the page, if found, from cleancache into kernel memory. -An "invalidate_page" will ensure the page no longer is present in cleancache; -an "invalidate_inode" will invalidate all pages associated with the specified -file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate -all pages in all files specified by the given pool id and also surrender -the pool id. - -An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache -to treat the pool as shared using a 128-bit UUID as a key. 
On systems -that may run multiple kernels (such as hard partitioned or virtualized -systems) that may share a clustered filesystem, and where cleancache -may be shared among those kernels, calls to init_shared_fs that specify the -same UUID will receive the same pool id, thus allowing the pages to -be shared. Note that any security requirements must be imposed outside -of the kernel (e.g. by "tools" that control cleancache). Or a -cleancache implementation can simply disable shared_init by always -returning a negative value. - -If a get_page is successful on a non-shared pool, the page is invalidated -(thus making cleancache an "exclusive" cache). On a shared pool, the page -is NOT invalidated on a successful get_page so that it remains accessible to -other sharers. The kernel is responsible for ensuring coherency between -cleancache (shared or not), the page cache, and the filesystem, using -cleancache invalidate operations as required. - -Note that cleancache must enforce put-put-get coherency and get-get -coherency. For the former, if two puts are made to the same handle but -with different data, say AAA by the first put and BBB by the second, a -subsequent get can never return the stale data (AAA). For get-get coherency, -if a get for a given handle fails, subsequent gets for that handle will -never succeed unless preceded by a successful put with that handle. - -Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and invalidating a page -with the same handle, the results are indeterminate. Callers must -lock the page to ensure serial behavior. - -Cleancache Performance Metrics -============================== - -If properly configured, monitoring of cleancache is done via debugfs in -the `/sys/kernel/debug/cleancache` directory. The effectiveness of cleancache -can be measured (across all filesystems) with: - -``succ_gets`` - number of gets that were successful - -``failed_gets`` - number of gets that failed - -``puts`` - number of puts attempted (all "succeed") - -``invalidates`` - number of invalidates attempted - -A backend implementation may provide additional metrics. - -.. _faq: - -FAQ -=== - -* Where's the value? (Andrew Morton) - -Cleancache provides a significant performance benefit to many workloads -in many environments with negligible overhead by improving the -effectiveness of the pagecache. Clean pagecache pages are -saved in transcendent memory (RAM that is otherwise not directly -addressable to the kernel); fetching those pages later avoids "refaults" -and thus disk reads. - -Cleancache (and its sister code "frontswap") provide interfaces for -this transcendent memory (aka "tmem"), which conceptually lies between -fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. -Disallowing direct kernel or userland reads/writes to tmem -is ideal when data is transformed to a different form and size (such -as with compression) or secretly moved (as might be useful for write- -balancing for some RAM-like devices). Evicted page-cache pages (and -swap pages) are a great use for this kind of slower-than-RAM-but-much- -faster-than-disk transcendent memory, and the cleancache (and frontswap) -"page-object-oriented" specification provides a nice way to read and -write -- and indirectly "name" -- the pages. - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. 
This is really hard to do with RAM and efforts to -do it well with no kernel change have essentially failed (except in some -well-publicized special-case workloads). Cleancache -- and frontswap -- -with a fairly small impact on the kernel, provide a huge amount -of flexibility for more dynamic, flexible RAM multiplexing. -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "self-ballooning"), page cache pages -are the first to go, and cleancache allows those pages to be -saved and reclaimed if overall host system memory conditions allow. - -And the identical interface used for cleancache can be used in -physical systems as well. The zcache driver acts as a memory-hungry -device that stores pages of data in a compressed state. And -the proposed "RAMster" driver shares RAM across multiple physical -systems. - -* Why does cleancache have its sticky fingers so deep inside the - filesystems and VFS? (Andrew Morton and Christoph Hellwig) - -The core hooks for cleancache in VFS are in most cases a single line -and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_invalidate operations) between cleancache, -the page cache, and disk. All hooks compile into nothingness if -cleancache is config'ed off and turn into a function-pointer- -compare-to-NULL if config'ed on but no backend claims the ops -functions, or to a compare-struct-element-to-negative if a -backend claims the ops functions but a filesystem doesn't enable -cleancache. - -Some filesystems are built entirely on top of VFS and the hooks -in VFS are sufficient, so don't require an "init_fs" hook; the -initial implementation of cleancache didn't provide this hook. -But for some filesystems (such as btrfs), the VFS hooks are -incomplete and one or more hooks in fs-specific code are required. -And for some other filesystems, such as tmpfs, cleancache may -be counterproductive. So it seemed prudent to require a filesystem -to "opt in" to use cleancache, which requires adding a hook in -each filesystem. Not all filesystems are supported by cleancache -only because they haven't been tested. The existing set should -be sufficient to validate the concept, the opt-in approach means -that untested filesystems are not affected, and the hooks in the -existing filesystems should make it very easy to add more -filesystems in the future. - -The total impact of the hooks to existing fs and mm files is only -about 40 lines added (not counting comments and blank lines). - -* Why not make cleancache asynchronous and batched so it can more - easily interface with real devices with DMA instead of copying each - individual page? (Minchan Kim) - -The one-page-at-a-time copy semantics simplifies the implementation -on both the frontend and backend and also allows the backend to -do fancy things on-the-fly like page compression and -page deduplication. And since the data is "gone" (copied into/out -of the pageframe) before the cleancache get/put call returns, -a great deal of race conditions and potential coherency issues -are avoided. While the interface seems odd for a "real device" -or for real kernel-addressable RAM, it makes perfect sense for -transcendent memory. - -* Why is non-shared cleancache "exclusive"? 
And where is the - page "invalidated" after a "get"? (Minchan Kim) - -The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_invalidate calls. If you want inclusive, -the page can be "put" immediately following the "get". If -put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_invalidate" call. - -The invalidate is done by the cleancache backend implementation. - -* What's the performance impact? - -Performance analysis has been presented at OLS'09 and LCA'10. -Briefly, performance gains can be significant on most workloads, -especially when memory pressure is high (e.g. when RAM is -overcommitted in a virtual workload); and because the hooks are -invoked primarily in place of or in addition to a disk read/write, -overhead is negligible even in worst case workloads. Basically -cleancache replaces I/O with memory-copy-CPU-overhead; on older -single-core systems with slow memory-copy speeds, cleancache -has little value, but in newer multicore machines, especially -consolidated/virtualized machines, it has great value. - -* How do I add cleancache support for filesystem X? (Boaz Harrash) - -Filesystems that are well-behaved and conform to certain -restrictions can utilize cleancache simply by making a call to -cleancache_init_fs at mount time. Unusual, misbehaving, or -poorly layered filesystems must either add additional hooks -and/or undergo extensive additional testing... or should just -not enable the optional cleancache. - -Some points for a filesystem to consider: - - - The FS should be block-device-based (e.g. a ram-based FS such - as tmpfs should not enable cleancache) - - To ensure coherency/correctness, the FS must ensure that all - file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "invalidate" operations - - To ensure coherency/correctness, either inode numbers must - be unique across the lifetime of the on-disk file OR the - FS must provide an "encode_fh" function. - - The FS must call the VFS superblock alloc and deactivate routines - or add hooks to do the equivalent cleancache calls done there. - - To maximize performance, all pages fetched from the FS should - go through the do_mpag_readpage routine or the FS should add - hooks to do the equivalent (cf. btrfs) - - Currently, the FS blocksize must be the same as PAGESIZE. This - is not an architectural restriction, but no backends currently - support anything different. - - A clustered FS should invoke the "shared_init_fs" cleancache - hook to get best performance for some backends. - -* Why not use the KVA of the inode as the key? (Christoph Hellwig) - -If cleancache would use the inode virtual address instead of -inode/filehandle, the pool id could be eliminated. But, this -won't work because cleancache retains pagecache data pages -persistently even when the inode has been pruned from the -inode unused list, and only invalidates the data page if the file -gets removed/truncated. So if cleancache used the inode kva, -there would be potential coherency issues if/when the inode -kva is reused for a different file. Alternately, if cleancache -invalidated the pages when the inode kva was freed, much of the value -of cleancache would be lost because the cache of pages in cleanache -is potentially much larger than the kernel pagecache and is most -useful if the pages survive inode cache removal. - -* Why is a global variable required? 
- -The cleancache_enabled flag is checked in all of the frequently-used -cleancache hooks. The alternative is a function call to check a static -variable. Since cleancache is enabled dynamically at runtime, systems -that don't enable cleancache would suffer thousands (possibly -tens-of-thousands) of unnecessary function calls per second. So the -global variable allows cleancache to be enabled by default at compile -time, but have insignificant performance impact when cleancache remains -disabled at runtime. - -* Does cleanache work with KVM? - -The memory model of KVM is sufficiently different that a cleancache -backend may have less value for KVM. This remains to be tested, -especially in an overcommitted system. - -* Does cleancache work in userspace? It sounds useful for - memory hungry caches like web browsers. (Jamie Lokier) - -No plans yet, though we agree it sounds useful, at least for -apps that bypass the page cache (e.g. O_DIRECT). - -Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst new file mode 100644 index 000000000000..1979f430c1c5 --- /dev/null +++ b/Documentation/vm/frontswap.rst @@ -0,0 +1,293 @@ +.. _frontswap: + +========= +Frontswap +========= + +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends" +and the only necessary changes to the core kernel for transcendent memory; +all other supporting code -- the "backends" -- is implemented as drivers. +See the LWN.net article `Transcendent memory in a nutshell`_ +for a detailed overview of frontswap and related kernel parts) + +.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). A "store" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "load" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further stores with that swap type. + +Once a page is successfully stored, a matching load on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. 
If the store returns
+success, the data has been successfully saved to transcendent memory and
+a disk write and, if the data is later read back, a disk read are avoided.
+If a store returns failure, transcendent memory has rejected the data, and the
+page can be written to swap as usual.
+
+If a backend chooses, frontswap can be configured as a "writethrough
+cache" by calling frontswap_writethrough(). In this mode, the reduction
+in swap device writes is lost (and also a non-trivial performance advantage)
+in order to allow the backend to arbitrarily "reclaim" space used to
+store frontswap pages to more completely manage its memory usage.
+
+Note that if a page is stored and the page already exists in transcendent
+memory (a "duplicate" store), either the store succeeds and the data is
+overwritten, or the store fails AND the page is invalidated. This ensures
+stale data may never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the ``/sys/kernel/debug/frontswap`` directory. The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+``failed_stores``
+  how many store attempts have failed
+
+``loads``
+  how many loads were attempted (all should succeed)
+
+``succ_stores``
+  how many store attempts have succeeded
+
+``invalidates``
+  how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+===
+
+* Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable by the kernel.
+This interface is ideal when data is transformed to a different form
+and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices). Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and
+cleancache) interface to transcendent memory provides a nice way to read
+and write -- and indirectly "name" -- the pages.
+
+With a fairly small impact on the kernel, frontswap -- and cleancache --
+provide a huge amount of flexibility for more dynamic, flexible RAM
+utilization in various system configurations:
+
+In the single kernel case, aka "zcache", pages are compressed and
+stored in local memory, thus increasing the total number of anonymous pages
+that can be safely kept in RAM. Zcache essentially trades off CPU
+cycles used in compression/decompression for better memory utilization.
+Benchmarks have shown little or no impact when memory pressure is
+low while providing a significant performance improvement (25%+)
+on some workloads under high memory pressure.
+
+"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
+support for clustered systems. Frontswap pages are locally compressed
+as in zcache, but then "remotified" to another system's RAM. This
+allows RAM to be dynamically load-balanced back-and-forth as needed,
+i.e. when system A is overcommitted, it can swap to system B, and
+vice versa. RAMster can also be configured as a memory server so
+many servers in a cluster can swap, dynamically as needed, to a single
+server configured with a large amount of RAM... without pre-configuring
+how much of the RAM is available for each of the clients!
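+
+For concreteness, the init/store/load/invalidate policies described earlier
+map onto an ops structure that a backend fills in and registers. The sketch
+below is modeled on include/linux/frontswap.h of this era; exact signatures
+(and the return convention of frontswap_register_ops()) have varied across
+kernel versions, so treat it as illustrative rather than authoritative::
+
+  /* Illustrative shape of a backend's ops (cf. include/linux/frontswap.h;
+   * details vary by kernel version).  "type" is the swap device number.
+   */
+  struct frontswap_ops {
+          void (*init)(unsigned type);            /* swapon: prepare for type */
+          int (*store)(unsigned type, pgoff_t offset,
+                       struct page *page);        /* 0 = accepted, else reject */
+          int (*load)(unsigned type, pgoff_t offset,
+                      struct page *page);         /* copy back; page stays put */
+          void (*invalidate_page)(unsigned type, pgoff_t offset);
+          void (*invalidate_area)(unsigned type); /* swapoff: drop all pages */
+  };
+
+  /* A backend registers its ops once during initialization, e.g.:
+   * frontswap_register_ops(&my_backend_ops);
+   */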
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines. This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization. And when guest OSes are induced to surrender
+underutilized RAM (e.g. with "selfballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM (if overall host system memory
+conditions allow), thus mitigating the potentially awful performance impact
+of unplanned swapping.
+
+A KVM implementation is underway and has been RFC'ed to lkml. And,
+using frontswap, investigation is also underway on the use of NVM as
+a memory extension technology.
+
+* Sure there may be performance advantages in some situations, but
+  what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'd
+swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, there is one extra comparison of a global variable against zero
+for every swap page read or written. If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "store"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd. This is added to the EIGHT bits (which
+were sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd. (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.) For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap.
+
+When swap pages are stored in transcendent memory instead of written
+out to disk, there is a side effect that this may create more memory
+pressure that can potentially outweigh the other advantages. A
+backend, such as zcache, must implement policies to carefully (but
+dynamically) manage memory limits to ensure this doesn't happen.
+
+* OK, how about a quick overview of what this frontswap patch does
+  in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel. Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd, frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "store" swap pages
+associated with that number.
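+
+The following paragraphs walk through the store and load hooks in detail.
+As a rough sketch of where the store hook sits, the swap write path consults
+frontswap before falling back to block I/O. This is a simplified paraphrase
+of mm/page_io.c from this era, not verbatim kernel code; error handling and
+helper names (e.g. __swap_writepage()) vary by kernel version::
+
+  int swap_writepage(struct page *page, struct writeback_control *wbc)
+  {
+          /* Offer the page to the frontswap backend first. */
+          if (frontswap_store(page) == 0) {
+                  /* Backend accepted it: no disk write is needed. */
+                  set_page_writeback(page);
+                  unlock_page(page);
+                  end_page_writeback(page);
+                  return 0;
+          }
+          /* Backend rejected it: write to the real swap device. */
+          return __swap_writepage(page, wbc, end_swap_bio_write);
+  }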
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (cf. swap_writepage()), frontswap_store is called. Frontswap
+consults with the frontswap backend and if the backend says it does NOT
+have room, frontswap_store returns -1 and the kernel swaps the page
+to the swap device as normal. Note that the response from the frontswap
+backend is unpredictable to the kernel; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page. But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data. In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_load() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend. If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete. If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device read
+and (potentially) a swap device write are replaced by a "frontswap backend
+store" and (possibly) a "frontswap backend load", which are presumably much
+faster.
+
+* Can't frontswap be configured as a "special" swap device that is
+  just higher priority than any real swap device (e.g. like zswap,
+  or maybe swap-over-nbd/NFS)?
+
+No. First, the existing swap subsystem doesn't allow for any kind of
+swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy,
+but this would require fairly drastic changes. Even if it were
+rewritten, the existing swap subsystem uses the block I/O layer which
+assumes a swap device is fixed size and any page in it is linearly
+addressable. Frontswap barely touches the existing swap subsystem,
+and works around the constraints of the block I/O subsystem to provide
+a great deal of flexibility and dynamicity.
+
+For example, the acceptance of any swap page by the frontswap backend is
+entirely unpredictable. This is critical to the definition of frontswap
+backends because it grants completely dynamic discretion to the
+backend. In zcache, one cannot know a priori how compressible a page is.
+"Poorly" compressible pages can be rejected, and "poorly" can itself be
+defined dynamically depending on current memory constraints.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O. The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time. Synchrony is
+required to ensure the dynamicity of the backend and to avoid thorny race
+conditions that would unnecessarily and greatly complicate frontswap
+and/or the block I/O subsystem. That said, only the initial "store"
+and "load" operations need be synchronous. A separate asynchronous thread
+is free to manipulate the pages stored by frontswap. For example,
+the "remotification" thread in RAMster uses standard asynchronous
+kernel sockets to move compressed frontswap pages to a remote machine.
+Similarly, a KVM guest-side implementation could do in-guest compression
+and use "batched" hypercalls.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit". For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.
+
+There is a downside to the transcendent memory specifications for
+frontswap: Since any "store" might fail, there must always be a real
+slot on a real swap device to swap the page. Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all. This means
+that frontswap cannot contain more pages than the total of swapon'd
+swap devices. For example, if NO swap device is configured on some
+installation, frontswap is useless. Swapless portable devices
+can still use frontswap but a backend for such devices must configure
+some kind of "ghost" swap device and ensure that it is never used.
+
+* Why this weird definition about "duplicate stores"? If a page
+  has been previously successfully stored, can't it always be
+  successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot. Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K. Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K. But the backend
+has no more space. In this case, the store must be rejected. Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible. Since the
+swap subsystem then writes the new data to the real swap device,
+this is the correct course of action to ensure coherency.
+
+* What is frontswap_shrink for?
+
+When the (non-frontswap) swap subsystem swaps out a page to a real
+swap device, that page is only taking up low-value pre-allocated disk
+space. But if frontswap has placed a page in transcendent memory, that
+page may be taking up valuable real estate. The frontswap_shrink
+routine allows code outside of the swap subsystem to force pages out
+of the memory managed by frontswap and back into kernel-addressable memory.
+For example, in RAMster, a "suction driver" thread will attempt
+to "repatriate" pages sent to a remote machine back to the local machine;
+this is driven using the frontswap_shrink mechanism when memory pressure
+subsides.
+
+* Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global. This seemed a reasonable compromise: Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt
deleted file mode 100644
index 1979f430c1c5..000000000000
--- a/Documentation/vm/frontswap.txt
+++ /dev/null
@@ -1,293 +0,0 @@
-.. _frontswap:
-
-=========
-Frontswap
-=========
-
-Frontswap provides a "transcendent memory" interface for swap pages.
-In some environments, dramatic performance savings may be obtained because
-swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
- -(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends" -and the only necessary changes to the core kernel for transcendent memory; -all other supporting code -- the "backends" -- is implemented as drivers. -See the LWN.net article `Transcendent memory in a nutshell`_ -for a detailed overview of frontswap and related kernel parts) - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap is so named because it can be thought of as the opposite of -a "backing" store for a swap device. The storage is assumed to be -a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming -to the requirements of transcendent memory (such as Xen's "tmem", or -in-kernel compressed memory, aka "zcache", or future RAM-like devices); -this pseudo-RAM device is not directly accessible or addressable by the -kernel and is of unknown and possibly time-varying size. The driver -links itself to frontswap by calling frontswap_register_ops to set the -frontswap_ops funcs appropriately and the functions it provides must -conform to certain policies as follows: - -An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "store" will -copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "load" will copy the page, if found, -from transcendent memory into kernel memory, but will NOT remove the page -from transcendent memory. An "invalidate_page" will remove the page -from transcendent memory and an "invalidate_area" will remove ALL pages -associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further stores with that swap type. - -Once a page is successfully stored, a matching load on the page will normally -succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the store returns -success, the data has been successfully saved to transcendent memory and -a disk write and, if the data is later read back, a disk read are avoided. -If a store returns failure, transcendent memory has rejected the data, and the -page can be written to swap as usual. - -If a backend chooses, frontswap can be configured as a "writethrough -cache" by calling frontswap_writethrough(). In this mode, the reduction -in swap device writes is lost (and also a non-trivial performance advantage) -in order to allow the backend to arbitrarily "reclaim" space used to -store frontswap pages to more completely manage its memory usage. - -Note that if a page is stored and the page already exists in transcendent memory -(a "duplicate" store), either the store succeeds and the data is overwritten, -or the store fails AND the page is invalidated. This ensures stale data may -never be obtained from frontswap. - -If properly configured, monitoring of frontswap is done via debugfs in -the `/sys/kernel/debug/frontswap` directory. The effectiveness of -frontswap can be measured (across all swap devices) with: - -``failed_stores`` - how many store attempts have failed - -``loads`` - how many loads were attempted (all should succeed) - -``succ_stores`` - how many store attempts have succeeded - -``invalidates`` - how many invalidates were attempted - -A backend implementation may provide additional metrics. - -FAQ -=== - -* Where's the value? - -When a workload starts swapping, performance falls through the floor. 
-Frontswap significantly increases performance in many such workloads by -providing a clean, dynamic interface to read and write swap pages to -"transcendent memory" that is otherwise not directly addressable to the kernel. -This interface is ideal when data is transformed to a different form -and size (such as with compression) or secretly moved (as might be -useful for write-balancing for some RAM-like devices). Swap pages (and -evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and -cleancache) interface to transcendent memory provides a nice way to read -and write -- and indirectly "name" -- the pages. - -Frontswap -- and cleancache -- with a fairly small impact on the kernel, -provides a huge amount of flexibility for more dynamic, flexible RAM -utilization in various system configurations: - -In the single kernel case, aka "zcache", pages are compressed and -stored in local memory, thus increasing the total anonymous pages -that can be safely kept in RAM. Zcache essentially trades off CPU -cycles used in compression/decompression for better memory utilization. -Benchmarks have shown little or no impact when memory pressure is -low while providing a significant performance improvement (25%+) -on some workloads under high memory pressure. - -"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory -support for clustered systems. Frontswap pages are locally compressed -as in zcache, but then "remotified" to another system's RAM. This -allows RAM to be dynamically load-balanced back-and-forth as needed, -i.e. when system A is overcommitted, it can swap to system B, and -vice versa. RAMster can also be configured as a memory server so -many servers in a cluster can swap, dynamically as needed, to a single -server configured with a large amount of RAM... without pre-configuring -how much of the RAM is available for each of the clients! - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to do -it well with no kernel changes have essentially failed (except in some -well-publicized special-case workloads). -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "selfballooning"), sudden unexpected -memory pressure may result in swapping; frontswap allows those pages -to be swapped to and from hypervisor RAM (if overall host system memory -conditions allow), thus mitigating the potentially awful performance impact -of unplanned swapping. - -A KVM implementation is underway and has been RFC'ed to lkml. And, -using frontswap, investigation is also underway on the use of NVM as -a memory extension technology. - -* Sure there may be performance advantages in some situations, but - what's the space/time overhead of frontswap? - -If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into -nothingness and the only overhead is a few extra bytes per swapon'ed -swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" -registers, there is one extra global variable compared to zero for -every swap page read or written. 
If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "store" -request (i.e. provides no memory despite claiming it might), -CPU overhead is still negligible -- and since every frontswap fail -precedes a swap page write-to-disk, the system is highly likely -to be I/O bound and using a small fraction of a percent of a CPU -will be irrelevant anyway. - -As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend -registers, one bit is allocated for every swap page for every swap -device that is swapon'd. This is added to the EIGHT bits (which -was sixteen until about 2.6.34) that the kernel already allocates -for every swap page for every swap device that is swapon'd. (Hugh -Dickins has observed that frontswap could probably steal one of -the existing eight bits, but let's worry about that minor optimization -later.) For very large swap disks (which are rare) on a standard -4K pagesize, this is 1MB per 32GB swap. - -When swap pages are stored in transcendent memory instead of written -out to disk, there is a side effect that this may create more memory -pressure that can potentially outweigh the other advantages. A -backend, such as zcache, must implement policies to carefully (but -dynamically) manage memory limits to ensure this doesn't happen. - -* OK, how about a quick overview of what this frontswap patch does - in terms that a kernel hacker can grok? - -Let's assume that a frontswap "backend" has registered during -kernel initialization; this registration indicates that this -frontswap backend has access to some "memory" that is not directly -accessible by the kernel. Exactly how much memory it provides is -entirely dynamic and random. - -Whenever a swap-device is swapon'd frontswap_init() is called, -passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "store" swap pages -associated with that number. - -Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_store is called. Frontswap -consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_store returns -1 and the kernel swaps the page -to the swap device as normal. Note that the response from the frontswap -backend is unpredictable to the kernel; it may choose to never accept a -page, it could accept every ninth page, or it might accept every -page. But if the backend does accept a page, the data from the page -has already been copied and associated with the type and offset, -and the backend guarantees the persistence of the data. In this case, -frontswap sets a bit in the "frontswap_map" for the swap device -corresponding to the page offset on the swap device to which it would -otherwise have written the data. - -When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_load() which checks the frontswap_map to -see if the page was earlier accepted by the frontswap backend. If -it was, the page of data is filled from the frontswap backend and -the swap-in is complete. If not, the normal swap-in code is -executed to obtain the page of data from the real swap device. - -So every time the frontswap backend accepts a page, a swap device read -and (potentially) a swap device write are replaced by a "frontswap backend -store" and (possibly) a "frontswap backend loads", which are presumably much -faster. 
- -* Can't frontswap be configured as a "special" swap device that is - just higher priority than any real swap device (e.g. like zswap, - or maybe swap-over-nbd/NFS)? - -No. First, the existing swap subsystem doesn't allow for any kind of -swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, -but this would require fairly drastic changes. Even if it were -rewritten, the existing swap subsystem uses the block I/O layer which -assumes a swap device is fixed size and any page in it is linearly -addressable. Frontswap barely touches the existing swap subsystem, -and works around the constraints of the block I/O subsystem to provide -a great deal of flexibility and dynamicity. - -For example, the acceptance of any swap page by the frontswap backend is -entirely unpredictable. This is critical to the definition of frontswap -backends because it grants completely dynamic discretion to the -backend. In zcache, one cannot know a priori how compressible a page is. -"Poorly" compressible pages can be rejected, and "poorly" can itself be -defined dynamically depending on current memory constraints. - -Further, frontswap is entirely synchronous whereas a real swap -device is, by definition, asynchronous and uses block I/O. The -block I/O layer is not only unnecessary, but may perform "optimizations" -that are inappropriate for a RAM-oriented device including delaying -the write of some pages for a significant amount of time. Synchrony is -required to ensure the dynamicity of the backend and to avoid thorny race -conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "store" -and "load" operations need be synchronous. A separate asynchronous thread -is free to manipulate the pages stored by frontswap. For example, -the "remotification" thread in RAMster uses standard asynchronous -kernel sockets to move compressed frontswap pages to a remote machine. -Similarly, a KVM guest-side implementation could do in-guest compression -and use "batched" hypercalls. - -In a virtualized environment, the dynamicity allows the hypervisor -(or host OS) to do "intelligent overcommit". For example, it can -choose to accept pages only until host-swapping might be imminent, -then force guests to do their own swapping. - -There is a downside to the transcendent memory specifications for -frontswap: Since any "store" might fail, there must always be a real -slot on a real swap device to swap the page. Thus frontswap must be -implemented as a "shadow" to every swapon'd device with the potential -capability of holding every page that the swap device might have held -and the possibility that it might hold no pages at all. This means -that frontswap cannot contain more pages than the total of swapon'd -swap devices. For example, if NO swap device is configured on some -installation, frontswap is useless. Swapless portable devices -can still use frontswap but a backend for such devices must configure -some kind of "ghost" swap device and ensure that it is never used. - -* Why this weird definition about "duplicate stores"? If a page - has been previously successfully stored, can't it always be - successfully overwritten? - -Nearly always it can, but no, sometimes it cannot. Consider an example -where data is compressed and the original 4K page has been compressed -to 1K. Now an attempt is made to overwrite the page with data that -is non-compressible and so would take the entire 4K. But the backend -has no more space. 
In this case, the store must be rejected. Whenever -frontswap rejects a store that would overwrite, it also must invalidate -the old data and ensure that it is no longer accessible. Since the -swap subsystem then writes the new data to the read swap device, -this is the correct course of action to ensure coherency. - -* What is frontswap_shrink for? - -When the (non-frontswap) swap subsystem swaps out a page to a real -swap device, that page is only taking up low-value pre-allocated disk -space. But if frontswap has placed a page in transcendent memory, that -page may be taking up valuable real estate. The frontswap_shrink -routine allows code outside of the swap subsystem to force pages out -of the memory managed by frontswap and back into kernel-addressable memory. -For example, in RAMster, a "suction driver" thread will attempt -to "repatriate" pages sent to a remote machine back to the local machine; -this is driven using the frontswap_shrink mechanism when memory pressure -subsides. - -* Why does the frontswap patch create the new include file swapfile.h? - -The frontswap code depends on some swap-subsystem-internal data -structures that have, over the years, moved back and forth between -static and global. This seemed a reasonable compromise: Define -them as global but declare them in a new include file that isn't -included by the large number of source files that include swap.h. - -Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst new file mode 100644 index 000000000000..0f69a9fec34d --- /dev/null +++ b/Documentation/vm/highmem.rst @@ -0,0 +1,147 @@ +.. _highmem: + +==================== +High Memory Handling +==================== + +By: Peter Zijlstra + +.. contents:: :local: + +What Is High Memory? +==================== + +High memory (highmem) is used when the size of physical memory approaches or +exceeds the maximum size of virtual memory. At that point it becomes +impossible for the kernel to keep all of the available physical memory mapped +at all times. This means the kernel needs to start using temporary mappings of +the pieces of physical memory that it wants to access. + +The part of (physical) memory not covered by a permanent mapping is what we +refer to as 'highmem'. There are various architecture dependent constraints on +where exactly that border lies. + +In the i386 arch, for example, we choose to map the kernel into every process's +VM space so that we don't have to pay the full TLB invalidation costs for +kernel entry/exit. This means the available virtual memory space (4GiB on +i386) has to be divided between user and kernel space. + +The traditional split for architectures using this approach is 3:1, 3GiB for +userspace and the top 1GiB for kernel space:: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +This means that the kernel can at most map 1GiB of physical memory at any one +time, but because we need virtual address space for other things - including +temporary maps to access the rest of the physical memory - the actual direct +map will typically be less (usually around ~896MiB). + +Other architectures that have mm context tagged TLBs can have separate kernel +and user maps. Some hardware (like some ARMs), however, have limited virtual +space when they use mm context tags. + + +Temporary Virtual Mappings +========================== + +The kernel contains several ways of creating temporary mappings: + +* vmap(). 
This can be used to make a long duration mapping of multiple
+  physical pages into a contiguous virtual space. It needs global
+  synchronization to unmap.
+
+* kmap(). This permits a short duration mapping of a single page. It needs
+  global synchronization, but is amortized somewhat. It is also prone to
+  deadlocks when used in a nested fashion, and so it is not recommended for
+  new code.
+
+* kmap_atomic(). This permits a very short duration mapping of a single
+  page. Since the mapping is restricted to the CPU that issued it, it
+  performs well, but the issuing task is therefore required to stay on that
+  CPU until it has finished, lest some other task displace its mappings.
+
+  kmap_atomic() may also be used by interrupt contexts, since it does not
+  sleep and the caller may not sleep until after kunmap_atomic() is called.
+
+  It may be assumed that k[un]map_atomic() won't fail.
+
+
+Using kmap_atomic
+=================
+
+When and where to use kmap_atomic() is straightforward. It is used when code
+wants to access the contents of a page that might be allocated from high memory
+(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two
+functions, and they can be used in a manner similar to the following::
+
+  /* Find the page of interest. */
+  struct page *page = find_get_page(mapping, offset);
+
+  /* Gain access to the contents of that page. */
+  void *vaddr = kmap_atomic(page);
+
+  /* Do something to the contents of that page. */
+  memset(vaddr, 0, PAGE_SIZE);
+
+  /* Unmap that page. */
+  kunmap_atomic(vaddr);
+
+Note that the kunmap_atomic() call takes the result of the kmap_atomic() call
+not the argument.
+
+If you need to map two pages because you want to copy from one page to
+another you need to keep the kmap_atomic calls strictly nested, like::
+
+  vaddr1 = kmap_atomic(page1);
+  vaddr2 = kmap_atomic(page2);
+
+  memcpy(vaddr1, vaddr2, PAGE_SIZE);
+
+  kunmap_atomic(vaddr2);
+  kunmap_atomic(vaddr1);
+
+
+Cost of Temporary Mappings
+==========================
+
+The cost of creating temporary mappings can be quite high. The arch has to
+manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
+
+If CONFIG_HIGHMEM is not set, then the kernel will try to create a mapping
+simply with a bit of arithmetic that will convert the page struct address into
+a pointer to the page contents rather than juggling mappings about (see the
+sketch after the PAE notes below). In such a case, the unmap operation may be
+a null operation.
+
+If CONFIG_MMU is not set, then there can be no temporary mappings and no
+highmem. In such a case, the arithmetic approach will also be used.
+
+
+i386 PAE
+========
+
+The i386 arch, under some circumstances, will permit you to stick up to 64GiB
+of RAM into your 32-bit machine. This has a number of consequences:
+
+* Linux needs a page-frame structure for each page in the system and the
+  pageframes need to live in the permanent mapping, which means:
+
+* you can have 896M/sizeof(struct page) page-frames at most; with struct
+  page being 32 bytes that would end up being something on the order of 112G
+  worth of pages; the kernel, however, needs to store more than just
+  page-frames in that memory...
+
+* PAE makes your page tables larger - which slows the system down as more
+  data has to be accessed when traversing page tables in TLB fills and the
+  like. One advantage is that PAE has more PTE bits and can provide advanced
+  features like NX and PAT.
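+
+As a concrete illustration of the "bit of arithmetic" mentioned under Cost of
+Temporary Mappings above: with no highmem, every page is permanently mapped,
+so a kmap()-style call can reduce to address arithmetic. This sketch follows
+lowmem_page_address() in include/linux/mm.h; details vary by architecture and
+kernel version::
+
+  /* Sketch: directly compute the kernel virtual address of a
+   * permanently-mapped (lowmem) page -- no mapping juggling needed.
+   */
+  static inline void *lowmem_page_address(const struct page *page)
+  {
+          return __va(page_to_pfn(page) << PAGE_SHIFT);
+  }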
+ +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. diff --git a/Documentation/vm/highmem.txt b/Documentation/vm/highmem.txt deleted file mode 100644 index 0f69a9fec34d..000000000000 --- a/Documentation/vm/highmem.txt +++ /dev/null @@ -1,147 +0,0 @@ -.. _highmem: - -==================== -High Memory Handling -==================== - -By: Peter Zijlstra - -.. contents:: :local: - -What Is High Memory? -==================== - -High memory (highmem) is used when the size of physical memory approaches or -exceeds the maximum size of virtual memory. At that point it becomes -impossible for the kernel to keep all of the available physical memory mapped -at all times. This means the kernel needs to start using temporary mappings of -the pieces of physical memory that it wants to access. - -The part of (physical) memory not covered by a permanent mapping is what we -refer to as 'highmem'. There are various architecture dependent constraints on -where exactly that border lies. - -In the i386 arch, for example, we choose to map the kernel into every process's -VM space so that we don't have to pay the full TLB invalidation costs for -kernel entry/exit. This means the available virtual memory space (4GiB on -i386) has to be divided between user and kernel space. - -The traditional split for architectures using this approach is 3:1, 3GiB for -userspace and the top 1GiB for kernel space:: - - +--------+ 0xffffffff - | Kernel | - +--------+ 0xc0000000 - | | - | User | - | | - +--------+ 0x00000000 - -This means that the kernel can at most map 1GiB of physical memory at any one -time, but because we need virtual address space for other things - including -temporary maps to access the rest of the physical memory - the actual direct -map will typically be less (usually around ~896MiB). - -Other architectures that have mm context tagged TLBs can have separate kernel -and user maps. Some hardware (like some ARMs), however, have limited virtual -space when they use mm context tags. - - -Temporary Virtual Mappings -========================== - -The kernel contains several ways of creating temporary mappings: - -* vmap(). This can be used to make a long duration mapping of multiple - physical pages into a contiguous virtual space. It needs global - synchronization to unmap. - -* kmap(). This permits a short duration mapping of a single page. It needs - global synchronization, but is amortized somewhat. It is also prone to - deadlocks when using in a nested fashion, and so it is not recommended for - new code. - -* kmap_atomic(). This permits a very short duration mapping of a single - page. Since the mapping is restricted to the CPU that issued it, it - performs well, but the issuing task is therefore required to stay on that - CPU until it has finished, lest some other task displace its mappings. - - kmap_atomic() may also be used by interrupt contexts, since it is does not - sleep and the caller may not sleep until after kunmap_atomic() is called. - - It may be assumed that k[un]map_atomic() won't fail. - - -Using kmap_atomic -================= - -When and where to use kmap_atomic() is straightforward. It is used when code -wants to access the contents of a page that might be allocated from high memory -(see __GFP_HIGHMEM), for example a page in the pagecache. 
The API has two
-functions, and they can be used in a manner similar to the following::
-
-  /* Find the page of interest. */
-  struct page *page = find_get_page(mapping, offset);
-
-  /* Gain access to the contents of that page. */
-  void *vaddr = kmap_atomic(page);
-
-  /* Do something to the contents of that page. */
-  memset(vaddr, 0, PAGE_SIZE);
-
-  /* Unmap that page. */
-  kunmap_atomic(vaddr);
-
-Note that the kunmap_atomic() call takes the result of the kmap_atomic() call
-not the argument.
-
-If you need to map two pages because you want to copy from one page to
-another you need to keep the kmap_atomic calls strictly nested, like::
-
-  vaddr1 = kmap_atomic(page1);
-  vaddr2 = kmap_atomic(page2);
-
-  memcpy(vaddr1, vaddr2, PAGE_SIZE);
-
-  kunmap_atomic(vaddr2);
-  kunmap_atomic(vaddr1);
-
-
-Cost of Temporary Mappings
-==========================
-
-The cost of creating temporary mappings can be quite high. The arch has to
-manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
-
-If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping
-simply with a bit of arithmetic that will convert the page struct address into
-a pointer to the page contents rather than juggling mappings about. In such a
-case, the unmap operation may be a null operation.
-
-If CONFIG_MMU is not set, then there can be no temporary mappings and no
-highmem. In such a case, the arithmetic approach will also be used.
-
-
-i386 PAE
-========
-
-The i386 arch, under some circumstances, will permit you to stick up to 64GiB
-of RAM into your 32-bit machine. This has a number of consequences:
-
-* Linux needs a page-frame structure for each page in the system and the
-  pageframes need to live in the permanent mapping, which means:
-
-* you can have 896M/sizeof(struct page) page-frames at most; with struct
-  page being 32-bytes that would end up being something in the order of 112G
-  worth of pages; the kernel, however, needs to store more than just
-  page-frames in that memory...
-
-* PAE makes your page tables larger - which slows the system down as more
-  data has to be accessed to traverse in TLB fills and the like. One
-  advantage is that PAE has more PTE bits and can provide advanced features
-  like NX and PAT.
-
-The general recommendation is that you don't use more than 8GiB on a 32-bit
-machine - although more might work for you and your workload, you're pretty
-much on your own - don't expect kernel developers to really care much if things
-come apart.
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
new file mode 100644
index 000000000000..3fafa3381730
--- /dev/null
+++ b/Documentation/vm/hmm.rst
@@ -0,0 +1,374 @@
+.. _hmm:
+
+=====================================
+Heterogeneous Memory Management (HMM)
+=====================================
+
+Transparently allow any component of a program to use any memory region of
+said program with a device, without using a device specific memory allocator.
+This is becoming a requirement to simplify the use of advanced heterogeneous
+computing where GPUs, DSPs or FPGAs are used to perform various computations.
+
+This document is divided as follows: in the first section I expose the
+problems related to the use of a device specific memory allocator. The second
+section covers the hardware limitations that are inherent to many platforms.
+The third section gives an overview of the HMM design. The fourth section
+explains how CPU page table mirroring works and what HMM's purpose is in
+this context.
+The fifth section deals with how device memory is represented inside the
+kernel. Finally, the last section presents the new migration helper that
+allows leveraging the device DMA engine.
+
+.. contents:: :local:
+
+Problems of using a device specific memory allocator
+====================================================
+
+Devices with a large amount of on-board memory (several gigabytes) like GPUs
+have historically managed their memory through dedicated driver specific
+APIs. This creates a disconnect between memory allocated and managed by the
+device driver and regular application memory (private anonymous, shared
+memory or regular file backed memory). From here on I will refer to this
+aspect as split address space. I use shared address space to refer to the
+opposite situation, i.e. one in which any memory region can be used by a
+device transparently.
+
+The address space is split because a device can only access memory allocated
+through the device specific API. This implies that all memory objects in a
+program are not equal from the device point of view, which complicates large
+programs that rely on a wide set of libraries.
+
+Concretely, this means that code that wants to leverage a device like a GPU
+needs to copy objects between generically allocated memory (malloc, private
+or shared mmap) and memory allocated through the device driver API (this
+still ends up with an mmap, but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve
+but complex data sets (list, tree, ...) are hard to get right. Duplicating a
+complex data set needs to re-map all the pointer relations between each of
+its elements. This is error prone and programs get harder to debug because
+of the duplicated data set.
+
+Split address space also means that libraries cannot transparently use data
+they are getting from the core program or from another library and thus each
+library might have to duplicate its input data set using a specific memory
+allocator. Large projects suffer from this and waste resources because of
+the various memory copies.
+
+Duplicating each library API to accept as input or output memory allocated
+by each device specific allocator is not a viable option. It would lead to a
+combinatorial explosion in the library entry points.
+
+Finally, with the advance of high level language constructs (in C++ but in
+other languages too) it is now possible for the compiler to leverage GPUs
+and other devices without programmer knowledge. Some compiler identified
+patterns are only doable with a shared address space. It is also more
+reasonable to use a shared address space for all other patterns.
+
+
+System bus, device memory characteristics
+=========================================
+
+System buses cripple shared address spaces due to a few limitations. Most
+system buses only allow basic memory access from device to main memory; even
+cache coherency is often optional. Access to device memory from a CPU is
+even more limited. More often than not, it is not cache coherent.
+
+If we only consider the PCIE bus, then a device can access main memory
+(often through an IOMMU) and be cache coherent with the CPUs. However, it
+only allows a limited set of atomic operations from the device on main
+memory. This is worse in the other direction: the CPU can only access a
+limited range of the device memory and cannot perform atomic operations on
+it. Thus device memory cannot be considered the same as regular memory from
+the kernel point of view.
+
+Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
+and 16 lanes).
+This is 33 times less than the fastest GPU memory (1 TBytes/s). The final
+limitation is latency: access to main memory from the device has an order of
+magnitude higher latency than when the device accesses its own memory.
+
+Some platforms are developing new system buses or additions/modifications to
+PCIE to address some of those limitations (OpenCAPI, CCIX). They mainly
+allow two-way cache coherency between CPU and device and allow all the
+atomic operations the architecture supports. Sadly, not all platforms are
+following this trend and some major architectures are left without hardware
+solutions to those problems.
+
+So for the shared address space to make sense, not only must we allow the
+device to access any memory, but we must also permit any memory to be
+migrated to device memory while the device is using it (blocking CPU access
+while it happens).
+
+
+Shared address space and migration
+==================================
+
+HMM intends to provide two main features. The first one is to share the
+address space by duplicating the CPU page table in the device page table, so
+the same address points to the same memory, and this for any valid main
+memory address in the process address space.
+
+To achieve this, HMM offers a set of helpers to populate the device page
+table while keeping track of CPU page table updates. Device page table
+updates are not as easy as CPU page table updates. To update the device page
+table, you must allocate a buffer (or use a pool of pre-allocated buffers)
+and write GPU specific commands in it to perform the update (unmap, cache
+invalidations, flush, ...). This cannot be done through common code for all
+devices. Hence why HMM provides helpers to factor out everything that can be
+while leaving the gory details to the device driver.
+
+The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
+allows allocating a struct page for each page of the device memory. Those
+pages are special because the CPU cannot map them. However, they allow
+migrating main memory to device memory using the existing migration
+mechanisms, and from the CPU point of view everything looks like a page that
+was swapped out to disk. Using a struct page gives the easiest and cleanest
+integration with existing mm mechanisms. Here again, HMM only provides
+helpers: first to hotplug new ZONE_DEVICE memory for the device memory and
+second to perform migration. Policy decisions of what and when to migrate
+things are left to the device driver.
+
+Note that any CPU access to a device page triggers a page fault and a
+migration back to main memory. That is, when a page backing a given address
+A is migrated from a main memory page to a device page, then any CPU access
+to address A triggers a page fault and initiates a migration back to main
+memory.
+
+
+With these two features, HMM not only allows a device to mirror a process
+address space and keep both the CPU and device page tables synchronized, but
+also allows leveraging device memory by migrating the part of the data set
+that is actively being used by a device.
+
+
+Address space mirroring implementation and API
+==============================================
+
+Address space mirroring's main objective is to allow duplication of a range
+of CPU page table entries into a device page table; HMM helps keep both
+synchronized.
A +device driver that want to mirror a process address space must start with the +registration of an hmm_mirror struct:: + + int hmm_mirror_register(struct hmm_mirror *mirror, + struct mm_struct *mm); + int hmm_mirror_register_locked(struct hmm_mirror *mirror, + struct mm_struct *mm); + +The locked variant is to be use when the driver is already holding the mmap_sem +of the mm in write mode. The mirror struct has a set of callback that are use +to propagate CPU page table:: + + struct hmm_mirror_ops { + /* sync_cpu_device_pagetables() - synchronize page tables + * + * @mirror: pointer to struct hmm_mirror + * @update_type: type of update that occurred to the CPU page table + * @start: virtual start address of the range to update + * @end: virtual end address of the range to update + * + * This callback ultimately originates from mmu_notifiers when the CPU + * page table is updated. The device driver must update its page table + * in response to this callback. The update argument tells what action + * to perform. + * + * The device driver must not return from this callback until the device + * page tables are completely updated (TLBs flushed, etc); this is a + * synchronous call. + */ + void (*update)(struct hmm_mirror *mirror, + enum hmm_update action, + unsigned long start, + unsigned long end); + }; + +Device driver must perform update to the range following action (turn range +read only, or fully unmap, ...). Once driver callback returns the device must +be done with the update. + + +When device driver wants to populate a range of virtual address it can use +either:: + + int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns); + int hmm_vma_fault(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns, + bool write, + bool block); + +First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and +will not trigger a page fault on missing or non present entry. The second one +do trigger page fault on missing or read only entry if write parameter is true. +Page fault use the generic mm page fault code path just like a CPU page fault. + +Both function copy CPU page table into their pfns array argument. Each entry in +that array correspond to an address in the virtual range. HMM provide a set of +flags to help driver identify special CPU page table entries. + +Locking with the update() callback is the most important aspect the driver must +respect in order to keep things properly synchronize. The usage pattern is:: + + int driver_populate_range(...) + { + struct hmm_range range; + ... + again: + ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); + if (ret) + return ret; + take_lock(driver->update); + if (!hmm_vma_range_done(vma, &range)) { + release_lock(driver->update); + goto again; + } + + // Use pfns array content to update device page table + + release_lock(driver->update); + return 0; + } + +The driver->update lock is the same lock that driver takes inside its update() +callback. That lock must be call before hmm_vma_range_done() to avoid any race +with a concurrent CPU page table update. + +HMM implements all this on top of the mmu_notifier API because we wanted to a +simpler API and also to be able to perform optimization latter own like doing +concurrent device update in multi-devices scenario. 
+
+HMM also serves as an impedance mismatch between how CPU page table updates
+are done (by the CPU writing to the page table and flushing TLBs) and how
+devices update their own page table. A device update is a multi-step
+process: first, appropriate commands are written to a buffer, then this
+buffer is scheduled for execution on the device. It is only once the device
+has executed the commands in the buffer that the update is done. Creating
+and scheduling the update command buffers can happen concurrently for
+multiple devices. Waiting for each device to report commands as executed is
+serialized (there is no point in doing this concurrently).
+
+
+Represent and manage device memory from core kernel point of view
+==================================================================
+
+Several different designs were tried to support device memory. The first one
+used a device specific data structure to keep information about migrated
+memory and HMM hooked itself in various places of the mm code to handle any
+access to addresses that were backed by device memory. It turns out that
+this ended up replicating most of the fields of struct page and also needed
+many kernel code paths to be updated to understand this new kind of memory.
+
+The thing is, most kernel code paths never try to access the memory behind a
+page but only care about struct page contents. Because of this, HMM switched
+to directly using struct page for device memory, which left most kernel code
+paths unaware of the difference. We only need to make sure that no one ever
+tries to map those pages from the CPU side.
+
+HMM provides a set of helpers to register and hotplug device memory as a new
+region needing a struct page. This is offered through a very simple API::
+
+ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+                                   struct device *device,
+                                   unsigned long size);
+ void hmm_devmem_remove(struct hmm_devmem *devmem);
+
+The hmm_devmem_ops is where most of the important things are::
+
+ struct hmm_devmem_ops {
+     void (*free)(struct hmm_devmem *devmem, struct page *page);
+     int (*fault)(struct hmm_devmem *devmem,
+                  struct vm_area_struct *vma,
+                  unsigned long addr,
+                  struct page *page,
+                  unsigned flags,
+                  pmd_t *pmdp);
+ };
+
+The first callback (free()) happens when the last reference on a device page
+is dropped. This means the device page is now free and no longer used by
+anyone. The second callback happens whenever the CPU tries to access a
+device page, which it cannot do. This second callback must trigger a
+migration back to system memory.
+
+
+Migrate to and from device memory
+=================================
+
+Because the CPU cannot access device memory, migration must use the device
+DMA engine to perform copies from and to device memory. For this we need a
+new migration helper::
+
+ int migrate_vma(const struct migrate_vma_ops *ops,
+                 struct vm_area_struct *vma,
+                 unsigned long mentries,
+                 unsigned long start,
+                 unsigned long end,
+                 unsigned long *src,
+                 unsigned long *dst,
+                 void *private);
+
+Unlike other migration functions, it works on a range of virtual addresses;
+there are two reasons for that. First, device DMA copy has a high setup
+overhead cost and thus batching multiple pages is needed, as otherwise the
+migration overhead makes the whole exercise pointless. The second reason is
+that drivers trigger such migrations based on a range of addresses the
+device is actively accessing.
+
+The migrate_vma_ops struct defines two callbacks. The first one
+(alloc_and_copy()) controls destination memory allocation and the copy
+operation.
+The second one (finalize_and_map()) allows the device driver to perform
+cleanup operations after the migration::
+
+ struct migrate_vma_ops {
+     void (*alloc_and_copy)(struct vm_area_struct *vma,
+                            const unsigned long *src,
+                            unsigned long *dst,
+                            unsigned long start,
+                            unsigned long end,
+                            void *private);
+     void (*finalize_and_map)(struct vm_area_struct *vma,
+                              const unsigned long *src,
+                              const unsigned long *dst,
+                              unsigned long start,
+                              unsigned long end,
+                              void *private);
+ };
+
+It is important to stress that these migration helpers allow for holes in
+the virtual address range. Some pages in the range might not be migrated for
+all the usual reasons (a page is pinned, a page is locked, ...). This helper
+does not fail but just skips over those pages.
+
+The alloc_and_copy() callback might also decide not to migrate all pages in
+the range (for reasons under the callback's control). For those, the
+callback just has to leave the corresponding dst entries empty.
+
+Finally, the migration of a struct page might fail (for file backed pages)
+for various reasons (failure to freeze the reference, or to update the page
+cache, ...). If that happens, then finalize_and_map() can catch any pages
+that were not migrated. Note that those pages were still copied to a new
+page and thus we wasted bandwidth, but this is considered a rare event and a
+price that we are willing to pay to keep all the code simpler.
+
+
+Memory cgroup (memcg) and rss accounting
+========================================
+
+For now device memory is accounted as any regular page in rss counters
+(either anonymous if the device page is used for anonymous memory, file if
+the device page is used for a file backed page or shmem if the device page
+is used for shared memory). This is a deliberate choice to keep existing
+applications, that might start using device memory without knowing about it,
+running unimpacted.
+
+A drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory, and thus not free much
+system memory. We want to gather more real world experience on how
+applications and the system react under memory pressure in the presence of
+device memory before deciding to account device memory differently.
+
+
+The same decision was made for memory cgroups. Device memory pages are
+accounted against the same memory cgroup a regular page would be accounted
+to. This does simplify migration to and from device memory. This also means
+that migration back from device memory to regular memory cannot fail because
+it would go above the memory cgroup limit. We might revisit this choice
+later on once we get more experience in how device memory is used and its
+impact on memory resource control.
+
+
+Note that device memory can never be pinned, neither by a device driver nor
+through GUP, and thus such memory is always freed upon process exit. Or,
+when the last reference is dropped, in the case of shared memory or file
+backed memory.
diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt
deleted file mode 100644
index 3fafa3381730..000000000000
--- a/Documentation/vm/hmm.txt
+++ /dev/null
@@ -1,374 +0,0 @@
-.. hmm:
-
-=====================================
-Heterogeneous Memory Management (HMM)
-=====================================
-
-Transparently allow any component of a program to use any memory region of said
-program with a device without using device specific memory allocator. This is
-becoming a requirement to simplify the use of advance heterogeneous computing
-where GPU, DSP or FPGA are use to perform various computations.
- -This document is divided as follow, in the first section i expose the problems -related to the use of a device specific allocator. The second section i expose -the hardware limitations that are inherent to many platforms. The third section -gives an overview of HMM designs. The fourth section explains how CPU page- -table mirroring works and what is HMM purpose in this context. Fifth section -deals with how device memory is represented inside the kernel. Finaly the last -section present the new migration helper that allow to leverage the device DMA -engine. - -.. contents:: :local: - -Problems of using device specific memory allocator -================================================== - -Device with large amount of on board memory (several giga bytes) like GPU have -historically manage their memory through dedicated driver specific API. This -creates a disconnect between memory allocated and managed by device driver and -regular application memory (private anonymous, share memory or regular file -back memory). From here on i will refer to this aspect as split address space. -I use share address space to refer to the opposite situation ie one in which -any memory region can be use by device transparently. - -Split address space because device can only access memory allocated through the -device specific API. This imply that all memory object in a program are not -equal from device point of view which complicate large program that rely on a -wide set of libraries. - -Concretly this means that code that wants to leverage device like GPU need to -copy object between genericly allocated memory (malloc, mmap private/share/) -and memory allocated through the device driver API (this still end up with an -mmap but of the device file). - -For flat dataset (array, grid, image, ...) this isn't too hard to achieve but -complex data-set (list, tree, ...) are hard to get right. Duplicating a complex -data-set need to re-map all the pointer relations between each of its elements. -This is error prone and program gets harder to debug because of the duplicate -data-set. - -Split address space also means that library can not transparently use data they -are getting from core program or other library and thus each library might have -to duplicate its input data-set using specific memory allocator. Large project -suffer from this and waste resources because of the various memory copy. - -Duplicating each library API to accept as input or output memory allocted by -each device specific allocator is not a viable option. It would lead to a -combinatorial explosions in the library entry points. - -Finaly with the advance of high level language constructs (in C++ but in other -language too) it is now possible for compiler to leverage GPU or other devices -without even the programmer knowledge. Some of compiler identified patterns are -only do-able with a share address. It is as well more reasonable to use a share -address space for all the other patterns. - - -System bus, device memory characteristics -========================================= - -System bus cripple share address due to few limitations. Most system bus only -allow basic memory access from device to main memory, even cache coherency is -often optional. Access to device memory from CPU is even more limited, most -often than not it is not cache coherent. - -If we only consider the PCIE bus than device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. 
However it only allows -a limited set of atomic operation from device on main memory. This is worse -in the other direction the CPUs can only access a limited range of the device -memory and can not perform atomic operations on it. Thus device memory can not -be consider like regular memory from kernel point of view. - -Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less that fastest GPU memory (1 TBytes/s). -The final limitation is latency, access to main memory from the device has an -order of magnitude higher latency than when the device access its own memory. - -Some platform are developing new system bus or additions/modifications to PCIE -to address some of those limitations (OpenCAPI, CCIX). They mainly allow two -way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Saddly not all platform are following this trends and -some major architecture are left without hardware solutions to those problems. - -So for share address space to make sense not only we must allow device to -access any memory memory but we must also permit any memory to be migrated to -device memory while device is using it (blocking CPU access while it happens). - - -Share address space and migration -================================= - -HMM intends to provide two main features. First one is to share the address -space by duplication the CPU page table into the device page table so same -address point to same memory and this for any valid main memory address in -the process address space. - -To achieve this, HMM offer a set of helpers to populate the device page table -while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. To update the device page table you must -allow a buffer (or use a pool of pre-allocated buffer) and write GPU specifics -commands in it to perform the update (unmap, cache invalidations and flush, -...). This can not be done through common code for all device. Hence why HMM -provides helpers to factor out everything that can be while leaving the gory -details to the device driver. - -The second mechanism HMM provide is a new kind of ZONE_DEVICE memory that does -allow to allocate a struct page for each page of the device memory. Those page -are special because the CPU can not map them. They however allow to migrate -main memory to device memory using exhisting migration mechanism and everything -looks like if page was swap out to disk from CPU point of view. Using a struct -page gives the easiest and cleanest integration with existing mm mechanisms. -Again here HMM only provide helpers, first to hotplug new ZONE_DEVICE memory -for the device memory and second to perform migration. Policy decision of what -and when to migrate things is left to the device driver. - -Note that any CPU access to a device page trigger a page fault and a migration -back to main memory ie when a page backing an given address A is migrated from -a main memory page to a device page then any CPU access to address A trigger a -page fault and initiate a migration back to main memory. - - -With this two features, HMM not only allow a device to mirror a process address -space and keeps both CPU and device page table synchronize, but also allow to -leverage device memory by migrating part of data-set that is actively use by a -device. 
- - -Address space mirroring implementation and API -============================================== - -Address space mirroring main objective is to allow to duplicate range of CPU -page table into a device page table and HMM helps keeping both synchronize. A -device driver that want to mirror a process address space must start with the -registration of an hmm_mirror struct:: - - int hmm_mirror_register(struct hmm_mirror *mirror, - struct mm_struct *mm); - int hmm_mirror_register_locked(struct hmm_mirror *mirror, - struct mm_struct *mm); - -The locked variant is to be use when the driver is already holding the mmap_sem -of the mm in write mode. The mirror struct has a set of callback that are use -to propagate CPU page table:: - - struct hmm_mirror_ops { - /* sync_cpu_device_pagetables() - synchronize page tables - * - * @mirror: pointer to struct hmm_mirror - * @update_type: type of update that occurred to the CPU page table - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * - * This callback ultimately originates from mmu_notifiers when the CPU - * page table is updated. The device driver must update its page table - * in response to this callback. The update argument tells what action - * to perform. - * - * The device driver must not return from this callback until the device - * page tables are completely updated (TLBs flushed, etc); this is a - * synchronous call. - */ - void (*update)(struct hmm_mirror *mirror, - enum hmm_update action, - unsigned long start, - unsigned long end); - }; - -Device driver must perform update to the range following action (turn range -read only, or fully unmap, ...). Once driver callback returns the device must -be done with the update. - - -When device driver wants to populate a range of virtual address it can use -either:: - - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); - int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); - -First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and -will not trigger a page fault on missing or non present entry. The second one -do trigger page fault on missing or read only entry if write parameter is true. -Page fault use the generic mm page fault code path just like a CPU page fault. - -Both function copy CPU page table into their pfns array argument. Each entry in -that array correspond to an address in the virtual range. HMM provide a set of -flags to help driver identify special CPU page table entries. - -Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronize. The usage pattern is:: - - int driver_populate_range(...) - { - struct hmm_range range; - ... - again: - ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); - if (ret) - return ret; - take_lock(driver->update); - if (!hmm_vma_range_done(vma, &range)) { - release_lock(driver->update); - goto again; - } - - // Use pfns array content to update device page table - - release_lock(driver->update); - return 0; - } - -The driver->update lock is the same lock that driver takes inside its update() -callback. That lock must be call before hmm_vma_range_done() to avoid any race -with a concurrent CPU page table update. 
- -HMM implements all this on top of the mmu_notifier API because we wanted to a -simpler API and also to be able to perform optimization latter own like doing -concurrent device update in multi-devices scenario. - -HMM also serve as an impedence missmatch between how CPU page table update are -done (by CPU write to the page table and TLB flushes) from how device update -their own page table. Device update is a multi-step process, first appropriate -commands are write to a buffer, then this buffer is schedule for execution on -the device. It is only once the device has executed commands in the buffer that -the update is done. Creating and scheduling update command buffer can happen -concurrently for multiple devices. Waiting for each device to report commands -as executed is serialize (there is no point in doing this concurrently). - - -Represent and manage device memory from core kernel point of view -================================================================= - -Several differents design were try to support device memory. First one use -device specific data structure to keep information about migrated memory and -HMM hooked itself in various place of mm code to handle any access to address -that were back by device memory. It turns out that this ended up replicating -most of the fields of struct page and also needed many kernel code path to be -updated to understand this new kind of memory. - -Thing is most kernel code path never try to access the memory behind a page -but only care about struct page contents. Because of this HMM switchted to -directly using struct page for device memory which left most kernel code path -un-aware of the difference. We only need to make sure that no one ever try to -map those page from the CPU side. - -HMM provide a set of helpers to register and hotplug device memory as a new -region needing struct page. This is offer through a very simple API:: - - struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size); - void hmm_devmem_remove(struct hmm_devmem *devmem); - -The hmm_devmem_ops is where most of the important things are:: - - struct hmm_devmem_ops { - void (*free)(struct hmm_devmem *devmem, struct page *page); - int (*fault)(struct hmm_devmem *devmem, - struct vm_area_struct *vma, - unsigned long addr, - struct page *page, - unsigned flags, - pmd_t *pmdp); - }; - -The first callback (free()) happens when the last reference on a device page is -drop. This means the device page is now free and no longer use by anyone. The -second callback happens whenever CPU try to access a device page which it can -not do. This second callback must trigger a migration back to system memory. - - -Migrate to and from device memory -================================= - -Because CPU can not access device memory, migration must use device DMA engine -to perform copy from and to device memory. For this we need a new migration -helper:: - - int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long mentries, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private); - -Unlike other migration function it works on a range of virtual address, there -is two reasons for that. First device DMA copy has a high setup overhead cost -and thus batching multiple pages is needed as otherwise the migration overhead -make the whole excersie pointless. 
The second reason is because driver trigger -such migration base on range of address the device is actively accessing. - -The migrate_vma_ops struct define two callbacks. First one (alloc_and_copy()) -control destination memory allocation and copy operation. Second one is there -to allow device driver to perform cleanup operation after migration:: - - struct migrate_vma_ops { - void (*alloc_and_copy)(struct vm_area_struct *vma, - const unsigned long *src, - unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - void (*finalize_and_map)(struct vm_area_struct *vma, - const unsigned long *src, - const unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - }; - -It is important to stress that this migration helpers allow for hole in the -virtual address range. Some pages in the range might not be migrated for all -the usual reasons (page is pin, page is lock, ...). This helper does not fail -but just skip over those pages. - -The alloc_and_copy() might as well decide to not migrate all pages in the -range (for reasons under the callback control). For those the callback just -have to leave the corresponding dst entry empty. - -Finaly the migration of the struct page might fails (for file back page) for -various reasons (failure to freeze reference, or update page cache, ...). If -that happens then the finalize_and_map() can catch any pages that was not -migrated. Note those page were still copied to new page and thus we wasted -bandwidth but this is considered as a rare event and a price that we are -willing to pay to keep all the code simpler. - - -Memory cgroup (memcg) and rss accounting -======================================== - -For now device memory is accounted as any regular page in rss counters (either -anonymous if device page is use for anonymous, file if device page is use for -file back page or shmem if device page is use for share memory). This is a -deliberate choice to keep existing application that might start using device -memory without knowing about it to keep runing unimpacted. - -Drawbacks is that OOM killer might kill an application using a lot of device -memory and not a lot of regular system memory and thus not freeing much system -memory. We want to gather more real world experience on how application and -system react under memory pressure in the presence of device memory before -deciding to account device memory differently. - - -Same decision was made for memory cgroup. Device memory page are accounted -against same memory cgroup a regular page would be accounted to. This does -simplify migration to and from device memory. This also means that migration -back from device memory to regular memory can not fail because it would -go above memory cgroup limit. We might revisit this choice latter on once we -get more experience in how device memory is use and its impact on memory -resource control. - - -Note that device memory can never be pin nor by device driver nor through GUP -and thus such memory is always free upon process exit. Or when last reference -is drop in case of share memory or file back memory. diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/vm/hugetlbfs_reserv.rst new file mode 100644 index 000000000000..36a87a2ea435 --- /dev/null +++ b/Documentation/vm/hugetlbfs_reserv.rst @@ -0,0 +1,587 @@ +.. 
_hugetlbfs_reserve:
+
+=====================
+Hugetlbfs Reservation
+=====================
+
+Overview
+========
+
+Huge pages as described at :ref:`hugetlbpage` are typically
+preallocated for application use. These huge pages are instantiated in a
+task's address space at page fault time if the VMA indicates huge pages are
+to be used. If no huge page exists at page fault time, the task is sent
+a SIGBUS and often dies an unhappy death. Shortly after huge page support
+was added, it was determined that it would be better to detect a shortage
+of huge pages at mmap() time. The idea is that if there were not enough
+huge pages to cover the mapping, the mmap() would fail. This was first
+done with a simple check in the code at mmap() time to determine if there
+were enough free huge pages to cover the mapping. Like most things in the
+kernel, the code has evolved over time. However, the basic idea was to
+'reserve' huge pages at mmap() time to ensure that huge pages would be
+available for page faults in that mapping. The description below attempts to
+describe how huge page reserve processing is done in the v4.10 kernel.
+
+
+Audience
+========
+This description is primarily targeted at kernel developers who are modifying
+hugetlbfs code.
+
+
+The Data Structures
+===================
+
+resv_huge_pages
+    This is a global (per-hstate) count of reserved huge pages. Reserved
+    huge pages are only available to the task which reserved them.
+    Therefore, the number of huge pages generally available is computed
+    as (``free_huge_pages - resv_huge_pages``).
+Reserve Map
+    A reserve map is described by the structure::
+
+        struct resv_map {
+            struct kref refs;
+            spinlock_t lock;
+            struct list_head regions;
+            long adds_in_progress;
+            struct list_head region_cache;
+            long region_cache_count;
+        };
+
+    There is one reserve map for each huge page mapping in the system.
+    The regions list within the resv_map describes the regions within
+    the mapping. A region is described as::
+
+        struct file_region {
+            struct list_head link;
+            long from;
+            long to;
+        };
+
+    The 'from' and 'to' fields of the file region structure are huge page
+    indices into the mapping. Depending on the type of mapping, a
+    region in the reserv_map may indicate reservations exist for the
+    range, or reservations do not exist.
+Flags for MAP_PRIVATE Reservations
+    These are stored in the bottom bits of the reservation map pointer.
+
+    ``#define HPAGE_RESV_OWNER    (1UL << 0)``
+        Indicates this task is the owner of the reservations
+        associated with the mapping.
+    ``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+        Indicates task originally mapping this range (and creating
+        reserves) has unmapped a page from this task (the child)
+        due to a failed COW.
+Page Flags
+    The PagePrivate page flag is used to indicate that a huge page
+    reservation must be restored when the huge page is freed. More
+    details will be discussed in the "Freeing Huge Pages" section.
+
+
+Reservation Map Location (Private or Shared)
+============================================
+
+A huge page mapping or segment is either private or shared. If private,
+it is typically only available to a single address space (task). If shared,
+it can be mapped into multiple address spaces (tasks). The location and
+semantics of the reservation map are significantly different for the two
+types of mappings. Location differences are:
+
+- For private mappings, the reservation map hangs off the VMA structure.
+  Specifically, vma->vm_private_data. This reserve map is created at the
+  time the mapping (mmap(MAP_PRIVATE)) is created.
+- For shared mappings, the reservation map hangs off the inode. Specifically,
+  inode->i_mapping->private_data. Since shared mappings are always backed
+  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
+  contains a reservation map. As a result, the reservation map is allocated
+  when the inode is created.
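+
+As a rough illustration of these two locations, a lookup helper shaped like
+the kernel's vma_resv_map() would behave approximately as follows. This is a
+simplified sketch (the hypothetical example_resv_map() is not a kernel
+function, and the real code masks the flag bits slightly differently)::
+
+    static struct resv_map *example_resv_map(struct vm_area_struct *vma)
+    {
+        if (vma->vm_flags & VM_MAYSHARE) {
+            /* Shared: the map lives on the hugetlbfs inode. */
+            struct inode *inode = file_inode(vma->vm_file);
+
+            return (struct resv_map *)inode->i_mapping->private_data;
+        }
+
+        /* Private: the map hangs off the VMA itself; mask off the
+         * HPAGE_RESV_* flags stored in the low bits of the pointer. */
+        return (struct resv_map *)((unsigned long)vma->vm_private_data &
+                ~(HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED));
+    }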
+
+
+Creating Reservations
+=====================
+Reservations are created when a huge page backed shared memory segment is
+created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
+These operations result in a call to the routine hugetlb_reserve_pages()::
+
+    int hugetlb_reserve_pages(struct inode *inode,
+                              long from, long to,
+                              struct vm_area_struct *vma,
+                              vm_flags_t vm_flags)
+
+The first thing hugetlb_reserve_pages() does is check whether the NORESERVE
+flag was specified in either the shmget() or mmap() call. If NORESERVE
+was specified, then this routine returns immediately as no reservations
+are desired.
+
+The arguments 'from' and 'to' are huge page indices into the mapping or
+underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to
+the length of the segment/mapping. For mmap(), the offset argument could
+be used to specify the offset into the underlying file. In such a case,
+the 'from' and 'to' arguments have been adjusted by this offset.
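+
+For illustration, if a mapping of 'length' bytes is created at file offset
+'offset' (both multiples of the huge page size), the indices would be derived
+roughly as follows. This is a hypothetical snippet, not kernel code; 'h' is
+assumed to be the hstate for the huge page size in use::
+
+    unsigned long from = offset >> huge_page_shift(h);
+    unsigned long to = from + (length >> huge_page_shift(h));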
+
+One of the big differences between PRIVATE and SHARED mappings is the way
+in which reservations are represented in the reservation map.
+
+- For shared mappings, an entry in the reservation map indicates a reservation
+  exists or did exist for the corresponding page. As reservations are
+  consumed, the reservation map is not modified.
+- For private mappings, the lack of an entry in the reservation map indicates
+  a reservation exists for the corresponding page. As reservations are
+  consumed, entries are added to the reservation map. Therefore, the
+  reservation map can also be used to determine which reservations have
+  been consumed.
+
+For private mappings, hugetlb_reserve_pages() creates the reservation map and
+hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set
+to indicate this VMA owns the reservations.
+
+The reservation map is consulted to determine how many huge page reservations
+are needed for the current mapping/segment. For private mappings, this is
+always the value (to - from). However, for shared mappings it is possible
+that some reservations may already exist within the range (to - from). See
+the section :ref:`Reservation Map Modifications <resv_map_modifications>`
+for details on how this is accomplished.
+
+The mapping may be associated with a subpool. If so, the subpool is consulted
+to ensure there is sufficient space for the mapping. It is possible that the
+subpool has set aside reservations that can be used for the mapping. See the
+section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
+
+After consulting the reservation map and subpool, the number of needed new
+reservations is known. The routine hugetlb_acct_memory() is called to check
+for and take the requested number of reservations. hugetlb_acct_memory()
+calls into routines that potentially allocate and adjust surplus page counts.
+However, within those routines the code is simply checking to ensure there
+are enough free huge pages to accommodate the reservation. If there are,
+the global reservation count resv_huge_pages is adjusted something like the
+following (recall that the number of huge pages generally available is
+``free_huge_pages - resv_huge_pages``)::
+
+    if (resv_needed <= (free_huge_pages - resv_huge_pages))
+        resv_huge_pages += resv_needed;
+
+Note that the global lock hugetlb_lock is held when checking and adjusting
+these counters.
+
+If there were enough free huge pages and the global count resv_huge_pages
+was adjusted, then the reservation map associated with the mapping is
+modified to reflect the reservations. In the case of a shared mapping, a
+file_region will exist that includes the range 'from' - 'to'. For private
+mappings, no modifications are made to the reservation map as lack of an
+entry indicates a reservation exists.
+
+If hugetlb_reserve_pages() was successful, the global reservation count and
+reservation map associated with the mapping will be modified as required to
+ensure reservations exist for the range 'from' - 'to'.
+
+.. _consume_resv:
+
+Consuming Reservations/Allocating a Huge Page
+=============================================
+
+Reservations are consumed when huge pages associated with the reservations
+are allocated and instantiated in the corresponding mapping. The allocation
+is performed within the routine alloc_huge_page()::
+
+    struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                 unsigned long addr, int avoid_reserve)
+
+alloc_huge_page is passed a VMA pointer and a virtual address, so it can
+consult the reservation map to determine if a reservation exists. In addition,
+alloc_huge_page takes the argument avoid_reserve which indicates reserves
+should not be used even if it appears they have been set aside for the
+specified address. The avoid_reserve argument is most often used in the case
+of Copy on Write and Page Migration where additional copies of an existing
+page are being allocated.
+
+The helper routine vma_needs_reservation() is called to determine if a
+reservation exists for the address within the mapping (vma). See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
+information on what this routine does. The value returned from
+vma_needs_reservation() is generally 0 or 1: 0 if a reservation exists for
+the address, 1 if no reservation exists. If a reservation does not exist,
+and there is a subpool associated with the mapping, the subpool is consulted
+to determine if it contains reservations. If the subpool contains
+reservations, one can be used for this allocation. However, in every case
+the avoid_reserve argument overrides the use of a reservation for the
+allocation. After determining whether a reservation exists and can be used
+for the allocation, the routine dequeue_huge_page_vma() is called. This
+routine takes two arguments related to reservations:
+
+- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
+- chg, even though this argument is of type long only the values 0 or 1 are
+  passed to dequeue_huge_page_vma. If the value is 0, it indicates a
+  reservation exists (see the section "Reservations and Memory Policy" for
+  possible issues). If the value is 1, it indicates a reservation does not
+  exist and the page must be taken from the global free pool if possible.
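+
+Schematically, the reservation related part of this path boils down to
+pseudo-C like the following. This is a heavily simplified sketch: locking and
+error handling are elided, 'spool' stands for the associated subpool, and
+subpool_covers_page() is a made-up placeholder for the subpool consultation
+described above::
+
+    long chg = vma_needs_reservation(h, vma, addr);
+
+    if (chg < 0)
+        return ERR_PTR(-ENOMEM);
+    if (chg && subpool_covers_page(spool))
+        chg = 0;    /* a subpool reserve stands in for the page */
+
+    page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);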
+
+The free lists associated with the memory policy of the VMA are searched for
+a free page. If a page is found, the value free_huge_pages is decremented
+when the page is removed from the free list. If there was a reservation
+associated with the page, the following adjustments are made::
+
+    SetPagePrivate(page);   /* Indicates allocating this page consumed
+                             * a reservation, and if an error is
+                             * encountered such that the page must be
+                             * freed, the reservation will be restored. */
+    resv_huge_pages--;      /* Decrement the global reservation count */
+
+Note, if no huge page can be found that satisfies the VMA's memory policy,
+an attempt will be made to allocate one using the buddy allocator. This
+brings up the issue of surplus huge pages and overcommit which is beyond
+the scope of reservations. Even if a surplus page is allocated, the same
+reservation based adjustments as above will be made: SetPagePrivate(page) and
+resv_huge_pages--.
+
+After obtaining a new huge page, (page)->private is set to the value of
+the subpool associated with the page if it exists. This will be used for
+subpool accounting when the page is freed.
+
+The routine vma_commit_reservation() is then called to adjust the reserve
+map based on the consumption of the reservation. In general, this involves
+ensuring the page is represented within a file_region structure of the region
+map. For shared mappings where the reservation was present, an entry
+in the reserve map already existed so no change is made. However, if there
+was no reservation in a shared mapping, or this was a private mapping, a new
+entry must be created.
+
+It is possible that the reserve map could have been changed between the call
+to vma_needs_reservation() at the beginning of alloc_huge_page() and the
+call to vma_commit_reservation() after the page was allocated. This would
+be possible if hugetlb_reserve_pages was called for the same page in a shared
+mapping. In such cases, the reservation count and subpool free page count
+will be off by one. This rare condition can be identified by comparing the
+return value from vma_needs_reservation and vma_commit_reservation. If such
+a race is detected, the subpool and global reserve counts are adjusted to
+compensate. See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
+information on these routines.
+
+
+Instantiate Huge Pages
+======================
+
+After huge page allocation, the page is typically added to the page tables
+of the allocating task. Before this, pages in a shared mapping are added
+to the page cache and pages in private mappings are added to an anonymous
+reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore,
+when a huge page that has been instantiated is freed, no adjustment is made
+to the global reservation count (resv_huge_pages).
+
+
+Freeing Huge Pages
+==================
+
+Huge page freeing is performed by the routine free_huge_page(). This routine
+is the destructor for hugetlbfs compound pages. As a result, it is only
+passed a pointer to the page struct. When a huge page is freed, reservation
+accounting may need to be performed. This would be the case if the page was
+associated with a subpool that contained reserves, or the page is being freed
+on an error path where a global reserve count must be restored.
+
+The page->private field points to any subpool associated with the page.
+If the PagePrivate flag is set, it indicates the global reserve count should
+be adjusted (see the section
+:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
+for information on how these are set).
+
+The routine first calls hugepage_subpool_put_pages() for the page. If this
+routine returns a value of 0 (rather than the value 1 that was passed), it
+indicates reserves are associated with the subpool, and this newly freed page
+must be used to keep the number of subpool reserves above the minimum size.
+Therefore, the global resv_huge_pages counter is incremented in this case.
+
+If the PagePrivate flag was set in the page, the global resv_huge_pages
+counter will always be incremented.
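+
+In pseudo-C, the reservation related tail of free_huge_page() therefore looks
+roughly like the following. This is a simplified sketch of the logic just
+described, not the verbatim kernel code; 'h' is the hstate of the page::
+
+    struct hugepage_subpool *spool =
+        (struct hugepage_subpool *)page_private(page);
+    bool restore_reserve = PagePrivate(page);
+
+    ClearPagePrivate(page);
+
+    /* A 0 return (rather than the 1 passed) means this page is needed
+     * to keep the subpool at its minimum size. */
+    if (hugepage_subpool_put_pages(spool, 1) == 0)
+        restore_reserve = true;
+
+    if (restore_reserve)
+        h->resv_huge_pages++;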
+
+.. _sub_pool_resv:
+
+Subpool Reservations
+====================
+
+There is a struct hstate associated with each huge page size. The hstate
+tracks all huge pages of the specified size. A subpool represents a subset
+of pages within a hstate that is associated with a mounted hugetlbfs
+filesystem.
+
+When a hugetlbfs filesystem is mounted, a min_size option can be specified
+which indicates the minimum number of huge pages required by the filesystem.
+If this option is specified, the number of huge pages corresponding to
+min_size is reserved for use by the filesystem. This number is tracked in
+the min_hpages field of a struct hugepage_subpool. At mount time,
+hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
+huge pages. If they can not be reserved, the mount fails.
+
+The routines hugepage_subpool_get/put_pages() are called when pages are
+obtained from or released back to a subpool. They perform all subpool
+accounting, and track any reservations associated with the subpool.
+hugepage_subpool_get/put_pages are passed the number of huge pages by which
+to adjust the subpool 'used page' count (down for get, up for put). Normally,
+they return the same value that was passed or an error if not enough pages
+exist in the subpool.
+
+However, if reserves are associated with the subpool, a return value less
+than the passed value may be returned. This return value indicates the
+number of additional global pool adjustments which must be made. For example,
+suppose a subpool contains 3 reserved huge pages and someone asks for 5.
+The 3 reserved pages associated with the subpool can be used to satisfy part
+of the request. But, 2 pages must be obtained from the global pools. To
+relay this information to the caller, the value 2 is returned. The caller
+is then responsible for attempting to obtain the additional two pages from
+the global pools.
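+
+A caller must therefore treat the return value as the number of pages still
+owed to the global pool. Using the example above, the calling sequence would
+look roughly like the following schematic (no locking or cleanup shown;
+'spool' and 'h' are the subpool and hstate involved)::
+
+    /* subpool holds 3 reserves; ask for 5 pages */
+    long needed = hugepage_subpool_get_pages(spool, 5);
+
+    if (needed < 0)
+        return -ENOSPC; /* subpool could not satisfy the request */
+
+    /* needed == 2: only two pages must still be accounted against
+     * the global pools. */
+    int ret = hugetlb_acct_memory(h, needed);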
+
+
+COW and Reservations
+====================
+
+Since shared mappings all point to and use the same underlying pages, the
+biggest reservation concern for COW is private mappings. In this case,
+two tasks can be pointing at the same previously allocated page. One task
+attempts to write to the page, so a new page must be allocated so that each
+task points to its own page.
+
+When the page was originally allocated, the reservation for that page was
+consumed. When an attempt to allocate a new page is made as a result of
+COW, it is possible that no free huge pages are available and the allocation
+will fail.
+
+When the private mapping was originally created, the owner of the mapping
+was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
+map of the owner. Since the owner created the mapping, the owner owns all
+the reservations associated with the mapping. Therefore, when a write fault
+occurs and there is no page available, different action is taken for the owner
+and non-owner of the reservation.
+
+In the case where the faulting task is not the owner, the fault will fail and
+the task will typically receive a SIGBUS.
+
+If the owner is the faulting task, we want it to succeed since it owned the
+original reservation. To accomplish this, the page is unmapped from the
+non-owning task. In this way, the only reference is from the owning task.
+In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
+of the non-owning task. The non-owning task may receive a SIGBUS if it later
+faults on a non-present page. But, the original owner of the
+mapping/reservation will behave as expected.
+
+
+.. _resv_map_modifications:
+
+Reservation Map Modifications
+=============================
+
+The following low level routines are used to make modifications to a
+reservation map. Typically, these routines are not called directly. Rather,
+a reservation map helper routine is called which calls one of these low level
+routines. These low level routines are fairly well documented in the source
+code (mm/hugetlb.c). These routines are::
+
+    long region_chg(struct resv_map *resv, long f, long t);
+    long region_add(struct resv_map *resv, long f, long t);
+    void region_abort(struct resv_map *resv, long f, long t);
+    long region_count(struct resv_map *resv, long f, long t);
+
+Operations on the reservation map typically involve two operations (a
+schematic calling sequence is shown below):
+
+1) region_chg() is called to examine the reserve map and determine how
+   many pages in the specified range [f, t) are NOT currently represented.
+
+   The calling code performs global checks and allocations to determine if
+   there are enough huge pages for the operation to succeed.
+
+2)
+   a) If the operation can succeed, region_add() is called to actually modify
+      the reservation map for the same range [f, t) previously passed to
+      region_chg().
+   b) If the operation can not succeed, region_abort() is called for the same
+      range [f, t) to abort the operation.
+
+Note that this is a two step process where region_add() and region_abort()
+are guaranteed to succeed after a prior call to region_chg() for the same
+range. region_chg() is responsible for pre-allocating any data structures
+necessary to ensure the subsequent operations (specifically region_add())
+will succeed.
+
+As mentioned above, region_chg() determines the number of pages in the range
+which are NOT currently represented in the map. This number is returned to
+the caller. region_add() returns the number of pages in the range added to
+the map. In most cases, the return value of region_add() is the same as the
+return value of region_chg(). However, in the case of shared mappings it is
+possible for changes to the reservation map to be made between the calls to
+region_chg() and region_add(). In this case, the return value of region_add()
+will not match the return value of region_chg(). It is likely that in such
+cases global counts and subpool accounting will be incorrect and in need of
+adjustment. It is the responsibility of the caller to check for this condition
+and make the appropriate adjustments.
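+
+The canonical calling sequence is therefore something like the following
+schematic, where global_checks_fail() is a placeholder for the global checks
+and allocations mentioned in step 1 (error handling elided)::
+
+    long chg, add;
+
+    chg = region_chg(resv, f, t);   /* pages in [f, t) not in the map */
+    if (chg < 0)
+        return chg;
+
+    if (global_checks_fail(chg)) {
+        region_abort(resv, f, t);
+        return -ENOMEM;
+    }
+
+    add = region_add(resv, f, t);   /* cannot fail after region_chg() */
+    if (add != chg) {
+        /* The map changed between the two calls (shared mappings
+         * only); global and subpool counts need fixing up. */
+    }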
+
+The routine region_del() is called to remove regions from a reservation map.
+It is typically called in the following situations:
+
+- When a file in the hugetlbfs filesystem is being removed, the inode will
+  be released and the reservation map freed. Before freeing the reservation
+  map, all the individual file_region structures must be freed. In this case,
+  region_del is passed the range [0, LONG_MAX).
+- When a hugetlbfs file is being truncated. In this case, all allocated pages
+  after the new file size must be freed. In addition, any file_region entries
+  in the reservation map past the new end of file must be deleted. In this
+  case, region_del is passed the range [new_end_of_file, LONG_MAX).
+- When a hole is being punched in a hugetlbfs file. In this case, huge pages
+  are removed from the middle of the file one at a time. As the pages are
+  removed, region_del() is called to remove the corresponding entry from the
+  reservation map. In this case, region_del is passed the range
+  [page_idx, page_idx + 1).
+
+In every case, region_del() will return the number of pages removed from the
+reservation map. In VERY rare cases, region_del() can fail. This can only
+happen in the hole punch case where it has to split an existing file_region
+entry and can not allocate a new structure. In this error case, region_del()
+will return -ENOMEM. The problem here is that the reservation map will
+indicate that there is a reservation for the page. However, the subpool and
+global reservation counts will not reflect the reservation. To handle this
+situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
+counters so that they correspond with the reservation map entry that could
+not be deleted.
+
+region_count() is called when unmapping a private huge page mapping. In
+private mappings, the lack of an entry in the reservation map indicates that
+a reservation exists. Therefore, by counting the number of entries in the
+reservation map, we know how many reservations were consumed and how many are
+outstanding (outstanding = (end - start) - region_count(resv, start, end)).
+Since the mapping is going away, the subpool and global reservation counts
+are decremented by the number of outstanding reservations.
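+
+Schematically, the teardown accounting described above amounts to the
+following sketch (hypothetical pseudo-C; the real unmap path spreads this
+over several routines, and a negative delta is assumed here to decrement the
+reservation counts)::
+
+    long consumed = region_count(resv, start, end);
+    long outstanding = (end - start) - consumed;
+
+    /* Return the never-consumed reservations to the global pool. */
+    hugetlb_acct_memory(h, -outstanding);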
+
+.. _resv_map_helpers:
+
+Reservation Map Helper Routines
+===============================
+
+Several helper routines exist to query and modify the reservation maps.
+These routines are only interested in reservations for a specific huge
+page, so they just pass in an address instead of a range. In addition,
+they pass in the associated VMA. From the VMA, the type of mapping (private
+or shared) and the location of the reservation map (inode or VMA) can be
+determined. These routines simply call the underlying routines described
+in the section "Reservation Map Modifications". However, they do take into
+account the 'opposite' meaning of reservation map entries for private and
+shared mappings and hide this detail from the caller::
+
+    long vma_needs_reservation(struct hstate *h,
+                               struct vm_area_struct *vma,
+                               unsigned long addr)
+
+This routine calls region_chg() for the specified page. If no reservation
+exists, 1 is returned. If a reservation exists, 0 is returned::
+
+    long vma_commit_reservation(struct hstate *h,
+                                struct vm_area_struct *vma,
+                                unsigned long addr)
+
+This calls region_add() for the specified page. As in the case of region_chg
+and region_add, this routine is to be called after a previous call to
+vma_needs_reservation. It will add a reservation entry for the page. It
+returns 1 if the reservation was added and 0 if not. The return value should
+be compared with the return value of the previous call to
+vma_needs_reservation. An unexpected difference indicates the reservation
+map was modified between calls::
+
+    void vma_end_reservation(struct hstate *h,
+                             struct vm_area_struct *vma,
+                             unsigned long addr)
+
+This calls region_abort() for the specified page. As in the case of region_chg
+and region_abort, this routine is to be called after a previous call to
+vma_needs_reservation. It will abort/end the in-progress reservation add
+operation::
+
+    long vma_add_reservation(struct hstate *h,
+                             struct vm_area_struct *vma,
+                             unsigned long addr)
+
+This is a special wrapper routine to help facilitate reservation cleanup
+on error paths. It is only called from the routine restore_reserve_on_error().
+This routine is used in conjunction with vma_needs_reservation in an attempt
+to add a reservation to the reservation map. It takes into account the
+different reservation map semantics for private and shared mappings. Hence,
+region_add is called for shared mappings (as an entry present in the map
+indicates a reservation), and region_del is called for private mappings (as
+the absence of an entry in the map indicates a reservation). See the section
+"Reservation Cleanup in Error Paths" for more information on what needs to
+be done on error paths.
+
+
+Reservation Cleanup in Error Paths
+==================================
+
+As mentioned in the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
+map modifications are performed in two steps. First vma_needs_reservation
+is called before a page is allocated. If the allocation is successful,
+then vma_commit_reservation is called. If not, vma_end_reservation is called.
+Global and subpool reservation counts are adjusted based on success or failure
+of the operation and all is well.
+
+Additionally, after a huge page is instantiated, the PagePrivate flag is
+cleared so that accounting when the page is ultimately freed is correct.
+
+However, there are several instances where errors are encountered after a huge
+page is allocated but before it is instantiated. In this case, the page
+allocation has consumed the reservation and made the appropriate subpool,
+reservation map and global count adjustments. If the page is freed at this
+time (before instantiation and clearing of PagePrivate), then free_huge_page
+will increment the global reservation count. However, the reservation map
+indicates the reservation was consumed. This resulting inconsistent state
+will cause the 'leak' of a reserved huge page. The global reserve count will
+be higher than it should be and prevent allocation of a pre-allocated page.
+
+The routine restore_reserve_on_error() attempts to handle this situation. It
+is fairly well documented. The intention of this routine is to restore
+the reservation map to the way it was before the page allocation. In this
+way, the state of the reservation map will correspond to the global reservation
+count after the page is freed.
+
+The routine restore_reserve_on_error itself may encounter errors while
+attempting to restore the reservation map entry. In this case, it will
+simply clear the PagePrivate flag of the page. In this way, the global
+reserve count will not be incremented when the page is freed. However, the
+reservation map will continue to look as though the reservation was consumed.
+A page can still be allocated for the address, but it will not use a reserved
+page as originally intended.
+
+There is some code (most notably userfaultfd) which can not call
+restore_reserve_on_error. In this case, it simply modifies the PagePrivate
+flag so that a reservation will not be leaked when the huge page is freed.
+
+
+Reservations and Memory Policy
+==============================
+Per-node huge page lists existed in struct hstate when git was first used
+to manage Linux code.
The concept of reservations was added some time later. +When reservations were added, no attempt was made to take memory policy +into account. While cpusets are not exactly the same as memory policy, this +comment in hugetlb_acct_memory sums up the interaction between reservations +and cpusets/memory policy:: + + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + +Huge page reservations were added to prevent unexpected page allocation +failures (OOM) at page fault time. However, if an application makes use +of cpusets or memory policy there is no guarantee that huge pages will be +available on the required nodes. This is true even if there are a sufficient +number of global reservations. + + +Mike Kravetz, 7 April 2017 diff --git a/Documentation/vm/hugetlbfs_reserv.txt b/Documentation/vm/hugetlbfs_reserv.txt deleted file mode 100644 index 36a87a2ea435..000000000000 --- a/Documentation/vm/hugetlbfs_reserv.txt +++ /dev/null @@ -1,587 +0,0 @@ -.. _hugetlbfs_reserve: - -===================== -Hugetlbfs Reservation -===================== - -Overview -======== - -Huge pages as described at :ref:`hugetlbpage` are typically -preallocated for application use. These huge pages are instantiated in a -task's address space at page fault time if the VMA indicates huge pages are -to be used. If no huge page exists at page fault time, the task is sent -a SIGBUS and often dies an unhappy death. Shortly after huge page support -was added, it was determined that it would be better to detect a shortage -of huge pages at mmap() time. The idea is that if there were not enough -huge pages to cover the mapping, the mmap() would fail. This was first -done with a simple check in the code at mmap() time to determine if there -were enough free huge pages to cover the mapping. Like most things in the -kernel, the code has evolved over time. However, the basic idea was to -'reserve' huge pages at mmap() time to ensure that huge pages would be -available for page faults in that mapping. The description below attempts to -describe how huge page reserve processing is done in the v4.10 kernel. - - -Audience -======== -This description is primarily targeted at kernel developers who are modifying -hugetlbfs code. - - -The Data Structures -=================== - -resv_huge_pages - This is a global (per-hstate) count of reserved huge pages. Reserved - huge pages are only available to the task which reserved them. - Therefore, the number of huge pages generally available is computed - as (``free_huge_pages - resv_huge_pages``). 
-Reserve Map - A reserve map is described by the structure:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - There is one reserve map for each huge page mapping in the system. - The regions list within the resv_map describes the regions within - the mapping. A region is described as:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - The 'from' and 'to' fields of the file region structure are huge page - indices into the mapping. Depending on the type of mapping, a - region in the reserv_map may indicate reservations exist for the - range, or reservations do not exist. -Flags for MAP_PRIVATE Reservations - These are stored in the bottom bits of the reservation map pointer. - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - Indicates this task is the owner of the reservations - associated with the mapping. - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - Indicates task originally mapping this range (and creating - reserves) has unmapped a page from this task (the child) - due to a failed COW. -Page Flags - The PagePrivate page flag is used to indicate that a huge page - reservation must be restored when the huge page is freed. More - details will be discussed in the "Freeing huge pages" section. - - -Reservation Map Location (Private or Shared) -============================================ - -A huge page mapping or segment is either private or shared. If private, -it is typically only available to a single address space (task). If shared, -it can be mapped into multiple address spaces (tasks). The location and -semantics of the reservation map is significantly different for two types -of mappings. Location differences are: - -- For private mappings, the reservation map hangs off the the VMA structure. - Specifically, vma->vm_private_data. This reserve map is created at the - time the mapping (mmap(MAP_PRIVATE)) is created. -- For shared mappings, the reservation map hangs off the inode. Specifically, - inode->i_mapping->private_data. Since shared mappings are always backed - by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode - contains a reservation map. As a result, the reservation map is allocated - when the inode is created. - - -Creating Reservations -===================== -Reservations are created when a huge page backed shared memory segment is -created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB). -These operations result in a call to the routine hugetlb_reserve_pages():: - - int hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) - -The first thing hugetlb_reserve_pages() does is check for the NORESERVE -flag was specified in either the shmget() or mmap() call. If NORESERVE -was specified, then this routine returns immediately as no reservation -are desired. - -The arguments 'from' and 'to' are huge page indices into the mapping or -underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to -the length of the segment/mapping. For mmap(), the offset argument could -be used to specify the offset into the underlying file. In such a case -the 'from' and 'to' arguments have been adjusted by this offset. - -One of the big differences between PRIVATE and SHARED mappings is the way -in which reservations are represented in the reservation map. 
- -- For shared mappings, an entry in the reservation map indicates a reservation - exists or did exist for the corresponding page. As reservations are - consumed, the reservation map is not modified. -- For private mappings, the lack of an entry in the reservation map indicates - a reservation exists for the corresponding page. As reservations are - consumed, entries are added to the reservation map. Therefore, the - reservation map can also be used to determine which reservations have - been consumed. - -For private mappings, hugetlb_reserve_pages() creates the reservation map and -hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set -to indicate this VMA owns the reservations. - -The reservation map is consulted to determine how many huge page reservations -are needed for the current mapping/segment. For private mappings, this is -always the value (to - from). However, for shared mappings it is possible that some reservations may already exist within the range (to - from). See the -section :ref:`Reservation Map Modifications ` -for details on how this is accomplished. - -The mapping may be associated with a subpool. If so, the subpool is consulted -to ensure there is sufficient space for the mapping. It is possible that the -subpool has set aside reservations that can be used for the mapping. See the -section :ref:`Subpool Reservations ` for more details. - -After consulting the reservation map and subpool, the number of needed new -reservations is known. The routine hugetlb_acct_memory() is called to check -for and take the requested number of reservations. hugetlb_acct_memory() -calls into routines that potentially allocate and adjust surplus page counts. -However, within those routines the code is simply checking to ensure there -are enough free huge pages to accommodate the reservation. If there are, -the global reservation count resv_huge_pages is adjusted something like the -following:: - - if (resv_needed <= (resv_huge_pages - free_huge_pages)) - resv_huge_pages += resv_needed; - -Note that the global lock hugetlb_lock is held when checking and adjusting -these counters. - -If there were enough free huge pages and the global count resv_huge_pages -was adjusted, then the reservation map associated with the mapping is -modified to reflect the reservations. In the case of a shared mapping, a -file_region will exist that includes the range 'from' 'to'. For private -mappings, no modifications are made to the reservation map as lack of an -entry indicates a reservation exists. - -If hugetlb_reserve_pages() was successful, the global reservation count and -reservation map associated with the mapping will be modified as required to -ensure reservations exist for the range 'from' - 'to'. - -.. _consume_resv: - -Consuming Reservations/Allocating a Huge Page -============================================= - -Reservations are consumed when huge pages associated with the reservations -are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: - - struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) - -alloc_huge_page is passed a VMA pointer and a virtual address, so it can -consult the reservation map to determine if a reservation exists. In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves -should not be used even if it appears they have been set aside for the -specified address. 
The avoid_reserve argument is most often used in the case -of Copy on Write and Page Migration where additional copies of an existing -page are being allocated. - -The helper routine vma_needs_reservation() is called to determine if a -reservation exists for the address within the mapping(vma). See the section -:ref:`Reservation Map Helper Routines ` for detailed -information on what this routine does. -The value returned from vma_needs_reservation() is generally -0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists. -If a reservation does not exist, and there is a subpool associated with the -mapping the subpool is consulted to determine if it contains reservations. -If the subpool contains reservations, one can be used for this allocation. -However, in every case the avoid_reserve argument overrides the use of -a reservation for the allocation. After determining whether a reservation -exists and can be used for the allocation, the routine dequeue_huge_page_vma() -is called. This routine takes two arguments related to reservations: - -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() -- chg, even though this argument is of type long only the values 0 or 1 are - passed to dequeue_huge_page_vma. If the value is 0, it indicates a - reservation exists (see the section "Memory Policy and Reservations" for - possible issues). If the value is 1, it indicates a reservation does not - exist and the page must be taken from the global free pool if possible. - -The free lists associated with the memory policy of the VMA are searched for -a free page. If a page is found, the value free_huge_pages is decremented -when the page is removed from the free list. If there was a reservation -associated with the page, the following adjustments are made:: - - SetPagePrivate(page); /* Indicates allocating this page consumed - * a reservation, and if an error is - * encountered such that the page must be - * freed, the reservation will be restored. */ - resv_huge_pages--; /* Decrement the global reservation count */ - -Note, if no huge page can be found that satisfies the VMA's memory policy -an attempt will be made to allocate one using the buddy allocator. This -brings up the issue of surplus huge pages and overcommit which is beyond -the scope reservations. Even if a surplus page is allocated, the same -reservation based adjustments as above will be made: SetPagePrivate(page) and -resv_huge_pages--. - -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. - -The routine vma_commit_reservation() is then called to adjust the reserve -map based on the consumption of the reservation. In general, this involves -ensuring the page is represented within a file_region structure of the region -map. For shared mappings where the the reservation was present, an entry -in the reserve map already existed so no change is made. However, if there -was no reservation in a shared mapping or this was a private mapping a new -entry must be created. - -It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would -be possible if hugetlb_reserve_pages was called for the same page in a shared -mapping. In such cases, the reservation count and subpool free page count -will be off by one. 
-
-
-Instantiate Huge Pages
-======================
-
-After huge page allocation, the page is typically added to the page tables
-of the allocating task. Before this, pages in a shared mapping are added
-to the page cache and pages in private mappings are added to an anonymous
-reverse mapping. In both cases, the PagePrivate flag is cleared.
-Therefore, when a huge page that has been instantiated is freed no
-adjustment is made to the global reservation count (resv_huge_pages).
-
-
-Freeing Huge Pages
-==================
-
-Huge page freeing is performed by the routine free_huge_page(). This
-routine is the destructor for hugetlbfs compound pages. As a result, it is
-only passed a pointer to the page struct. When a huge page is freed,
-reservation accounting may need to be performed. This would be the case if
-the page was associated with a subpool that contained reserves, or the
-page is being freed on an error path where a global reserve count must be
-restored.
-
-The page->private field points to any subpool associated with the page.
-If the PagePrivate flag is set, it indicates the global reserve count
-should be adjusted (see the section
-:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
-for information on how these are set).
-
-The routine first calls hugepage_subpool_put_pages() for the page. If this
-routine returns a value of 0 (rather than the value 1 that was passed), it
-indicates reserves are associated with the subpool, and this newly freed
-page must be used to keep the number of subpool reserves above the minimum
-size. Therefore, the global resv_huge_pages counter is incremented in this
-case.
-
-If the PagePrivate flag was set in the page, the global resv_huge_pages
-counter will always be incremented.
-
-.. _sub_pool_resv:
-
-Subpool Reservations
-====================
-
-There is a struct hstate associated with each huge page size. The hstate
-tracks all huge pages of the specified size. A subpool represents a subset
-of pages within a hstate that is associated with a mounted hugetlbfs
-filesystem.
-
-When a hugetlbfs filesystem is mounted a min_size option can be specified
-which indicates the minimum number of huge pages required by the
-filesystem. If this option is specified, the number of huge pages
-corresponding to min_size are reserved for use by the filesystem. This
-number is tracked in the min_hpages field of a struct hugepage_subpool. At
-mount time, hugetlb_acct_memory(min_hpages) is called to reserve the
-specified number of huge pages. If they can not be reserved, the mount
-fails.
-
-The routines hugepage_subpool_get/put_pages() are called when pages are
-obtained from or released back to a subpool. They perform all subpool
-accounting, and track any reservations associated with the subpool.
-hugepage_subpool_get/put_pages are passed the number of huge pages by
-which to adjust the subpool 'used page' count (down for get, up for put).
-Normally, they return the same value that was passed or an error if not
-enough pages exist in the subpool.
-
-However, if reserves are associated with the subpool a return value less
-than the passed value may be returned. This return value indicates the
-number of additional global pool adjustments which must be made. For
-example, suppose a subpool contains 3 reserved huge pages and someone asks
-for 5. The 3 reserved pages associated with the subpool can be used to
-satisfy part of the request. But, 2 pages must be obtained from the global
-pools. To relay this information to the caller, the value 2 is returned.
-The caller is then responsible for attempting to obtain the additional two
-pages from the global pools.
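-
-A hedged sketch of the caller's side of this contract, loosely modeled on
-the reservation path in mm/hugetlb.c (the local variable names are
-invented for illustration)::
-
-	/* Ask the subpool for 'pages' huge pages. */
-	long gbl_needed = hugepage_subpool_get_pages(spool, pages);
-
-	if (gbl_needed < 0)
-		return -ENOSPC;	/* subpool limit exceeded */
-
-	/*
-	 * Only the shortfall (2 in the example above) must be taken
-	 * from the global pool via hugetlb_acct_memory().
-	 */
-	if (hugetlb_acct_memory(h, gbl_needed) < 0) {
-		hugepage_subpool_put_pages(spool, pages);
-		return -ENOSPC;
-	}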
-
-
-COW and Reservations
-====================
-
-Since shared mappings all point to and use the same underlying pages, the
-biggest reservation concern for COW is private mappings. In this case,
-two tasks can be pointing at the same previously allocated page. One task
-attempts to write to the page, so a new page must be allocated so that
-each task points to its own page.
-
-When the page was originally allocated, the reservation for that page was
-consumed. When an attempt to allocate a new page is made as a result of
-COW, it is possible that no free huge pages are available and the
-allocation will fail.
-
-When the private mapping was originally created, the owner of the mapping
-was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the
-reservation map of the owner. Since the owner created the mapping, the
-owner owns all the reservations associated with the mapping. Therefore,
-when a write fault occurs and there is no page available, different action
-is taken for the owner and non-owner of the reservation.
-
-In the case where the faulting task is not the owner, the fault will fail
-and the task will typically receive a SIGBUS.
-
-If the owner is the faulting task, we want it to succeed since it owned
-the original reservation. To accomplish this, the page is unmapped from
-the non-owning task. In this way, the only reference is from the owning
-task. In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation
-map pointer of the non-owning task. The non-owning task may receive a
-SIGBUS if it later faults on a non-present page. But, the original owner
-of the mapping/reservation will behave as expected.
-
-
-.. _resv_map_modifications:
-
-Reservation Map Modifications
-=============================
-
-The following low level routines are used to make modifications to a
-reservation map. Typically, these routines are not called directly.
-Rather, a reservation map helper routine is called which calls one of
-these low level routines. These low level routines are fairly well
-documented in the source code (mm/hugetlb.c). These routines are::
-
-	long region_chg(struct resv_map *resv, long f, long t);
-	long region_add(struct resv_map *resv, long f, long t);
-	void region_abort(struct resv_map *resv, long f, long t);
-	long region_count(struct resv_map *resv, long f, long t);
-
-Operations on the reservation map typically involve two steps, as shown
-in the sketch after this list:
-
-1) region_chg() is called to examine the reserve map and determine how
-   many pages in the specified range [f, t) are NOT currently represented.
-
-   The calling code performs global checks and allocations to determine if
-   there are enough huge pages for the operation to succeed.
-
-2)
-  a) If the operation can succeed, region_add() is called to actually
-     modify the reservation map for the same range [f, t) previously
-     passed to region_chg().
-  b) If the operation can not succeed, region_abort() is called for the
-     same range [f, t) to abort the operation.
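-
-A hedged sketch of the two step pattern (illustrative only; the real
-callers are hugetlb_reserve_pages() and the helper routines described
-below, and global_pool_can_cover() is a hypothetical stand-in for the
-global checks)::
-
-	/* Step 1: how many pages of [f, t) are not yet in the map? */
-	long chg = region_chg(resv, f, t);
-	if (chg < 0)
-		return chg;
-
-	if (global_pool_can_cover(chg)) {
-		/* Step 2a: commit; guaranteed to succeed after region_chg() */
-		region_add(resv, f, t);
-	} else {
-		/* Step 2b: free anything region_chg() pre-allocated */
-		region_abort(resv, f, t);
-		return -ENOMEM;
-	}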
-
-Note that this is a two step process where region_add() and region_abort()
-are guaranteed to succeed after a prior call to region_chg() for the same
-range. region_chg() is responsible for pre-allocating any data structures
-necessary to ensure the subsequent operations (specifically region_add())
-will succeed.
-
-As mentioned above, region_chg() determines the number of pages in the
-range which are NOT currently represented in the map. This number is
-returned to the caller. region_add() returns the number of pages in the
-range added to the map. In most cases, the return value of region_add()
-is the same as the return value of region_chg(). However, in the case of
-shared mappings it is possible for changes to the reservation map to be
-made between the calls to region_chg() and region_add(). In this case,
-the return value of region_add() will not match the return value of
-region_chg(). It is likely that in such cases global counts and subpool
-accounting will be incorrect and in need of adjustment. It is the
-responsibility of the caller to check for this condition and make the
-appropriate adjustments.
-
-The routine region_del() is called to remove regions from a reservation
-map. It is typically called in the following situations:
-
-- When a file in the hugetlbfs filesystem is being removed, the inode will
-  be released and the reservation map freed. Before freeing the
-  reservation map, all the individual file_region structures must be
-  freed. In this case region_del is passed the range [0, LONG_MAX).
-- When a hugetlbfs file is being truncated. In this case, all allocated
-  pages after the new file size must be freed. In addition, any
-  file_region entries in the reservation map past the new end of file
-  must be deleted. In this case, region_del is passed the range
-  [new_end_of_file, LONG_MAX).
-- When a hole is being punched in a hugetlbfs file. In this case, huge
-  pages are removed from the middle of the file one at a time. As the
-  pages are removed, region_del() is called to remove the corresponding
-  entry from the reservation map. In this case, region_del is passed the
-  range [page_idx, page_idx + 1).
-
-In every case, region_del() will return the number of pages removed from
-the reservation map. In VERY rare cases, region_del() can fail. This can
-only happen in the hole punch case where it has to split an existing
-file_region entry and can not allocate a new structure. In this error
-case, region_del() will return -ENOMEM. The problem here is that the
-reservation map will indicate that there is a reservation for the page.
-However, the subpool and global reservation counts will not reflect the
-reservation. To handle this situation, the routine
-hugetlb_fix_reserve_counts() is called to adjust the counters so that
-they correspond with the reservation map entry that could not be deleted.
-
-region_count() is called when unmapping a private huge page mapping. In
-private mappings, the lack of an entry in the reservation map indicates
-that a reservation exists. Therefore, by counting the number of entries
-in the reservation map we know how many reservations were consumed and
-how many are outstanding
-(outstanding = (end - start) - region_count(resv, start, end)).
-Since the mapping is going away, the subpool and global reservation
-counts are decremented by the number of outstanding reservations.
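-
-A hedged sketch of that unmap-time accounting, condensed from the VMA
-close path in mm/hugetlb.c (the local variable names are invented for
-illustration)::
-
-	/* Private mapping being torn down: count consumed reservations. */
-	long consumed = region_count(resv, start, end);
-	long outstanding = (end - start) - consumed;
-
-	if (outstanding) {
-		/* Return unconsumed reservations to the global pool... */
-		hugetlb_acct_memory(h, -outstanding);
-		/* ...and to the subpool, if one is associated. */
-		hugepage_subpool_put_pages(spool, outstanding);
-	}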
-
-.. _resv_map_helpers:
-
-Reservation Map Helper Routines
-===============================
-
-Several helper routines exist to query and modify the reservation maps.
-These routines are only concerned with reservations for a specific huge
-page, so they just pass in an address instead of a range. In addition,
-they pass in the associated VMA. From the VMA, the type of mapping
-(private or shared) and the location of the reservation map (inode or
-VMA) can be determined. These routines simply call the underlying
-routines described in the section "Reservation Map Modifications".
-However, they do take into account the 'opposite' meaning of reservation
-map entries for private and shared mappings and hide this detail from the
-caller::
-
-	long vma_needs_reservation(struct hstate *h,
-				   struct vm_area_struct *vma,
-				   unsigned long addr)
-
-This routine calls region_chg() for the specified page. If no reservation
-exists, 1 is returned. If a reservation exists, 0 is returned::
-
-	long vma_commit_reservation(struct hstate *h,
-				    struct vm_area_struct *vma,
-				    unsigned long addr)
-
-This calls region_add() for the specified page. As in the case of
-region_chg and region_add, this routine is to be called after a previous
-call to vma_needs_reservation. It will add a reservation entry for the
-page. It returns 1 if the reservation was added and 0 if not. The return
-value should be compared with the return value of the previous call to
-vma_needs_reservation. An unexpected difference indicates the reservation
-map was modified between calls::
-
-	void vma_end_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This calls region_abort() for the specified page. As in the case of
-region_chg and region_abort, this routine is to be called after a previous
-call to vma_needs_reservation. It will abort/end the in-progress
-reservation add operation::
-
-	long vma_add_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This is a special wrapper routine to help facilitate reservation cleanup
-on error paths. It is only called from the routine
-restore_reserve_on_error(). This routine is used in conjunction with
-vma_needs_reservation in an attempt to add a reservation to the
-reservation map. It takes into account the different reservation map
-semantics for private and shared mappings. Hence, region_add is called
-for shared mappings (as an entry present in the map indicates a
-reservation), and region_del is called for private mappings (as the
-absence of an entry in the map indicates a reservation). See the section
-"Reservation cleanup in error paths" for more information on what needs
-to be done on error paths.
-
-
-Reservation Cleanup in Error Paths
-==================================
-
-As mentioned in the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
-map modifications are performed in two steps. First vma_needs_reservation
-is called before a page is allocated. If the allocation is successful,
-then vma_commit_reservation is called. If not, vma_end_reservation is
-called. Global and subpool reservation counts are adjusted based on
-success or failure of the operation and all is well.
-
-Additionally, after a huge page is instantiated the PagePrivate flag is
-cleared so that accounting when the page is ultimately freed is correct.
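-
-A hedged sketch of that two step pattern on an allocation path
-(illustrative only; the helper get_huge_page() is a hypothetical stand-in
-for whatever allocation is being attempted)::
-
-	long chg = vma_needs_reservation(h, vma, addr);
-	struct page *page = get_huge_page();	/* hypothetical */
-
-	if (!page) {
-		/* Failure: end the in-progress reservation add. */
-		vma_end_reservation(h, vma, addr);
-		return -ENOMEM;
-	}
-	/* Success: commit, and compare with the earlier return value. */
-	if (vma_commit_reservation(h, vma, addr) != chg)
-		/* reserve map was modified between calls; fix up counts */;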
-
-However, there are several instances where errors are encountered after a
-huge page is allocated but before it is instantiated. In this case, the
-page allocation has consumed the reservation and made the appropriate
-subpool, reservation map and global count adjustments. If the page is
-freed at this time (before instantiation and clearing of PagePrivate),
-then free_huge_page will increment the global reservation count. However,
-the reservation map indicates the reservation was consumed. This
-resulting inconsistent state will cause the 'leak' of a reserved huge
-page. The global reserve count will be higher than it should be and
-prevent allocation of a pre-allocated page.
-
-The routine restore_reserve_on_error() attempts to handle this situation.
-It is fairly well documented. The intention of this routine is to restore
-the reservation map to the way it was before the page allocation. In this
-way, the state of the reservation map will correspond to the global
-reservation count after the page is freed.
-
-The routine restore_reserve_on_error itself may encounter errors while
-attempting to restore the reservation map entry. In this case, it will
-simply clear the PagePrivate flag of the page. In this way, the global
-reserve count will not be incremented when the page is freed. However,
-the reservation map will continue to look as though the reservation was
-consumed. A page can still be allocated for the address, but it will not
-use a reserved page as originally intended.
-
-There is some code (most notably userfaultfd) which can not call
-restore_reserve_on_error. In this case, it simply modifies the
-PagePrivate flag so that a reservation will not be leaked when the huge
-page is freed.
-
-
-Reservations and Memory Policy
-==============================
-Per-node huge page lists existed in struct hstate when git was first used
-to manage Linux code. The concept of reservations was added some time
-later. When reservations were added, no attempt was made to take memory
-policy into account. While cpusets are not exactly the same as memory
-policy, this comment in hugetlb_acct_memory sums up the interaction
-between reservations and cpusets/memory policy::
-
-	/*
-	 * When cpuset is configured, it breaks the strict hugetlb page
-	 * reservation as the accounting is done on a global variable. Such
-	 * reservation is completely rubbish in the presence of cpuset because
-	 * the reservation is not checked against page availability for the
-	 * current cpuset. Application can still potentially OOM'ed by kernel
-	 * with lack of free htlb page in cpuset that the task is in.
-	 * Attempt to enforce strict accounting with cpuset is almost
-	 * impossible (or too ugly) because cpuset is too fluid that
-	 * task or memory node can be dynamically moved between cpusets.
-	 *
-	 * The change of semantics for shared hugetlb mapping with cpuset is
-	 * undesirable. However, in order to preserve some of the semantics,
-	 * we fall back to check against current free page availability as
-	 * a best attempt and hopefully to minimize the impact of changing
-	 * semantics that cpuset has.
-	 */
-
-Huge page reservations were added to prevent unexpected page allocation
-failures (OOM) at page fault time. However, if an application makes use
-of cpusets or memory policy there is no guarantee that huge pages will be
-available on the required nodes. This is true even if there are a
-sufficient number of global reservations.
-
-
-Mike Kravetz, 7 April 2017
diff --git a/Documentation/vm/hugetlbpage.rst b/Documentation/vm/hugetlbpage.rst
new file mode 100644
index 000000000000..a5da14b05b4b
--- /dev/null
+++ b/Documentation/vm/hugetlbpage.rst
@@ -0,0 +1,386 @@
+.. _hugetlbpage:
+
+=============
+HugeTLB Pages
+=============
+
+Overview
+========
+
+The intent of this file is to give a brief summary of hugetlbpage support
+in the Linux kernel. This support is built on top of multiple page size
+support that is provided by most modern architectures. For example, x86
+CPUs normally support 4K and 2M (1G if architecturally supported) page
+sizes, ia64 architecture supports multiple page sizes 4K, 8K, 64K, 256K,
+1M, 4M, 16M, 256M and ppc64 supports 4K and 16M. A TLB is a cache of
+virtual-to-physical translations. Typically this is a very scarce
+resource on a processor. Operating systems try to make best use of the
+limited number of TLB resources. This optimization is more critical now
+as bigger and bigger physical memories (several GBs) are more readily
+available.
+
+Users can use the huge page support in the Linux kernel by either using
+the mmap system call or standard SYSV shared memory system calls (shmget,
+shmat).
+
+First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
+(present under "File systems") and CONFIG_HUGETLB_PAGE (selected
+automatically when CONFIG_HUGETLBFS is selected) configuration
+options.
+
+The ``/proc/meminfo`` file provides information about the total number of
+persistent hugetlb pages in the kernel's huge page pool. It also displays
+the default huge page size and information about the number of free,
+reserved and surplus huge pages in the pool of huge pages of default
+size. The huge page size is needed for generating the proper alignment
+and size of the arguments to system calls that map huge page regions.
+
+The output of ``cat /proc/meminfo`` will include lines like::
+
+	HugePages_Total: uuu
+	HugePages_Free:  vvv
+	HugePages_Rsvd:  www
+	HugePages_Surp:  xxx
+	Hugepagesize:    yyy kB
+	Hugetlb:         zzz kB
+
+where:
+
+HugePages_Total
+	is the size of the pool of huge pages.
+HugePages_Free
+	is the number of huge pages in the pool that are not yet
+	allocated.
+HugePages_Rsvd
+	is short for "reserved," and is the number of huge pages for
+	which a commitment to allocate from the pool has been made,
+	but no allocation has yet been made. Reserved huge pages
+	guarantee that an application will be able to allocate a
+	huge page from the pool of huge pages at fault time.
+HugePages_Surp
+	is short for "surplus," and is the number of huge pages in
+	the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
+	maximum number of surplus huge pages is controlled by
+	``/proc/sys/vm/nr_overcommit_hugepages``.
+Hugepagesize
+	is the default huge page size (in kB).
+Hugetlb
+	is the total amount of memory (in kB) consumed by huge
+	pages of all sizes.
+	If huge pages of different sizes are in use, this number
+	will exceed HugePages_Total \* Hugepagesize. To get more
+	detailed information, please, refer to
+	``/sys/kernel/mm/hugepages`` (described below).
+
+
+``/proc/filesystems`` should also show a filesystem of type "hugetlbfs"
+configured in the kernel.
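+
+These counters can also be read programmatically. A minimal userspace
+sketch (not from the kernel tree) that extracts the free count::
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		char line[128];
+		unsigned long free_pages;
+		FILE *f = fopen("/proc/meminfo", "r");
+
+		if (!f)
+			return 1;
+		while (fgets(line, sizeof(line), f))
+			if (sscanf(line, "HugePages_Free: %lu",
+				   &free_pages) == 1)
+				printf("free huge pages: %lu\n", free_pages);
+		fclose(f);
+		return 0;
+	}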
+
+``/proc/sys/vm/nr_hugepages`` indicates the current number of "persistent"
+huge pages in the kernel's huge page pool. "Persistent" huge pages will
+be returned to the huge page pool when freed by a task. A user with root
+privileges can dynamically allocate more or free some persistent huge
+pages by increasing or decreasing the value of ``nr_hugepages``.
+
+Pages that are used as huge pages are reserved inside the kernel and
+cannot be used for other purposes. Huge pages cannot be swapped out under
+memory pressure.
+
+Once a number of huge pages have been pre-allocated to the kernel huge
+page pool, a user with appropriate privilege can use either the mmap
+system call or shared memory system calls to use the huge pages. See the
+discussion of Using Huge Pages, below.
+
+The administrator can allocate persistent huge pages on the kernel boot
+command line by specifying the "hugepages=N" parameter, where 'N' = the
+number of huge pages requested. This is the most reliable method of
+allocating huge pages as memory has not yet become fragmented.
+
+Some platforms support multiple huge page sizes. To allocate huge pages
+of a specific size, one must precede the huge pages boot command
+parameters with a huge page size selection parameter
+"hugepagesz=<size>". <size> must be specified in bytes with optional
+scale suffix [kKmMgG]. The default huge page size may be selected with
+the "default_hugepagesz=<size>" boot parameter.
+
+When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
+indicates the current number of pre-allocated huge pages of the default
+size. Thus, one can use the following command to dynamically
+allocate/deallocate default sized persistent huge pages::
+
+	echo 20 > /proc/sys/vm/nr_hugepages
+
+This command will try to adjust the number of default sized huge pages in
+the huge page pool to 20, allocating or freeing huge pages, as required.
+
+On a NUMA platform, the kernel will attempt to distribute the huge page
+pool over all the set of allowed nodes specified by the NUMA memory
+policy of the task that modifies ``nr_hugepages``. The default for the
+allowed nodes--when the task has default memory policy--is all on-line
+nodes with memory. Allowed nodes with insufficient available, contiguous
+memory for a huge page will be silently skipped when allocating
+persistent huge pages. See the discussion below of the interaction of
+task memory policy, cpusets and per node attributes with the allocation
+and freeing of persistent huge pages.
+
+The success or failure of huge page allocation depends on the amount of
+physically contiguous memory that is present in the system at the time of
+the allocation attempt. If the kernel is unable to allocate huge pages
+from some nodes in a NUMA system, it will attempt to make up the
+difference by allocating extra pages on other nodes with sufficient
+available contiguous memory, if any.
+
+System administrators may want to put this command in one of the local rc
+init files. This will enable the kernel to allocate huge pages early in
+the boot process when the possibility of getting physical contiguous
+pages is still very high. Administrators can verify the number of huge
+pages actually allocated by checking the sysctl or meminfo. To check the
+per node distribution of huge pages in a NUMA system, use::
+
+	cat /sys/devices/system/node/node*/meminfo | fgrep Huge
+
+``/proc/sys/vm/nr_overcommit_hugepages`` specifies how large the pool of
+huge pages can grow, if more huge pages than ``/proc/sys/vm/nr_hugepages``
+are requested by applications.
+Writing any non-zero value into this file indicates that the hugetlb
+subsystem is allowed to try to obtain that number of "surplus" huge pages
+from the kernel's normal page pool, when the persistent huge page pool is
+exhausted. As these surplus huge pages become unused, they are freed back
+to the kernel's normal page pool.
+
+When increasing the huge page pool size via ``nr_hugepages``, any existing
+surplus pages will first be promoted to persistent huge pages. Then,
+additional huge pages will be allocated, if necessary and if possible, to
+fulfill the new persistent huge page pool size.
+
+The administrator may shrink the pool of persistent huge pages for
+the default huge page size by setting the ``nr_hugepages`` sysctl to a
+smaller value. The kernel will attempt to balance the freeing of huge
+pages across all nodes in the memory policy of the task modifying
+``nr_hugepages``. Any free huge pages on the selected nodes will be freed
+back to the kernel's normal page pool.
+
+Caveat: Shrinking the persistent huge page pool via ``nr_hugepages`` such
+that it becomes less than the number of huge pages in use will convert
+the balance of the in-use huge pages to surplus huge pages. This will
+occur even if the number of surplus pages would exceed the overcommit
+value. As long as this condition holds--that is, until
+``nr_hugepages+nr_overcommit_hugepages`` is increased sufficiently, or
+the surplus huge pages go out of use and are freed--no more surplus huge
+pages will be allowed to be allocated.
+
+With support for multiple huge page pools at run-time available, much of
+the huge page userspace interface in ``/proc/sys/vm`` has been duplicated
+in sysfs.
+The ``/proc`` interfaces discussed above have been retained for backwards
+compatibility. The root huge page control directory in sysfs is::
+
+	/sys/kernel/mm/hugepages
+
+For each huge page size supported by the running kernel, a subdirectory
+will exist, of the form::
+
+	hugepages-${size}kB
+
+Inside each of these directories, the same set of files will exist::
+
+	nr_hugepages
+	nr_hugepages_mempolicy
+	nr_overcommit_hugepages
+	free_hugepages
+	resv_hugepages
+	surplus_hugepages
+
+which function as described above for the default huge page-sized case.
+
+
+Interaction of Task Memory Policy with Huge Page Allocation/Freeing
+===================================================================
+
+Whether huge pages are allocated and freed via the ``/proc`` interface or
+the ``/sysfs`` interface using the ``nr_hugepages_mempolicy`` attribute,
+the NUMA nodes from which huge pages are allocated or freed are
+controlled by the NUMA memory policy of the task that modifies the
+``nr_hugepages_mempolicy`` sysctl or attribute. When the ``nr_hugepages``
+attribute is used, mempolicy is ignored.
+
+The recommended method to allocate or free huge pages to/from the kernel
+huge page pool, using the ``nr_hugepages`` example above, is::
+
+	numactl --interleave <node-list> echo 20 \
+		>/proc/sys/vm/nr_hugepages_mempolicy
+
+or, more succinctly::
+
+	numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy
+
+This will allocate or free ``abs(20 - nr_hugepages)`` to or from the
+nodes specified in <node-list>, depending on whether the number of
+persistent huge pages is initially less than or greater than 20,
+respectively. No huge pages will be allocated nor freed on any node not
+included in the specified <node-list>.
+
+When adjusting the persistent hugepage count via
+``nr_hugepages_mempolicy``, any memory policy mode--bind, preferred,
+local or interleave--may be used.
+The resulting effect on persistent huge page allocation is as follows:
+
+#. Regardless of mempolicy mode [see
+   Documentation/vm/numa_memory_policy.rst], persistent huge pages will
+   be distributed across the node or nodes specified in the mempolicy as
+   if "interleave" had been specified. However, if a node in the policy
+   does not contain sufficient contiguous memory for a huge page, the
+   allocation will not "fallback" to the nearest neighbor node with
+   sufficient contiguous memory. To do this would cause undesirable
+   imbalance in the distribution of the huge page pool, or possibly,
+   allocation of persistent huge pages on nodes not allowed by the
+   task's memory policy.
+
+#. One or more nodes may be specified with the bind or interleave policy.
+   If more than one node is specified with the preferred policy, only the
+   lowest numeric id will be used. Local policy will select the node where
+   the task is running at the time the nodes_allowed mask is constructed.
+   For local policy to be deterministic, the task must be bound to a cpu or
+   cpus in a single node. Otherwise, the task could be migrated to some
+   other node at any time after launch and the resulting node will be
+   indeterminate. Thus, local policy is not very useful for this purpose.
+   Any of the other mempolicy modes may be used to specify a single node.
+
+#. The nodes allowed mask will be derived from any non-default task
+   mempolicy, whether this policy was set explicitly by the task itself or
+   one of its ancestors, such as numactl. This means that if the task is
+   invoked from a shell with non-default policy, that policy will be used.
+   One can specify a node list of "all" with numactl --interleave or
+   --membind [-m] to achieve interleaving over all nodes in the system or
+   cpuset.
+
+#. Any task mempolicy specified--e.g., using numactl--will be constrained
+   by the resource limits of any cpuset in which the task runs. Thus,
+   there will be no way for a task with non-default policy running in a
+   cpuset with a subset of the system nodes to allocate huge pages
+   outside the cpuset without first moving to a cpuset that contains all
+   of the desired nodes.
+
+#. Boot-time huge page allocation attempts to distribute the requested
+   number of huge pages over all on-line nodes with memory.
+
+Per Node Hugepages Attributes
+=============================
+
+A subset of the contents of the root huge page control directory in
+sysfs, described above, will be replicated under the system device of
+each NUMA node with memory in::
+
+	/sys/devices/system/node/node[0-9]*/hugepages/
+
+Under this directory, the subdirectory for each supported huge page size
+contains the following attribute files::
+
+	nr_hugepages
+	free_hugepages
+	surplus_hugepages
+
+The ``free_`` and ``surplus_`` attribute files are read-only. They return
+the number of free and surplus [overcommitted] huge pages, respectively,
+on the parent node.
+
+The ``nr_hugepages`` attribute returns the total number of huge pages on
+the specified node. When this attribute is written, the number of
+persistent huge pages on the parent node will be adjusted to the
+specified value, if sufficient resources exist, regardless of the task's
+mempolicy or cpuset constraints.
+
+Note that the number of overcommit and reserve pages remain global
+quantities, as we don't know until fault time, when the faulting task's
+mempolicy is applied, from which node the huge page allocation will be
+attempted.
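+
+A minimal userspace sketch of adjusting a per node pool (not from the
+kernel tree; it assumes 2 MB default huge pages and that node0 exists)::
+
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	#define NODE0_2M_POOL \
+		"/sys/devices/system/node/node0/hugepages/" \
+		"hugepages-2048kB/nr_hugepages"
+
+	int main(void)
+	{
+		/* Request 16 persistent 2 MB pages on node 0 (needs root). */
+		int fd = open(NODE0_2M_POOL, O_WRONLY);
+
+		if (fd < 0 || write(fd, "16", 2) != 2)
+			perror(NODE0_2M_POOL);
+		if (fd >= 0)
+			close(fd);
+		return 0;
+	}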
+
+
+Using Huge Pages
+================
+
+If the user applications are going to request huge pages using the mmap
+system call, then it is required that the system administrator mount a
+file system of type hugetlbfs::
+
+	mount -t hugetlbfs \
+		-o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\
+		min_size=<value>,nr_inodes=<value> none /mnt/huge
+
+This command mounts a (pseudo) filesystem of type hugetlbfs on the
+directory ``/mnt/huge``. Any files created on ``/mnt/huge`` use huge
+pages.
+
+The ``uid`` and ``gid`` options set the owner and group of the root of
+the file system. By default the ``uid`` and ``gid`` of the current
+process are taken.
+
+The ``mode`` option sets the mode of the root of the file system to
+value & 01777. This value is given in octal. By default the value 0755
+is picked.
+
+If the platform supports multiple huge page sizes, the ``pagesize``
+option can be used to specify the huge page size and associated pool.
+``pagesize`` is specified in bytes. If ``pagesize`` is not specified the
+platform's default huge page size and associated pool will be used.
+
+The ``size`` option sets the maximum value of memory (huge pages) allowed
+for that filesystem (``/mnt/huge``). The ``size`` option can be specified
+in bytes, or as a percentage of the specified huge page pool
+(``nr_hugepages``). The size is rounded down to HPAGE_SIZE boundary.
+
+The ``min_size`` option sets the minimum value of memory (huge pages)
+allowed for the filesystem. ``min_size`` can be specified in the same way
+as ``size``, either bytes or a percentage of the huge page pool.
+At mount time, the number of huge pages specified by ``min_size`` are
+reserved for use by the filesystem.
+If there are not enough free huge pages available, the mount will fail.
+As huge pages are allocated to the filesystem and freed, the reserve
+count is adjusted so that the sum of allocated and reserved huge pages
+is always at least ``min_size``.
+
+The option ``nr_inodes`` sets the maximum number of inodes that
+``/mnt/huge`` can use.
+
+If the ``size``, ``min_size`` or ``nr_inodes`` option is not provided on
+the command line then no limits are set.
+
+For ``pagesize``, ``size``, ``min_size`` and ``nr_inodes`` options, you
+can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo.
+For example, size=2K has the same meaning as size=2048.
+
+While read system calls are supported on files that reside on hugetlb
+file systems, write system calls are not.
+
+Regular chown, chgrp, and chmod commands (with the right permissions)
+could be used to change the file attributes on hugetlbfs.
+
+Also, it is important to note that no such mount command is required if
+applications are going to use only shmat/shmget system calls or mmap with
+MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see
+:ref:`map_hugetlb <map_hugetlb>` below.
+
+Users who wish to use hugetlb memory via shared memory segment should be
+members of a supplementary group, and the system admin needs to configure
+that gid into ``/proc/sys/vm/hugetlb_shm_group``. It is possible for the
+same or different applications to use any combination of mmaps and shm*
+calls, though the mount of the filesystem will be required for using mmap
+calls without MAP_HUGETLB.
+
+Syscalls that operate on memory backed by hugetlb pages only have their
+lengths aligned to the native page size of the processor; they will
+normally fail with errno set to EINVAL or exclude hugetlb pages that
+extend beyond the length if not hugepage aligned. For example, munmap(2)
+will fail if memory is backed by a hugetlb page and the length is
+smaller than the hugepage size.
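+
+A minimal sketch of the MAP_HUGETLB route (no hugetlbfs mount needed; it
+assumes a pre-allocated pool of default sized huge pages and is a
+condensed variant of the map_hugetlb selftest referenced below)::
+
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	#ifndef MAP_HUGETLB
+	#define MAP_HUGETLB 0x40000
+	#endif
+
+	#define LENGTH (2UL * 1024 * 1024)	/* one 2 MB default page */
+
+	int main(void)
+	{
+		void *addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
+				  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+				  -1, 0);
+
+		if (addr == MAP_FAILED) {
+			perror("mmap");	/* likely an empty huge page pool */
+			exit(1);
+		}
+		memset(addr, 0, LENGTH);	/* fault in the huge page */
+		munmap(addr, LENGTH);
+		return 0;
+	}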
+
+
+Examples
+========
+
+.. _map_hugetlb:
+
+``map_hugetlb``
+	see tools/testing/selftests/vm/map_hugetlb.c
+
+``hugepage-shm``
+	see tools/testing/selftests/vm/hugepage-shm.c
+
+``hugepage-mmap``
+	see tools/testing/selftests/vm/hugepage-mmap.c
+
+The `libhugetlbfs`_ library provides a wide range of userspace tools
+to help with huge page usability, environment setup, and control.
+
+.. _libhugetlbfs: https://github.com/libhugetlbfs/libhugetlbfs
+
+Kernel development regression testing
+=====================================
+
+The most complete set of hugetlb tests is in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions. In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst
new file mode 100644
index 000000000000..070aa1e716b7
--- /dev/null
+++ b/Documentation/vm/hwpoison.rst
@@ -0,0 +1,186 @@
+.. _hwpoison:
+
+========
+hwpoison
+========
+
+What is hwpoison?
+=================
+
+Upcoming Intel CPUs have support for recovering from some memory errors
+(``MCA recovery``). This requires the OS to declare a page "poisoned",
+kill the processes associated with it and avoid using it in the future.
+
+This patchkit implements the necessary infrastructure in the VM.
+
+To quote the overview comment:
+
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * failure.
+ *
+ * This focusses on pages detected as corrupted in the background.
+ * When the current CPU tries to consume corruption the currently
+ * running process can just be killed directly instead. This implies
+ * that if the error cannot be handled for some reason it's safe to
+ * just ignore it because no corruption has been consumed yet. Instead
+ * when that happens another machine check will happen.
+ *
+ * Handles page cache pages in various states. The tricky part
+ * here is that we can access any page asynchronous to other VM
+ * users, because memory failures could happen anytime and anywhere,
+ * possibly violating some of their assumptions. This is why this code
+ * has to be extremely careful. Generally it tries to use normal locking
+ * rules, as in get the standard locks, even if that means the
+ * error handling takes potentially a long time.
+ *
+ * Some of the operations here are somewhat inefficient and have non
+ * linear algorithmic complexity, because the data structures have not
+ * been optimized for this case. This is in particular the case
+ * for the mapping from a vma to a process. Since this case is expected
+ * to be rare we hope we can get away with this.
+
+The code consists of the high level handler in mm/memory-failure.c,
+a new page poison bit and various checks in the VM to handle poisoned
+pages.
+
+The main target right now is KVM guests, but it works for all kinds
+of applications. KVM support requires a recent qemu-kvm release.
+
+For the KVM use there was a need for a new signal type so that
+KVM can inject the machine check into the guest with the proper
+address. This in theory allows other applications to handle
+memory failures too. The expectation is that nearly all applications
+won't do that, but some very specialized ones might.
+
+Failure recovery modes
+======================
+
+There are two (actually three) modes memory failure recovery can be in:
+
+vm.memory_failure_recovery sysctl set to zero:
+	All memory failures cause a panic. Do not attempt recovery.
+	(on x86 this can be also affected by the tolerant level of the
+	MCE subsystem)
+
+early kill
+	(can be controlled globally and per process)
+	Send SIGBUS to the application as soon as the error is detected.
+	This allows applications that can process memory errors in a
+	gentle way (e.g. drop the affected object) to do so.
+	This is the mode used by KVM qemu.
+
+late kill
+	Send SIGBUS when the application runs into the corrupted page.
+	This is best for memory error unaware applications and is the
+	default.
+	Note some pages are always handled as late kill.
+
+User control
+============
+
+vm.memory_failure_recovery
+	See sysctl.txt
+
+vm.memory_failure_early_kill
+	Enable early kill mode globally
+
+PR_MCE_KILL
+	Set early/late kill mode/revert to system default
+
+	arg1: PR_MCE_KILL_CLEAR:
+		Revert to system default
+	arg1: PR_MCE_KILL_SET:
+		arg2 defines thread specific mode
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			Use system global default
+
+	Note that if you want to have a dedicated thread which handles
+	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
+	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
+	the SIGBUS is sent to the main thread.
+
+PR_MCE_KILL_GET
+	return current mode
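+
+A minimal userspace sketch of opting a thread into early kill (not from
+the kernel tree; the constant fallbacks mirror their values in
+include/uapi/linux/prctl.h in case the libc headers lack them)::
+
+	#include <stdio.h>
+	#include <sys/prctl.h>
+
+	#ifndef PR_MCE_KILL
+	#define PR_MCE_KILL		33
+	#define PR_MCE_KILL_SET		1
+	#define PR_MCE_KILL_EARLY	1
+	#define PR_MCE_KILL_GET		34
+	#endif
+
+	int main(void)
+	{
+		/* Ask for SIGBUS(BUS_MCEERR_AO) as soon as poison is found. */
+		if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY,
+			  0, 0))
+			perror("PR_MCE_KILL");
+
+		printf("mode: %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));
+		return 0;
+	}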
+
+  Note these injection interfaces are not stable and might change between
+  kernel versions.
+
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	Only handle memory failures to pages associated with the file
+	system defined by block device major/minor. -1U is the
+	wildcard value. This should only be used for testing with
+	artificial injection.
+
+  corrupt-filter-memcg
+	Limit injection to pages owned by a memory cgroup, specified
+	by the inode number of the memcg.
+
+	Example::
+
+		mkdir /sys/fs/cgroup/mem/hwpoison
+
+		usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
+
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	When specified, only poison pages if ((page_flags & mask) ==
+	value). This allows stress testing of many kinds of
+	pages. The page_flags are the same as in /proc/kpageflags. The
+	flag bits are defined in include/linux/kernel-page-flags.h and
+	documented in Documentation/vm/pagemap.rst
+
+* Architecture specific MCE injector
+
+  x86 has mce-inject, mce-test
+
+  Some portable hwpoison test programs in mce-test, see below.
+
+References
+==========
+
+http://halobates.de/mce-lc09-2.pdf
+	Overview presentation from LinuxCon 09
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	Test suite (hwpoison specific portable tests in tsrc)
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86 specific injector
+
+
+Limitations
+===========
+- Not all page types are supported and never will be. Most kernel internal
+  objects cannot be recovered, only LRU pages for now.
+- Right now hugepage support is missing.
+
+---
+Andi Kleen, Oct 2009
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
deleted file mode 100644
index b1a8c241d6c2..000000000000
--- a/Documentation/vm/hwpoison.txt
+++ /dev/null
@@ -1,186 +0,0 @@
-.. hwpoison:
-
-========
-hwpoison
-========
-
-What is hwpoison?
-=================
-
-Upcoming Intel CPUs have support for recovering from some memory errors
-(``MCA recovery``). This requires the OS to declare a page "poisoned",
-kill the processes associated with it and avoid using it in the future.
-
-This patchkit implements the necessary infrastructure in the VM.
-
-To quote the overview comment:
-
- * High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
- * failure.
- *
- * This focusses on pages detected as corrupted in the background.
- * When the current CPU tries to consume corruption the currently
- * running process can just be killed directly instead. This implies
- * that if the error cannot be handled for some reason it's safe to
- * just ignore it because no corruption has been consumed yet. Instead
- * when that happens another machine check will happen.
- *
- * Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time. 
- * - * Some of the operations here are somewhat inefficient and have non - * linear algorithmic complexity, because the data structures have not - * been optimized for this case. This is in particular the case - * for the mapping from a vma to a process. Since this case is expected - * to be rare we hope we can get away with this. - -The code consists of a the high level handler in mm/memory-failure.c, -a new page poison bit and various checks in the VM to handle poisoned -pages. - -The main target right now is KVM guests, but it works for all kinds -of applications. KVM support requires a recent qemu-kvm release. - -For the KVM use there was need for a new signal type so that -KVM can inject the machine check into the guest with the proper -address. This in theory allows other applications to handle -memory failures too. The expection is that near all applications -won't do that, but some very specialized ones might. - -Failure recovery modes -====================== - -There are two (actually three) modes memory failure recovery can be in: - -vm.memory_failure_recovery sysctl set to zero: - All memory failures cause a panic. Do not attempt recovery. - (on x86 this can be also affected by the tolerant level of the - MCE subsystem) - -early kill - (can be controlled globally and per process) - Send SIGBUS to the application as soon as the error is detected - This allows applications who can process memory errors in a gentle - way (e.g. drop affected object) - This is the mode used by KVM qemu. - -late kill - Send SIGBUS when the application runs into the corrupted page. - This is best for memory error unaware applications and default - Note some pages are always handled as late kill. - -User control -============ - -vm.memory_failure_recovery - See sysctl.txt - -vm.memory_failure_early_kill - Enable early kill mode globally - -PR_MCE_KILL - Set early/late kill mode/revert to system default - - arg1: PR_MCE_KILL_CLEAR: - Revert to system default - arg1: PR_MCE_KILL_SET: - arg2 defines thread specific mode - - PR_MCE_KILL_EARLY: - Early kill - PR_MCE_KILL_LATE: - Late kill - PR_MCE_KILL_DEFAULT - Use system global default - - Note that if you want to have a dedicated thread which handles - the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should - call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, - the SIGBUS is sent to the main thread. - -PR_MCE_KILL_GET - return current mode - -Testing -======= - -* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the - process for testing - -* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/`` - - corrupt-pfn - Inject hwpoison fault at PFN echoed into this file. This does - some early filtering to avoid corrupted unintended pages in test suites. - - unpoison-pfn - Software-unpoison page at PFN echoed into this file. This way - a page can be reused again. This only works for Linux - injected failures, not for real memory failures. - - Note these injection interfaces are not stable and might change between - kernel versions - - corrupt-filter-dev-major, corrupt-filter-dev-minor - Only handle memory failures to pages associated with the file - system defined by block device major/minor. -1U is the - wildcard value. This should be only used for testing with - artificial injection. - - corrupt-filter-memcg - Limit injection to pages owned by memgroup. Specified by inode - number of the memcg. 
-
-	Example::
-
-		mkdir /sys/fs/cgroup/mem/hwpoison
-
-		usemem -m 100 -s 1000 &
-		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-
-		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-
-		page-types -p `pidof init`   --hwpoison  # shall do nothing
-		page-types -p `pidof usemem` --hwpoison  # poison its pages
-
-  corrupt-filter-flags-mask, corrupt-filter-flags-value
-	When specified, only poison pages if ((page_flags & mask) ==
-	value). This allows stress testing of many kinds of
-	pages. The page_flags are the same as in /proc/kpageflags. The
-	flag bits are defined in include/linux/kernel-page-flags.h and
-	documented in Documentation/vm/pagemap.txt
-
-* Architecture specific MCE injector
-
-  x86 has mce-inject, mce-test
-
-  Some portable hwpoison test programs in mce-test, see below.
-
-References
-==========
-
-http://halobates.de/mce-lc09-2.pdf
-	Overview presentation from LinuxCon 09
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
-	Test suite (hwpoison specific portable tests in tsrc)
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
-	x86 specific injector
-
-
-Limitations
-===========
-- Not all page types are supported and never will. Most kernel internal
-  objects cannot be recovered, only LRU pages for now.
-- Right now hugepage support is missing.
-
----
-Andi Kleen, Oct 2009
diff --git a/Documentation/vm/idle_page_tracking.rst b/Documentation/vm/idle_page_tracking.rst
new file mode 100644
index 000000000000..d1c4609a5220
--- /dev/null
+++ b/Documentation/vm/idle_page_tracking.rst
@@ -0,0 +1,115 @@
+.. _idle_page_tracking:
+
+==================
+Idle Page Tracking
+==================
+
+Motivation
+==========
+
+The idle page tracking feature allows tracking which memory pages are being
+accessed by a workload and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+.. _user_api:
+
+User API
+========
+
+The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
+Currently, it consists of a single read-write file,
+``/sys/kernel/mm/page_idle/bitmap``.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64; the byte order is native. When a
+bit is set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the :ref:`Implementation
+Details <impl_details>` section).
+To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, and swap cache pages. For
+other page types (e.g. SLAB pages) an attempt to mark a page idle is silently
+ignored, and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+``/proc/kpageflags`` in order to correctly count idle huge pages.
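+
+As an illustration of the bit layout just described, here is a minimal
+user-space sketch (not part of the kernel sources; it assumes root and
+takes the PFN as its argument) that marks a single page idle. Note the
+8-byte access rules spelled out in the next paragraph::
+
+	/* mark_idle.c - set the idle bit for one PFN. */
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <unistd.h>
+
+	int main(int argc, char **argv)
+	{
+		if (argc != 2) {
+			fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
+			return 1;
+		}
+		uint64_t pfn = strtoull(argv[1], NULL, 0);
+		/* PFN #i maps to bit #i%64 of 8-byte word #i/64. */
+		uint64_t word = 1ULL << (pfn % 64);
+		off_t offset = (pfn / 64) * 8;
+
+		int fd = open("/sys/kernel/mm/page_idle/bitmap", O_WRONLY);
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* The written value is OR-ed into the bitmap, so bits for
+		 * other pages in the same word are left untouched. */
+		if (pwrite(fd, &word, sizeof(word), offset) != sizeof(word))
+			perror("pwrite");
+		close(fd);
+		return 0;
+	}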
+
+Reading from or writing to ``/sys/kernel/mm/page_idle/bitmap`` will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the number of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    ``/sys/kernel/mm/page_idle/bitmap``. The pages can be found by reading
+    ``/proc/pid/pagemap`` if the workload is represented by a process, or by
+    filtering out alien pages using ``/proc/kpagecgroup`` in case the workload
+    is placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read ``/sys/kernel/mm/page_idle/bitmap`` and count the number of bits set.
+    If one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using
+    ``/proc/kpageflags``. (A rough sketch of this step appears at the end of
+    this file.)
+
+See Documentation/vm/pagemap.rst for more information about
+``/proc/pid/pagemap``, ``/proc/kpageflags``, and ``/proc/kpagecgroup``.
+
+.. _impl_details:
+
+Implementation Details
+======================
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed
+bit set, or if it has been marked accessed explicitly by the kernel (see
+mark_page_accessed()). The latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
+:ref:`User API <user_api>`
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list; other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
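+
+To make step 3 of the procedure in the :ref:`User API <user_api>` section
+concrete, here is a rough sketch (again not part of the kernel sources; the
+fixed PFN range is a stand-in for the set of pages one would really obtain
+from ``/proc/pid/pagemap``)::
+
+	/* count_idle.c - count idle pages in a PFN range. */
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		/* Hypothetical, word-aligned PFN range. */
+		const uint64_t start_pfn = 0x100000, end_pfn = 0x101000;
+		uint64_t buf, idle = 0;
+
+		int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* Reads must be 8-byte sized and aligned; each word
+		 * covers 64 PFNs. */
+		for (uint64_t pfn = start_pfn; pfn < end_pfn; pfn += 64) {
+			if (pread(fd, &buf, sizeof(buf), (pfn / 64) * 8)
+			    != sizeof(buf))
+				break;
+			idle += __builtin_popcountll(buf);
+		}
+		printf("%llu idle pages\n", (unsigned long long)idle);
+		close(fd);
+		return 0;
+	}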
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
deleted file mode 100644
index 9cbe6f8d7a99..000000000000
--- a/Documentation/vm/idle_page_tracking.txt
+++ /dev/null
@@ -1,115 +0,0 @@
-.. _idle_page_tracking:
-
-==================
-Idle Page Tracking
-==================
-
-Motivation
-==========
-
-The idle page tracking feature allows to track which memory pages are being
-accessed by a workload and which are idle. This information can be useful for
-estimating the workload's working set size, which, in turn, can be taken into
-account when configuring the workload parameters, setting memory cgroup limits,
-or deciding where to place the workload within a compute cluster.
-
-It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
-
-.. _user_api:
-
-User API
-========
-
-The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
-Currently, it consists of the only read-write file,
-``/sys/kernel/mm/page_idle/bitmap``.
-
-The file implements a bitmap where each bit corresponds to a memory page. The
-bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
-mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
-set, the corresponding page is idle.
-
-A page is considered idle if it has not been accessed since it was marked idle
-(for more details on what "accessed" actually means see the :ref:`Implementation
-Details <impl_details>` section).
-To mark a page idle one has to set the bit corresponding to
-the page by writing to the file. A value written to the file is OR-ed with the
-current bitmap value.
-
-Only accesses to user memory pages are tracked. These are pages mapped to a
-process address space, page cache and buffer pages, swap cache pages. For other
-page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
-and hence such pages are never reported idle.
-
-For huge pages the idle flag is set only on the head page, so one has to read
-``/proc/kpageflags`` in order to correctly count idle huge pages.
-
-Reading from or writing to ``/sys/kernel/mm/page_idle/bitmap`` will return
--EINVAL if you are not starting the read/write on an 8-byte boundary, or
-if the size of the read/write is not a multiple of 8 bytes. Writing to
-this file beyond max PFN will return -ENXIO.
-
-That said, in order to estimate the amount of pages that are not used by a
-workload one should:
-
- 1. Mark all the workload's pages as idle by setting corresponding bits in
-    ``/sys/kernel/mm/page_idle/bitmap``. The pages can be found by reading
-    ``/proc/pid/pagemap`` if the workload is represented by a process, or by
-    filtering out alien pages using ``/proc/kpagecgroup`` in case the workload
-    is placed in a memory cgroup.
-
- 2. Wait until the workload accesses its working set.
-
- 3. Read ``/sys/kernel/mm/page_idle/bitmap`` and count the number of bits set.
-    If one wants to ignore certain types of pages, e.g. mlocked pages since they
-    are not reclaimable, he or she can filter them out using
-    ``/proc/kpageflags``.
-
-See Documentation/vm/pagemap.txt for more information about
-``/proc/pid/pagemap``, ``/proc/kpageflags``, and ``/proc/kpagecgroup``.
-
-.. _impl_details:
-
-Implementation Details
-======================
-
-The kernel internally keeps track of accesses to user memory pages in order to
-reclaim unreferenced pages first on memory shortage conditions. 
A page is
-considered referenced if it has been recently accessed via a process address
-space, in which case one or more PTEs it is mapped to will have the Accessed bit
-set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
-latter happens when:
-
- - a userspace process reads or writes a page using a system call (e.g. read(2)
-   or write(2))
-
- - a page that is used for storing filesystem buffers is read or written,
-   because a process needs filesystem metadata stored in it (e.g. lists a
-   directory tree)
-
- - a page is accessed by a device driver using get_user_pages()
-
-When a dirty page is written to swap or disk as a result of memory reclaim or
-exceeding the dirty memory limit, it is not marked referenced.
-
-The idle memory tracking feature adds a new page flag, the Idle flag. This flag
-is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
-:ref:`User API <user_api>`
-section), and cleared automatically whenever a page is referenced as defined
-above.
-
-When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
-mapped to, otherwise we will not be able to detect accesses to the page coming
-from a process address space. To avoid interference with the reclaimer, which,
-as noted above, uses the Accessed bit to promote actively referenced pages, one
-more page flag is introduced, the Young flag. When the PTE Accessed bit is
-cleared as a result of setting or updating a page's Idle flag, the Young flag
-is set on the page. The reclaimer treats the Young flag as an extra PTE
-Accessed bit and therefore will consider such a page as referenced.
-
-Since the idle memory tracking feature is based on the memory reclaimer logic,
-it only works with pages that are on an LRU list, other pages are silently
-ignored. That means it will ignore a user memory page if it is isolated, but
-since there are usually not many of them, it should not affect the overall
-result noticeably. In order not to stall scanning of the idle page bitmap,
-locked pages may be skipped too.
diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst
new file mode 100644
index 000000000000..87e7eef5ea9c
--- /dev/null
+++ b/Documentation/vm/ksm.rst
@@ -0,0 +1,183 @@
+.. _ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
+
+The KSM daemon ksmd periodically scans those areas of user memory which
+have been registered with it, looking for pages of identical content which
+can be replaced by a single write-protected page (which is automatically
+copied if a process later wants to update its content).
+
+KSM was originally developed for use with KVM (where it was known as
+Kernel Shared Memory), to fit more virtual machines into physical memory,
+by sharing the data common between them. But it can be useful to any
+application which generates many instances of the same data.
+
+KSM only merges anonymous (private) pages, never pagecache (file) pages.
+KSM's merged pages were originally locked into kernel memory, but can now
+be swapped out just like other user pages (but sharing is broken when they
+are swapped back in: ksmd must rediscover their identity and merge again).
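+
+A minimal sketch of the registration interface described in the following
+paragraphs (the mapping size is arbitrary, and a real application would
+advise only areas it expects to contain duplicate data)::
+
+	/* ksm_hint.c - register an anonymous mapping with KSM. */
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	int main(void)
+	{
+		size_t len = 64 * 1024 * 1024; /* arbitrary demo size */
+		void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (addr == MAP_FAILED) {
+			perror("mmap");
+			return 1;
+		}
+		/* Fill with identical pages so ksmd has something to merge. */
+		memset(addr, 0x5a, len);
+
+		/* Ask KSM to consider this range; this fails with EINVAL
+		 * if the kernel was built without CONFIG_KSM. */
+		if (madvise(addr, len, MADV_MERGEABLE))
+			perror("madvise(MADV_MERGEABLE)");
+
+		/* ... run the workload; MADV_UNMERGEABLE undoes the hint. */
+		return 0;
+	}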
+
+KSM only operates on those areas of address space which an application
+has advised to be likely candidates for merging, by using the madvise(2)
+system call: int madvise(addr, length, MADV_MERGEABLE).
+
+The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
+that advice and restore unshared pages: whereupon KSM unmerges whatever
+it merged in that range. Note: this unmerging call may suddenly require
+more memory than is available - possibly failing with EAGAIN, but more
+probably arousing the Out-Of-Memory killer.
+
+If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
+and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was
+built with CONFIG_KSM=y, those calls will normally succeed: even if
+the KSM daemon is not currently running, MADV_MERGEABLE still registers
+the range for whenever the KSM daemon is started; even if the range
+cannot contain any pages which KSM could actually merge; even if
+MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
+
+If a region of memory must be split into at least one new MADV_MERGEABLE
+or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
+will exceed vm.max_map_count (see Documentation/sysctl/vm.txt).
+
+Like other madvise calls, they are intended for use on mapped areas of
+the user address space: they will report ENOMEM if the specified range
+includes unmapped gaps (though working on the intervening mapped areas),
+and might fail with EAGAIN if there is not enough memory for internal
+structures.
+
+Applications should be considerate in their use of MADV_MERGEABLE,
+restricting its use to areas likely to benefit. KSM's scans may use a lot
+of processing power: some installations will disable KSM for that reason.
+
+The KSM daemon is controlled by sysfs files in ``/sys/kernel/mm/ksm/``,
+readable by all but writable only by root:
+
+pages_to_scan
+	how many present pages to scan before ksmd goes to sleep,
+	e.g. ``echo 100 > /sys/kernel/mm/ksm/pages_to_scan``. Default: 100
+	(chosen for demonstration purposes)
+
+sleep_millisecs
+	how many milliseconds ksmd should sleep before the next scan,
+	e.g. ``echo 20 > /sys/kernel/mm/ksm/sleep_millisecs``. Default: 20
+	(chosen for demonstration purposes)
+
+merge_across_nodes
+	specifies if pages from different NUMA nodes can be merged.
+	When set to 0, ksm merges only pages which physically reside
+	in the memory area of the same NUMA node. That brings lower
+	latency to accesses of shared pages. Systems with more nodes, at
+	significant NUMA distances, are likely to benefit from the
+	lower latency of setting 0. Smaller systems, which need to
+	minimize memory usage, are likely to benefit from the greater
+	sharing of setting 1 (default). You may wish to compare how
+	your system performs under each setting, before deciding on
+	which to use. The merge_across_nodes setting can be changed only
+	when there are no ksm shared pages in the system: set run to 2
+	to unmerge pages first, then back to 1 after changing
+	merge_across_nodes, to remerge according to the new setting.
+	Default: 1 (merging across nodes as in earlier releases)
+
+run
+	set 0 to stop ksmd from running but keep merged pages,
+	set 1 to run ksmd e.g. ``echo 1 > /sys/kernel/mm/ksm/run``,
+	set 2 to stop ksmd and unmerge all pages currently merged, but
+	leave mergeable areas registered for next run. Default: 0 (must
+	be changed to 1 to activate KSM, except if CONFIG_SYSFS is
+	disabled)
+
+use_zero_pages
+	specifies whether empty pages (i.e. 
allocated pages that only
+	contain zeroes) should be treated specially. When set to 1,
+	empty pages are merged with the kernel zero page(s) instead of
+	with each other as it would happen normally. This can improve
+	the performance on architectures with coloured zero pages,
+	depending on the workload. Care should be taken when enabling
+	this setting, as it can potentially degrade the performance of
+	KSM for some workloads, for example if the checksums of pages
+	that are candidates for merging match the checksum of an empty
+	page. This setting can be changed at any time; it is only
+	effective for pages merged after the change. Default: 0
+	(normal KSM behaviour as in earlier releases)
+
+max_page_sharing
+	Maximum sharing allowed for each KSM page. This enforces a
+	deduplication limit to keep the virtual memory rmap lists from
+	growing too large. The minimum value is 2 as a newly created KSM
+	page will have at least two sharers. The rmap walk has O(N)
+	complexity where N is the number of rmap_items (i.e. virtual
+	mappings) that are sharing the page, which is in turn capped
+	by max_page_sharing. So this effectively spreads the linear
+	O(N) computational complexity from rmap walk context over
+	different KSM pages. The ksmd walk over the stable_node
+	"chains" is also O(N), but N is the number of stable_node
+	"dups", not the number of rmap_items, so it does not have a
+	significant impact on ksmd performance. In practice the best
+	stable_node "dup" candidate will be kept and found at the head
+	of the "dups" list. The higher this value the faster KSM will
+	merge the memory (because there will be fewer stable_node dups
+	queued into the stable_node chain->hlist to check for pruning)
+	and the higher the deduplication factor will be, but the
+	slower the worst case rmap walk could be for any given KSM
+	page. Slowing down the rmap_walk means there will be higher
+	latency for certain virtual memory operations happening during
+	swapping, compaction, NUMA balancing and page migration, in
+	turn decreasing responsiveness for the caller of those virtual
+	memory operations. The scheduler latency of other tasks not
+	involved with the VM operations doing the rmap walk is not
+	affected by this parameter as the rmap walks are always
+	schedule friendly themselves.
+
+stable_node_chains_prune_millisecs
+	How frequently to walk the whole list of stable_node "dups"
+	linked in the stable_node "chains" in order to prune stale
+	stable_nodes. Smaller millisecs values will free up the KSM
+	metadata with lower latency, but they will make ksmd use more
+	CPU during the scan. This only applies to the stable_node
+	chains so it is a no-op if no KSM page has hit the
+	max_page_sharing limit yet (there would be no stable_node
+	chains in that case).
+
+The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
+
+pages_shared
+	how many shared pages are being used
+pages_sharing
+	how many more sites are sharing them i.e. 
how much saved
+pages_unshared
+	how many pages are unique but repeatedly checked for merging
+pages_volatile
+	how many pages are changing too fast to be placed in a tree
+full_scans
+	how many times all mergeable areas have been scanned
+stable_node_chains
+	number of stable node chains allocated, this is effectively
+	the number of KSM pages that hit the max_page_sharing limit
+stable_node_dups
+	number of stable node dups queued into the stable_node chains
+
+A high ratio of pages_sharing to pages_shared indicates good sharing, but
+a high ratio of pages_unshared to pages_sharing indicates wasted effort.
+pages_volatile embraces several different kinds of activity, but a high
+proportion there would also indicate poor use of madvise MADV_MERGEABLE.
+
+The maximum possible pages_sharing/pages_shared ratio is limited by the
+max_page_sharing tunable. To increase the ratio max_page_sharing must
+be increased accordingly.
+
+The stable_node_dups/stable_node_chains ratio is also affected by the
+max_page_sharing tunable, and a high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to free up
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slow down the read-only computations on
+the KSM pages of the applications.
+
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
deleted file mode 100644
index 87e7eef5ea9c..000000000000
--- a/Documentation/vm/ksm.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-.. _ksm:
-
-=======================
-Kernel Samepage Merging
-=======================
-
-KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
-added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation,
-and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
-
-The KSM daemon ksmd periodically scans those areas of user memory which
-have been registered with it, looking for pages of identical content which
-can be replaced by a single write-protected page (which is automatically
-copied if a process later wants to update its content).
-
-KSM was originally developed for use with KVM (where it was known as
-Kernel Shared Memory), to fit more virtual machines into physical memory,
-by sharing the data common between them. But it can be useful to any
-application which generates many instances of the same data.
-
-KSM only merges anonymous (private) pages, never pagecache (file) pages.
-KSM's merged pages were originally locked into kernel memory, but can now
-be swapped out just like other user pages (but sharing is broken when they
-are swapped back in: ksmd must rediscover their identity and merge again).
-
-KSM only operates on those areas of address space which an application
-has advised to be likely candidates for merging, by using the madvise(2)
-system call: int madvise(addr, length, MADV_MERGEABLE).
-
-The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
-that advice and restore unshared pages: whereupon KSM unmerges whatever
-it merged in that range. Note: this unmerging call may suddenly require
-more memory than is available - possibly failing with EAGAIN, but more
-probably arousing the Out-Of-Memory killer.
-
-If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
-and MADV_UNMERGEABLE simply fail with EINVAL. 
If the running kernel was -built with CONFIG_KSM=y, those calls will normally succeed: even if the -the KSM daemon is not currently running, MADV_MERGEABLE still registers -the range for whenever the KSM daemon is started; even if the range -cannot contain any pages which KSM could actually merge; even if -MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. - -If a region of memory must be split into at least one new MADV_MERGEABLE -or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process -will exceed vm.max_map_count (see Documentation/sysctl/vm.txt). - -Like other madvise calls, they are intended for use on mapped areas of -the user address space: they will report ENOMEM if the specified range -includes unmapped gaps (though working on the intervening mapped areas), -and might fail with EAGAIN if not enough memory for internal structures. - -Applications should be considerate in their use of MADV_MERGEABLE, -restricting its use to areas likely to benefit. KSM's scans may use a lot -of processing power: some installations will disable KSM for that reason. - -The KSM daemon is controlled by sysfs files in ``/sys/kernel/mm/ksm/``, -readable by all but writable only by root: - -pages_to_scan - how many present pages to scan before ksmd goes to sleep - e.g. ``echo 100 > /sys/kernel/mm/ksm/pages_to_scan`` Default: 100 - (chosen for demonstration purposes) - -sleep_millisecs - how many milliseconds ksmd should sleep before next scan - e.g. ``echo 20 > /sys/kernel/mm/ksm/sleep_millisecs`` Default: 20 - (chosen for demonstration purposes) - -merge_across_nodes - specifies if pages from different numa nodes can be merged. - When set to 0, ksm merges only pages which physically reside - in the memory area of same NUMA node. That brings lower - latency to access of shared pages. Systems with more nodes, at - significant NUMA distances, are likely to benefit from the - lower latency of setting 0. Smaller systems, which need to - minimize memory usage, are likely to benefit from the greater - sharing of setting 1 (default). You may wish to compare how - your system performs under each setting, before deciding on - which to use. merge_across_nodes setting can be changed only - when there are no ksm shared pages in system: set run 2 to - unmerge pages first, then to 1 after changing - merge_across_nodes, to remerge according to the new setting. - Default: 1 (merging across nodes as in earlier releases) - -run - set 0 to stop ksmd from running but keep merged pages, - set 1 to run ksmd e.g. ``echo 1 > /sys/kernel/mm/ksm/run``, - set 2 to stop ksmd and unmerge all pages currently merged, but - leave mergeable areas registered for next run Default: 0 (must - be changed to 1 to activate KSM, except if CONFIG_SYSFS is - disabled) - -use_zero_pages - specifies whether empty pages (i.e. allocated pages that only - contain zeroes) should be treated specially. When set to 1, - empty pages are merged with the kernel zero page(s) instead of - with each other as it would happen normally. This can improve - the performance on architectures with coloured zero pages, - depending on the workload. Care should be taken when enabling - this setting, as it can potentially degrade the performance of - KSM for some workloads, for example if the checksums of pages - candidate for merging match the checksum of an empty - page. This setting can be changed at any time, it is only - effective for pages merged after the change. 
Default: 0 - (normal KSM behaviour as in earlier releases) - -max_page_sharing - Maximum sharing allowed for each KSM page. This enforces a - deduplication limit to avoid the virtual memory rmap lists to - grow too large. The minimum value is 2 as a newly created KSM - page will have at least two sharers. The rmap walk has O(N) - complexity where N is the number of rmap_items (i.e. virtual - mappings) that are sharing the page, which is in turn capped - by max_page_sharing. So this effectively spread the the linear - O(N) computational complexity from rmap walk context over - different KSM pages. The ksmd walk over the stable_node - "chains" is also O(N), but N is the number of stable_node - "dups", not the number of rmap_items, so it has not a - significant impact on ksmd performance. In practice the best - stable_node "dup" candidate will be kept and found at the head - of the "dups" list. The higher this value the faster KSM will - merge the memory (because there will be fewer stable_node dups - queued into the stable_node chain->hlist to check for pruning) - and the higher the deduplication factor will be, but the - slowest the worst case rmap walk could be for any given KSM - page. Slowing down the rmap_walk means there will be higher - latency for certain virtual memory operations happening during - swapping, compaction, NUMA balancing and page migration, in - turn decreasing responsiveness for the caller of those virtual - memory operations. The scheduler latency of other tasks not - involved with the VM operations doing the rmap walk is not - affected by this parameter as the rmap walks are always - schedule friendly themselves. - -stable_node_chains_prune_millisecs - How frequently to walk the whole list of stable_node "dups" - linked in the stable_node "chains" in order to prune stale - stable_nodes. Smaller milllisecs values will free up the KSM - metadata with lower latency, but they will make ksmd use more - CPU during the scan. This only applies to the stable_node - chains so it's a noop if not a single KSM page hit the - max_page_sharing yet (there would be no stable_node chains in - such case). - -The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``: - -pages_shared - how many shared pages are being used -pages_sharing - how many more sites are sharing them i.e. how much saved -pages_unshared - how many pages unique but repeatedly checked for merging -pages_volatile - how many pages changing too fast to be placed in a tree -full_scans - how many times all mergeable areas have been scanned -stable_node_chains - number of stable node chains allocated, this is effectively - the number of KSM pages that hit the max_page_sharing limit -stable_node_dups - number of stable node dups queued into the stable_node chains - -A high ratio of pages_sharing to pages_shared indicates good sharing, but -a high ratio of pages_unshared to pages_sharing indicates wasted effort. -pages_volatile embraces several different kinds of activity, but a high -proportion there would also indicate poor use of madvise MADV_MERGEABLE. - -The maximum possible page_sharing/page_shared ratio is limited by the -max_page_sharing tunable. To increase the ratio max_page_sharing must -be increased accordingly. 
-
-The stable_node_dups/stable_node_chains ratio is also affected by the
-max_page_sharing tunable, and an high ratio may indicate fragmentation
-in the stable_node dups, which could be solved by introducing
-fragmentation algorithms in ksmd which would refile rmap_items from
-one stable_node dup to another stable_node dup, in order to freeup
-stable_node "dups" with few rmap_items in them, but that may increase
-the ksmd CPU usage and possibly slowdown the readonly computations on
-the KSM pages of the applications.
-
-Izik Eidus,
-Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst
new file mode 100644
index 000000000000..47baa1cf28c5
--- /dev/null
+++ b/Documentation/vm/mmu_notifier.rst
@@ -0,0 +1,99 @@
+.. _mmu_notifier:
+
+When do you need to notify inside page table lock?
+===================================================
+
+When clearing a pte/pmd we are given a choice to notify the event under the
+page table lock (the notify version of \*_clear_flush calls
+mmu_notifier_invalidate_range). But that notification is not necessary in
+all cases.
+
+For secondary TLBs (non-CPU TLBs), such as IOMMU TLBs or device TLBs (where
+the device uses something like ATS/PASID to get the IOMMU to walk the CPU
+page table to access a process virtual address space), there are only 2
+cases when you need to notify those secondary TLBs while holding the page
+table lock when clearing a pte/pmd:
+
+  A) the page backing the address is freed before
+     mmu_notifier_invalidate_range_end()
+  B) a page table entry is updated to point to a new page (COW, write fault
+     on zero page, __replace_page(), ...)
+
+Case A is obvious: you do not want to take the risk of the device writing to
+a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence to
+happen:
+
+  - take page table lock
+  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+  - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break the memory model (e.g. C11 or
+C++11) for the device.
+
+Consider the following scenario (the device uses a feature similar to
+ATS/PASID):
+
+Take two addresses addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE;
+we assume they are write protected for COW (the other cases of B apply too).
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {try to write to addrA}
+ CPU-thread-1  {try to write to addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA and populate device TLB}
+ DEV-thread-2  {read addrB and populate device TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
+ CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {write to addrA which is a write to new page}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {write to addrB which is a write to new page}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+6] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA from old page}
+ DEV-thread-2  {read addrB from new page}
+
+So here, because at time N+2 the cleared page table entry was not paired with
+a notification to invalidate the secondary TLB, the device sees the new value
+for addrB before seeing the new value for addrA. This breaks total memory
+ordering for the device.
+
+When changing a pte to write protect or to point to a new write protected page
+with the same content (KSM) it is fine to delay the
+mmu_notifier_invalidate_range call to mmu_notifier_invalidate_range_end()
+outside the page table lock. This is true even if the thread doing the page
+table update is preempted right after releasing the page table lock but
+before calling mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt
deleted file mode 100644
index 47baa1cf28c5..000000000000
--- a/Documentation/vm/mmu_notifier.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-.. _mmu_notifier:
-
-When do you need to notify inside page table lock ?
-===================================================
-
-When clearing a pte/pmd we are given a choice to notify the event through
-(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
-the page table lock. But that notification is not necessary in all cases.
-
-For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
-thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
-process virtual address space). 
There is only 2 cases when you need to notify -those secondary TLB while holding page table lock when clearing a pte/pmd: - - A) page backing address is free before mmu_notifier_invalidate_range_end() - B) a page table entry is updated to point to a new page (COW, write fault - on zero page, __replace_page(), ...) - -Case A is obvious you do not want to take the risk for the device to write to -a page that might now be used by some completely different task. - -Case B is more subtle. For correctness it requires the following sequence to -happen: - - - take page table lock - - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) - - set page table entry to point to new page - -If clearing the page table entry is not followed by a notify before setting -the new pte/pmd value then you can break memory model like C11 or C++11 for -the device. - -Consider the following scenario (device use a feature similar to ATS/PASID): - -Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume -they are write protected for COW (other case of B apply too). - -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {try to write to addrA} - CPU-thread-1 {try to write to addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA and populate device TLB} - DEV-thread-2 {read addrB and populate device TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] ------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} - CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {write to addrA which is a write to new page} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {write to addrB which is a write to new page} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA from old page} - DEV-thread-2 {read addrB from new page} - -So here because at time N+2 the clear page table entry was not pair with a -notification to invalidate the secondary TLB, the device see the new value for -addrB before seing the new value for addrA. This break total memory ordering -for the device. - -When changing a pte to write protect or to point to a new write protected page -with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range -call to mmu_notifier_invalidate_range_end() outside the page table lock. 
This -is true even if the thread doing the page table update is preempted right after -releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/Documentation/vm/numa b/Documentation/vm/numa deleted file mode 100644 index c81e7c56f0f9..000000000000 --- a/Documentation/vm/numa +++ /dev/null @@ -1,150 +0,0 @@ -.. _numa: - -Started Nov 1999 by Kanoj Sarcar - -============= -What is NUMA? -============= - -This question can be answered from a couple of perspectives: the -hardware view and the Linux software view. - -From the hardware perspective, a NUMA system is a computer platform that -comprises multiple components or assemblies each of which may contain 0 -or more CPUs, local memory, and/or IO buses. For brevity and to -disambiguate the hardware view of these physical components/assemblies -from the software abstraction thereof, we'll call the components/assemblies -'cells' in this document. - -Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset -of the system--although some components necessary for a stand-alone SMP system -may not be populated on any given cell. The cells of the NUMA system are -connected together with some sort of system interconnect--e.g., a crossbar or -point-to-point link are common types of NUMA system interconnects. Both of -these types of interconnects can be aggregated to create NUMA platforms with -cells at multiple distances from other cells. - -For Linux, the NUMA platforms of interest are primarily what is known as Cache -Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible -to and accessible from any CPU attached to any cell and cache coherency -is handled in hardware by the processor caches and/or the system interconnect. - -Memory access time and effective memory bandwidth varies depending on how far -away the cell containing the CPU or IO bus making the memory access is from the -cell containing the target memory. For example, access to memory by CPUs -attached to the same cell will experience faster access times and higher -bandwidths than accesses to memory on other, remote cells. NUMA platforms -can have cells at multiple remote distances from any given cell. - -Platform vendors don't build NUMA systems just to make software developers' -lives interesting. Rather, this architecture is a means to provide scalable -memory bandwidth. However, to achieve scalable memory bandwidth, system and -application software must arrange for a large majority of the memory references -[cache misses] to be to "local" memory--memory on the same cell, if any--or -to the closest cell with memory. - -This leads to the Linux software view of a NUMA system: - -Linux divides the system's hardware resources into multiple software -abstractions called "nodes". Linux maps the nodes onto the physical cells -of the hardware platform, abstracting away some of the details for some -architectures. As with physical cells, software nodes may contain 0 or more -CPUs, memory and/or IO buses. And, again, memory accesses to memory on -"closer" nodes--nodes that map to closer cells--will generally experience -faster access times and higher effective bandwidth than accesses to more -remote cells. - -For some architectures, such as x86, Linux will "hide" any node representing a -physical cell that has no memory attached, and reassign any CPUs attached to -that cell to a node representing a cell that does have memory. 
Thus, on -these architectures, one cannot assume that all CPUs that Linux associates with -a given node will see the same local memory access times and bandwidth. - -In addition, for some architectures, again x86 is an example, Linux supports -the emulation of additional nodes. For NUMA emulation, linux will carve up -the existing nodes--or the system memory for non-NUMA platforms--into multiple -nodes. Each emulated node will manage a fraction of the underlying cells' -physical memory. NUMA emluation is useful for testing NUMA kernel and -application features on non-NUMA platforms, and as a sort of memory resource -management mechanism when used together with cpusets. -[see Documentation/cgroup-v1/cpusets.txt] - -For each node with memory, Linux constructs an independent memory management -subsystem, complete with its own free page lists, in-use page lists, usage -statistics and locks to mediate access. In addition, Linux constructs for -each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], -an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a -selected zone/node cannot satisfy the allocation request. This situation, -when a zone has no available memory to satisfy a request, is called -"overflow" or "fallback". - -Because some nodes contain multiple zones containing different types of -memory, Linux must decide whether to order the zonelists such that allocations -fall back to the same zone type on a different node, or to a different zone -type on the same node. This is an important consideration because some zones, -such as DMA or DMA32, represent relatively scarce resources. Linux chooses -a default Node ordered zonelist. This means it tries to fallback to other zones -from the same node before using remote nodes which are ordered by NUMA distance. - -By default, Linux will attempt to satisfy memory allocation requests from the -node to which the CPU that executes the request is assigned. Specifically, -Linux will attempt to allocate from the first node in the appropriate zonelist -for the node where the request originates. This is called "local allocation." -If the "local" node cannot satisfy the request, the kernel will examine other -nodes' zones in the selected zonelist looking for the first zone in the list -that can satisfy the request. - -Local allocation will tend to keep subsequent access to the allocated memory -"local" to the underlying physical resources and off the system interconnect-- -as long as the task on whose behalf the kernel allocated some memory does not -later migrate away from that memory. The Linux scheduler is aware of the -NUMA topology of the platform--embodied in the "scheduling domains" data -structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler -attempts to minimize task migration to distant scheduling domains. However, -the scheduler does not take a task's NUMA footprint into account directly. -Thus, under sufficient imbalance, tasks can migrate between nodes, remote -from their initial node and kernel data structures. - -System administrators and application designers can restrict a task's migration -to improve NUMA locality using various CPU affinity command line interfaces, -such as taskset(1) and numactl(1), and program interfaces such as -sched_setaffinity(2). Further, one can modify the kernel's default local -allocation behavior using Linux NUMA memory policy. -[see Documentation/vm/numa_memory_policy.txt.] 
- -System administrators can restrict the CPUs and nodes' memories that a non- -privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt] - -On architectures that do not hide memoryless nodes, Linux will include only -zones [nodes] with memory in the zonelists. This means that for a memoryless -node the "local memory node"--the node of the first zone in CPU's node's -zonelist--will not be the node itself. Rather, it will be the node that the -kernel selected as the nearest node with memory when it built the zonelists. -So, default, local allocations will succeed with the kernel supplying the -closest available memory. This is a consequence of the same mechanism that -allows such allocations to fallback to other nearby nodes when a node that -does contain memory overflows. - -Some kernel allocations do not want or cannot tolerate this allocation fallback -behavior. Rather they want to be sure they get memory from the specified node -or get notified that the node has no free memory. This is usually the case when -a subsystem allocates per CPU memory resources, for example. - -A typical model for making such an allocation is to obtain the node id of the -node to which the "current CPU" is attached using one of the kernel's -numa_node_id() or CPU_to_node() functions and then request memory from only -the node id returned. When such an allocation fails, the requesting subsystem -may revert to its own fallback path. The slab kernel memory allocator is an -example of this. Or, the subsystem may choose to disable or not to enable -itself on allocation failure. The kernel profiling subsystem is an example of -this. - -If the architecture supports--does not hide--memoryless nodes, then CPUs -attached to memoryless nodes would always incur the fallback path overhead -or some subsystems would fail to initialize if they attempted to allocated -memory exclusively from a node without memory. To support such -architectures transparently, kernel subsystems can use the numa_mem_id() -or cpu_to_mem() function to locate the "local memory node" for the calling or -specified CPU. Again, this is the same node from which default, local page -allocations will be attempted. diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst new file mode 100644 index 000000000000..aada84bc8c46 --- /dev/null +++ b/Documentation/vm/numa.rst @@ -0,0 +1,150 @@ +.. _numa: + +Started Nov 1999 by Kanoj Sarcar + +============= +What is NUMA? +============= + +This question can be answered from a couple of perspectives: the +hardware view and the Linux software view. + +From the hardware perspective, a NUMA system is a computer platform that +comprises multiple components or assemblies each of which may contain 0 +or more CPUs, local memory, and/or IO buses. For brevity and to +disambiguate the hardware view of these physical components/assemblies +from the software abstraction thereof, we'll call the components/assemblies +'cells' in this document. + +Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset +of the system--although some components necessary for a stand-alone SMP system +may not be populated on any given cell. The cells of the NUMA system are +connected together with some sort of system interconnect--e.g., a crossbar or +point-to-point link are common types of NUMA system interconnects. 
Both of +these types of interconnects can be aggregated to create NUMA platforms with +cells at multiple distances from other cells. + +For Linux, the NUMA platforms of interest are primarily what is known as Cache +Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible +to and accessible from any CPU attached to any cell and cache coherency +is handled in hardware by the processor caches and/or the system interconnect. + +Memory access time and effective memory bandwidth vary depending on how far +away the cell containing the CPU or IO bus making the memory access is from the +cell containing the target memory. For example, access to memory by CPUs +attached to the same cell will experience faster access times and higher +bandwidths than accesses to memory on other, remote cells. NUMA platforms +can have cells at multiple remote distances from any given cell. + +Platform vendors don't build NUMA systems just to make software developers' +lives interesting. Rather, this architecture is a means to provide scalable +memory bandwidth. However, to achieve scalable memory bandwidth, system and +application software must arrange for a large majority of the memory references +[cache misses] to be to "local" memory--memory on the same cell, if any--or +to the closest cell with memory. + +This leads to the Linux software view of a NUMA system: + +Linux divides the system's hardware resources into multiple software +abstractions called "nodes". Linux maps the nodes onto the physical cells +of the hardware platform, abstracting away some of the details for some +architectures. As with physical cells, software nodes may contain 0 or more +CPUs, memory and/or IO buses. And, again, memory accesses to memory on +"closer" nodes--nodes that map to closer cells--will generally experience +faster access times and higher effective bandwidth than accesses to more +remote cells. + +For some architectures, such as x86, Linux will "hide" any node representing a +physical cell that has no memory attached, and reassign any CPUs attached to +that cell to a node representing a cell that does have memory. Thus, on +these architectures, one cannot assume that all CPUs that Linux associates with +a given node will see the same local memory access times and bandwidth. + +In addition, for some architectures--again, x86 is an example--Linux supports +the emulation of additional nodes. For NUMA emulation, Linux will carve up +the existing nodes--or the system memory for non-NUMA platforms--into multiple +nodes. Each emulated node will manage a fraction of the underlying cells' +physical memory. NUMA emulation is useful for testing NUMA kernel and +application features on non-NUMA platforms, and as a sort of memory resource +management mechanism when used together with cpusets. +[see Documentation/cgroup-v1/cpusets.txt] + +For each node with memory, Linux constructs an independent memory management +subsystem, complete with its own free page lists, in-use page lists, usage +statistics and locks to mediate access. In addition, Linux constructs for +each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], +an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a +selected zone/node cannot satisfy the allocation request. This situation, +when a zone has no available memory to satisfy a request, is called +"overflow" or "fallback".
+ +Because some nodes contain multiple zones containing different types of +memory, Linux must decide whether to order the zonelists such that allocations +fall back to the same zone type on a different node, or to a different zone +type on the same node. This is an important consideration because some zones, +such as DMA or DMA32, represent relatively scarce resources. Linux chooses +a default Node ordered zonelist. This means it tries to fall back to other zones +from the same node before using remote nodes, which are ordered by NUMA distance. + +By default, Linux will attempt to satisfy memory allocation requests from the +node to which the CPU that executes the request is assigned. Specifically, +Linux will attempt to allocate from the first node in the appropriate zonelist +for the node where the request originates. This is called "local allocation." +If the "local" node cannot satisfy the request, the kernel will examine other +nodes' zones in the selected zonelist, looking for the first zone in the list +that can satisfy the request. + +Local allocation will tend to keep subsequent access to the allocated memory +"local" to the underlying physical resources and off the system interconnect-- +as long as the task on whose behalf the kernel allocated some memory does not +later migrate away from that memory. The Linux scheduler is aware of the +NUMA topology of the platform--embodied in the "scheduling domains" data +structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler +attempts to minimize task migration to distant scheduling domains. However, +the scheduler does not take a task's NUMA footprint into account directly. +Thus, under sufficient imbalance, tasks can migrate between nodes and end up +remote from their initial node and its kernel data structures. + +System administrators and application designers can restrict a task's migration +to improve NUMA locality using various CPU affinity command line interfaces, +such as taskset(1) and numactl(1), and program interfaces such as +sched_setaffinity(2). Further, one can modify the kernel's default local +allocation behavior using Linux NUMA memory policy. +[see Documentation/vm/numa_memory_policy.rst.] + +System administrators can restrict the CPUs and nodes' memories that a non- +privileged user can specify in the scheduling or NUMA commands and functions +using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt] + +On architectures that do not hide memoryless nodes, Linux will include only +zones [nodes] with memory in the zonelists. This means that for a memoryless +node the "local memory node"--the node of the first zone in the CPU's node's +zonelist--will not be the node itself. Rather, it will be the node that the +kernel selected as the nearest node with memory when it built the zonelists. +So, by default, local allocations will succeed with the kernel supplying the +closest available memory. This is a consequence of the same mechanism that +allows such allocations to fall back to other nearby nodes when a node that +does contain memory overflows. + +Some kernel allocations do not want or cannot tolerate this allocation fallback +behavior. Rather, they want to be sure they get memory from the specified node +or get notified that the node has no free memory. This is usually the case when +a subsystem allocates per CPU memory resources, for example.
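+
+As a hedged illustration of such a node-strict request (the helpers used
+here are introduced in the next paragraph, and BUF_SIZE is a made-up
+constant), a subsystem might ask for memory on the executing CPU's node
+and refuse remote fallback::
+
+	/* __GFP_THISNODE forbids falling back to other nodes' zonelists. */
+	int nid = numa_node_id();	/* node of the executing CPU */
+	void *buf = kmalloc_node(BUF_SIZE, GFP_KERNEL | __GFP_THISNODE, nid);
+
+	if (!buf)
+		pr_warn("node %d has no free local memory\n", nid);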
+ +A typical model for making such an allocation is to obtain the node id of the +node to which the "current CPU" is attached using one of the kernel's +numa_node_id() or cpu_to_node() functions and then request memory from only +the node id returned. When such an allocation fails, the requesting subsystem +may revert to its own fallback path. The slab kernel memory allocator is an +example of this. Or, the subsystem may choose to disable or not to enable +itself on allocation failure. The kernel profiling subsystem is an example of +this. + +If the architecture supports--does not hide--memoryless nodes, then CPUs +attached to memoryless nodes would always incur the fallback path overhead +or some subsystems would fail to initialize if they attempted to allocate +memory exclusively from a node without memory. To support such +architectures transparently, kernel subsystems can use the numa_mem_id() +or cpu_to_mem() function to locate the "local memory node" for the calling or +specified CPU. Again, this is the same node from which default, local page +allocations will be attempted. diff --git a/Documentation/vm/numa_memory_policy.rst b/Documentation/vm/numa_memory_policy.rst new file mode 100644 index 000000000000..8cd942ca114e --- /dev/null +++ b/Documentation/vm/numa_memory_policy.rst @@ -0,0 +1,485 @@ +.. _numa_memory_policy: + +=================== +Linux Memory Policy +=================== + +What is Linux Memory Policy? +============================ + +In the Linux kernel, "memory policy" determines from which node the kernel will +allocate memory in a NUMA system or in an emulated NUMA system. Linux has +supported platforms with Non-Uniform Memory Access architectures since 2.4.?. +The current memory policy support was added to Linux 2.6 around May 2004. This +document attempts to describe the concepts and APIs of the 2.6 memory policy +support. + +Memory policies should not be confused with cpusets +(``Documentation/cgroup-v1/cpusets.txt``), +which is an administrative mechanism for restricting the nodes from which +memory may be allocated by a set of processes. Memory policies are a +programming interface that a NUMA-aware application can take advantage of. When +both cpusets and policies are applied to a task, the restrictions of the cpuset +take priority. See :ref:`Memory Policies and cpusets <mem_pol_and_cpusets>` +below for more details. + +Memory Policy Concepts +====================== + +Scope of Memory Policies +------------------------ + +The Linux kernel supports *scopes* of memory policy, described here from +most general to most specific: + +System Default Policy + this policy is "hard coded" into the kernel. It is the policy + that governs all page allocations that aren't controlled by + one of the more specific policy scopes discussed below. When + the system is "up and running", the system default policy will + use "local allocation" described below. However, during boot + up, the system default policy will be set to interleave + allocations across all nodes with "sufficient" memory, so as + not to overload the initial boot node with boot-time + allocations. + +Task/Process Policy + this is an optional, per-task policy. When defined for a specific task, this policy controls all page allocations made by or on behalf of the task that aren't controlled by a more specific scope. If a task does not define a task policy, then all page allocations that would have been controlled by the task policy "fall back" to the System Default Policy.
+ + The task policy applies to the entire address space of a task. Thus, + it is inheritable, and indeed is inherited, across both fork() + [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task + to establish the task policy for a child task exec()'d from an + executable image that has no awareness of memory policy. See the + Memory Policy APIs section, below, for an overview of the system call + that a task may use to set/change its task/process policy. + + In a multi-threaded task, task policies apply only to the thread + [Linux kernel task] that installs the policy and any threads + subsequently created by that thread. Any sibling threads existing + at the time a new task policy is installed retain their current + policy. + + A task policy applies only to pages allocated after the policy is + installed. Any pages already faulted in by the task when the task + changes its task policy remain where they were allocated based on + the policy at the time they were allocated. + +.. _vma_policy: + +VMA Policy + A "VMA" or "Virtual Memory Area" refers to a range of a task's + virtual address space. A task may define a specific policy for a range + of its virtual address space. See the Memory Policy APIs section, + below, for an overview of the mbind() system call used to set a VMA + policy. + + A VMA policy will govern the allocation of pages that back + this region of the address space. Any regions of the task's + address space that don't have an explicit VMA policy will fall + back to the task policy, which may itself fall back to the + System Default Policy. + + VMA policies have a few complicating details: + + * VMA policy applies ONLY to anonymous pages. These include + pages allocated for anonymous segments, such as the task + stack and heap, and any regions of the address space + mmap()ed with the MAP_ANONYMOUS flag. If a VMA policy is + applied to a file mapping, it will be ignored if the mapping + used the MAP_SHARED flag. If the file mapping used the + MAP_PRIVATE flag, the VMA policy will only be applied when + an anonymous page is allocated on an attempt to write to the + mapping--i.e., at Copy-On-Write. + + * VMA policies are shared between all tasks that share a + virtual address space--a.k.a. threads--independent of when + the policy is installed; and they are inherited across + fork(). However, because VMA policies refer to a specific + region of a task's address space, and because the address + space is discarded and recreated on exec*(), VMA policies + are NOT inheritable across exec(). Thus, only NUMA-aware + applications may use VMA policies. + + * A task may install a new VMA policy on a sub-range of a + previously mmap()ed region. When this happens, Linux splits + the existing virtual memory area into 2 or 3 VMAs, each with + its own policy. + + * By default, VMA policy applies only to pages allocated after + the policy is installed. Any pages already faulted into the + VMA range remain where they were allocated based on the + policy at the time they were allocated. However, since + 2.6.16, Linux supports page migration via the mbind() system + call, so that page contents can be moved to match a newly + installed policy. + +Shared Policy + Conceptually, shared policies apply to "memory objects" mapped + shared into one or more tasks' distinct address spaces. An + application installs shared policies the same way as VMA + policies--using the mbind() system call specifying a range of + virtual addresses that map the shared object.
However, unlike + VMA policies, which can be considered to be an attribute of a + range of a task's address space, shared policies apply + directly to the shared object. Thus, all tasks that attach to + the object share the policy, and all pages allocated for the + shared object, by any task, will obey the shared policy. + + As of 2.6.22, only shared memory segments, created by shmget() or + mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared + policy support was added to Linux, the associated data structures were + added to hugetlbfs shmem segments. At the time, hugetlbfs did not + support allocation at fault time--a.k.a. lazy allocation--so hugetlbfs + shmem segments were never "hooked up" to the shared policy support. + Although hugetlbfs segments now support lazy allocation, their support + for shared policy has not been completed. + + As mentioned above in :ref:`VMA policies <vma_policy>`, + allocations of page cache pages for regular files mmap()ed + with MAP_SHARED ignore any VMA policy installed on the virtual + address range backed by the shared file mapping. Rather, + shared page cache pages, including pages backing private + mappings that have not yet been written by the task, follow + task policy, if any, else System Default Policy. + + The shared policy infrastructure supports different policies on subset + ranges of the shared object. However, Linux still splits the VMA of + the task that installs the policy for each range of distinct policy. + Thus, different tasks that attach to a shared memory segment can have + different VMA configurations mapping that one shared object. This + can be seen by examining the /proc/<pid>/numa_maps of tasks sharing + a shared memory region, when one task has installed shared policy on + one or more ranges of the region. + +Components of Memory Policies +----------------------------- + +A Linux memory policy consists of a "mode", optional mode flags, and +an optional set of nodes. The mode determines the behavior of the +policy, the optional mode flags determine the behavior of the mode, +and the optional set of nodes can be viewed as the arguments to the +policy behavior. + +Internally, memory policies are implemented by a reference counted +structure, struct mempolicy. Details of this structure will be +discussed in context, below, as required to explain the behavior. + +Linux memory policy supports the following 4 behavioral modes: + +Default Mode--MPOL_DEFAULT + This mode is only used in the memory policy APIs. Internally, + MPOL_DEFAULT is converted to the NULL memory policy in all + policy scopes. Any existing non-default policy will simply be + removed when MPOL_DEFAULT is specified. As a result, + MPOL_DEFAULT means "fall back to the next most specific policy + scope." + + For example, a NULL or default task policy will fall back to the + system default policy. A NULL or default vma policy will fall + back to the task policy. + + When specified in one of the memory policy APIs, the Default mode + does not use the optional set of nodes. + + It is an error for the set of nodes specified for this policy to + be non-empty. + +MPOL_BIND + This mode specifies that memory must come from the set of + nodes specified by the policy. Memory will be allocated from + the node in the set with sufficient free memory that is + closest to the node where the allocation takes place. + +MPOL_PREFERRED + This mode specifies that the allocation should be attempted + from the single node specified in the policy.
If that + allocation fails, the kernel will search other nodes, in order + of increasing distance from the preferred node based on + information provided by the platform firmware. + + Internally, the Preferred policy uses a single node--the + preferred_node member of struct mempolicy. When the internal + mode flag MPOL_F_LOCAL is set, the preferred_node is ignored + and the policy is interpreted as local allocation. "Local" + allocation policy can be viewed as a Preferred policy that + starts at the node containing the cpu where the allocation + takes place. + + It is possible for the user to specify that local allocation + is always preferred by passing an empty nodemask with this + mode. If an empty nodemask is passed, the policy cannot use + the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags + described below. + +MPOL_INTERLEAVE + This mode specifies that page allocations be interleaved, on a + page granularity, across the nodes specified in the policy. + This mode also behaves slightly differently, based on the + context where it is used: + + For allocation of anonymous pages and shared memory pages, + Interleave mode indexes the set of nodes specified by the + policy using the page offset of the faulting address into the + segment [VMA] containing the address modulo the number of + nodes specified by the policy. It then attempts to allocate a + page, starting at the selected node, as if the node had been + specified by a Preferred policy or had been selected by a + local allocation. That is, allocation will follow the per + node zonelist. + + For allocation of page cache pages, Interleave mode indexes + the set of nodes specified by the policy using a node counter + maintained per task. This counter wraps around to the lowest + specified node after it reaches the highest specified node. + This will tend to spread the pages out over the nodes + specified by the policy based on the order in which they are + allocated, rather than based on any page offset into an + address range or file. During system boot up, the temporary + interleaved system default policy works in this mode. + +Linux memory policy supports the following optional mode flags: + +MPOL_F_STATIC_NODES + This flag specifies that the nodemask passed by + the user should not be remapped if the task or VMA's set of allowed + nodes changes after the memory policy has been defined. + + Without this flag, anytime a mempolicy is rebound because of a + change in the set of allowed nodes, the node (Preferred) or + nodemask (Bind, Interleave) is remapped to the new set of + allowed nodes. This may result in nodes being used that were + previously undesired. + + With this flag, if the user-specified nodes overlap with the + nodes allowed by the task's cpuset, then the memory policy is + applied to their intersection. If the two sets of nodes do not + overlap, the Default policy is used. + + For example, consider a task that is attached to a cpuset with + mems 1-3 that sets an Interleave policy over the same set. If + the cpuset's mems change to 3-5, the Interleave will now occur + over nodes 3, 4, and 5. With this flag, however, since only node + 3 is allowed from the user's nodemask, the "interleave" only + occurs over that node. If no nodes from the user's nodemask are + now allowed, the Default behavior is used. + + MPOL_F_STATIC_NODES cannot be combined with the + MPOL_F_RELATIVE_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation).
+ +MPOL_F_RELATIVE_NODES + This flag specifies that the nodemask passed + by the user will be mapped relative to the task's or VMA's + set of allowed nodes. The kernel stores the user-passed nodemask, + and if the set of allowed nodes changes, then that original nodemask will + be remapped relative to the new set of allowed nodes. + + Without this flag (and without MPOL_F_STATIC_NODES), anytime a + mempolicy is rebound because of a change in the set of allowed + nodes, the node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. That remap may not + preserve the relative nature of the user's passed nodemask to its + set of allowed nodes upon successive rebinds: a nodemask of + 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of + allowed nodes is restored to its original state. + + With this flag, the remap is done so that the node numbers from + the user's passed nodemask are relative to the set of allowed + nodes. In other words, if nodes 0, 2, and 4 are set in the user's + nodemask, the policy will be effected over the first (and in the + Bind or Interleave case, the third and fifth) nodes in the set of + allowed nodes. The nodemask passed by the user represents nodes + relative to the task's or VMA's set of allowed nodes. + + If the user's nodemask includes nodes that are outside the range + of the new set of allowed nodes (for example, node 5 is set in + the user's nodemask when the set of allowed nodes is only 0-3), + then the remap wraps around to the beginning of the nodemask and, + if not already set, sets the node in the mempolicy nodemask. + + For example, consider a task that is attached to a cpuset with + mems 2-5 that sets an Interleave policy over the same set with + MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the + interleave now occurs over nodes 3,5-7. If the cpuset's mems + then change to 0,2-3,5, then the interleave occurs over nodes + 0,2-3,5. + + Thanks to the consistent remapping, applications preparing + nodemasks to specify memory policies using this flag should + disregard their current, actual cpuset-imposed memory placement + and prepare the nodemask as if they were always located on + memory nodes 0 to N-1, where N is the number of memory nodes the + policy is intended to manage. Let the kernel then remap to the + set of memory nodes allowed by the task's cpuset, as that may + change over time. + + MPOL_F_RELATIVE_NODES cannot be combined with the + MPOL_F_STATIC_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation). + +Memory Policy Reference Counting +================================ + +To resolve use/free races, struct mempolicy contains an atomic reference +count field. The internal interfaces, mpol_get() and mpol_put(), increment and +decrement this reference count, respectively. mpol_put() will only free +the structure back to the mempolicy kmem cache when the reference count +goes to zero. + +When a new memory policy is allocated, its reference count is initialized +to '1', representing the reference held by the task that is installing the +new policy. When a pointer to a memory policy structure is stored in another +structure, another reference is added, as the task's reference will be dropped +on completion of the policy installation. + +During run-time "usage" of the policy, we attempt to minimize atomic operations +on the reference count, as this can lead to cache lines bouncing between cpus +and NUMA nodes.
"Usage" here means one of the following: + +1) querying of the policy, either by the task itself [using the get_mempolicy() + API discussed below] or by another task using the /proc/<pid>/numa_maps + interface. + +2) examination of the policy to determine the policy mode and associated node + or node lists, if any, for page allocation. This is considered a "hot + path". Note that for MPOL_BIND, the "usage" extends across the entire + allocation process, which may sleep during page reclamation, because the + BIND policy nodemask is used, by reference, to filter ineligible nodes. + +We can avoid taking an extra reference during the usages listed above as +follows: + +1) we never need to get/free the system default policy as this is never + changed nor freed, once the system is up and running. + +2) for querying the policy, we do not need to take an extra reference on the + target task's task policy nor vma policies because we always acquire the + task's mm's mmap_sem for read during the query. The set_mempolicy() and + mbind() APIs [see below] always acquire the mmap_sem for write when + installing or replacing task or vma policies. Thus, there is no possibility + of a task or thread freeing a policy while another task or thread is + querying it. + +3) Page allocation usage of task or vma policy occurs in the fault path where + we hold the mmap_sem for read. Again, because replacing the task or vma + policy requires that the mmap_sem be held for write, the policy can't be + freed out from under us while we're using it for page allocation. + +4) Shared policies require special consideration. One task can replace a + shared memory policy while another task, with a distinct mmap_sem, is + querying or allocating a page based on the policy. To resolve this + potential race, the shared policy infrastructure adds an extra reference + to the shared policy during lookup while holding a spin lock on the shared + policy management structure. This requires that we drop this extra + reference when we're finished "using" the policy. We must drop the + extra reference on shared policies in the same query/allocation paths + used for non-shared policies. For this reason, shared policies are marked + as such, and the extra reference is dropped "conditionally"--i.e., only + for shared policies. + + Because of this extra reference counting, and because we must look up + shared policies in a tree structure under spinlock, shared policies are + more expensive to use in the page allocation path. This is especially + true for shared policies on shared memory regions shared by tasks running + on different NUMA nodes. This extra overhead can be avoided by always + falling back to task or system default policy for shared memory regions, + or by prefaulting the entire shared memory region into memory and locking + it down. However, this might not be appropriate for all applications. + +Memory Policy APIs +================== + +Linux supports 3 system calls for controlling memory policy. These APIs +always affect only the calling task, the calling task's address space, or +some shared object mapped into the calling task's address space. + +.. note:: + the headers that define these APIs and the parameter data types for + user space applications reside in a package that is not part of the + Linux kernel. The kernel system call interfaces, with the 'sys\_' + prefix, are defined in <linux/syscalls.h>; the mode and flag + definitions are defined in <linux/mempolicy.h>.
+ +Set [Task] Memory Policy:: + + long set_mempolicy(int mode, const unsigned long *nmask, + unsigned long maxnode); + +Sets the calling task's "task/process memory policy" to the mode +specified by the 'mode' argument and the set of nodes defined by +'nmask'. 'nmask' points to a bit mask of node ids containing at least +'maxnode' ids. Optional mode flags may be passed by combining the +'mode' argument with the flag (for example: MPOL_INTERLEAVE | +MPOL_F_STATIC_NODES). + +See the set_mempolicy(2) man page for more details. + + +Get [Task] Memory Policy or Related Information:: + + long get_mempolicy(int *mode, + const unsigned long *nmask, unsigned long maxnode, + void *addr, int flags); + +Queries the "task/process memory policy" of the calling task, or the +policy or location of a specified virtual address, depending on the +'flags' argument. + +See the get_mempolicy(2) man page for more details. + + +Install VMA/Shared Policy for a Range of Task's Address Space:: + + long mbind(void *start, unsigned long len, int mode, + const unsigned long *nmask, unsigned long maxnode, + unsigned flags); + +mbind() installs the policy specified by (mode, nmask, maxnode) as a +VMA policy for the range of the calling task's address space specified +by the 'start' and 'len' arguments. Additional actions may be +requested via the 'flags' argument. + +See the mbind(2) man page for more details. + +Memory Policy Command Line Interface +==================================== + +Although not strictly part of the Linux implementation of memory policy, +a command line tool, numactl(8), exists that allows one to: + ++ set the task policy for a specified program via set_mempolicy(2), fork(2) and + exec(2) + ++ set the shared policy for a shared memory segment via mbind(2) + +The numactl(8) tool is packaged with the run-time version of the library +containing the memory policy system call wrappers. Some distributions +package the headers and compile-time libraries in a separate development +package. + +.. _mem_pol_and_cpusets: + +Memory Policies and cpusets +=========================== + +Memory policies work within cpusets as described above. For memory policies +that require a node or set of nodes, the nodes are restricted to the set of +nodes whose memories are allowed by the cpuset constraints. If the nodemask +specified for the policy contains nodes that are not allowed by the cpuset and +MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes +specified for the policy and the set of nodes with memory is used. If the +result is the empty set, the policy is considered invalid and cannot be +installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped +onto and folded into the task's set of allowed nodes as previously described. + +The interaction of memory policies and cpusets can be problematic when tasks +in two cpusets share access to a memory region, such as shared memory segments +created by shmget() or mmap() with the MAP_ANONYMOUS and MAP_SHARED flags. If +any of the tasks install shared policy on the region, only nodes whose +memories are allowed in both cpusets may be used in the policies. Obtaining +this information requires "stepping outside" the memory policy APIs to use the +cpuset information and requires that one know in what cpusets other tasks might +be attaching to the shared region. Furthermore, if the cpusets' allowed +memory sets are disjoint, "local" allocation is the only valid policy.
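+
+Example
+=======
+
+To make the API usage concrete, here is a minimal user-space sketch that
+combines a task policy with a VMA policy. It is illustrative only: it
+assumes a machine with at least two memory nodes, and it uses the
+<numaif.h> header and wrappers from the numactl development package
+mentioned above (link with -lnuma)::
+
+	#include <numaif.h>
+	#include <sys/mman.h>
+	#include <stdio.h>
+
+	int main(void)
+	{
+		unsigned long nodes = 0x3;	/* nodes 0 and 1, assumed to exist */
+		unsigned long maxnode = 8 * sizeof(nodes);
+		size_t len = 1UL << 20;
+		void *p;
+
+		/* Task policy: interleave this task's allocations over nodes 0-1. */
+		if (set_mempolicy(MPOL_INTERLEAVE, &nodes, maxnode))
+			perror("set_mempolicy");
+
+		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (p == MAP_FAILED)
+			return 1;
+
+		/* VMA policy: bind this mapping to node 0 only. */
+		nodes = 0x1;
+		if (mbind(p, len, MPOL_BIND, &nodes, maxnode, 0))
+			perror("mbind");
+
+		return 0;
+	}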
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt deleted file mode 100644 index 8cd942ca114e..000000000000 --- a/Documentation/vm/numa_memory_policy.txt +++ /dev/null @@ -1,485 +0,0 @@ -.. _numa_memory_policy: - -=================== -Linux Memory Policy -=================== - -What is Linux Memory Policy? -============================ - -In the Linux kernel, "memory policy" determines from which node the kernel will -allocate memory in a NUMA system or in an emulated NUMA system. Linux has -supported platforms with Non-Uniform Memory Access architectures since 2.4.?. -The current memory policy support was added to Linux 2.6 around May 2004. This -document attempts to describe the concepts and APIs of the 2.6 memory policy -support. - -Memory policies should not be confused with cpusets -(``Documentation/cgroup-v1/cpusets.txt``) -which is an administrative mechanism for restricting the nodes from which -memory may be allocated by a set of processes. Memory policies are a -programming interface that a NUMA-aware application can take advantage of. When -both cpusets and policies are applied to a task, the restrictions of the cpuset -takes priority. See :ref:`Memory Policies and cpusets ` -below for more details. - -Memory Policy Concepts -====================== - -Scope of Memory Policies ------------------------- - -The Linux kernel supports _scopes_ of memory policy, described here from -most general to most specific: - -System Default Policy - this policy is "hard coded" into the kernel. It is the policy - that governs all page allocations that aren't controlled by - one of the more specific policy scopes discussed below. When - the system is "up and running", the system default policy will - use "local allocation" described below. However, during boot - up, the system default policy will be set to interleave - allocations across all nodes with "sufficient" memory, so as - not to overload the initial boot node with boot-time - allocations. - -Task/Process Policy - this is an optional, per-task policy. When defined for a specific task, this policy controls all page allocations made by or on behalf of the task that aren't controlled by a more specific scope. If a task does not define a task policy, then all page allocations that would have been controlled by the task policy "fall back" to the System Default Policy. - - The task policy applies to the entire address space of a task. Thus, - it is inheritable, and indeed is inherited, across both fork() - [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task - to establish the task policy for a child task exec()'d from an - executable image that has no awareness of memory policy. See the - MEMORY POLICY APIS section, below, for an overview of the system call - that a task may use to set/change its task/process policy. - - In a multi-threaded task, task policies apply only to the thread - [Linux kernel task] that installs the policy and any threads - subsequently created by that thread. Any sibling threads existing - at the time a new task policy is installed retain their current - policy. - - A task policy applies only to pages allocated after the policy is - installed. Any pages already faulted in by the task when the task - changes its task policy remain where they were allocated based on - the policy at the time they were allocated. - -.. _vma_policy: - -VMA Policy - A "VMA" or "Virtual Memory Area" refers to a range of a task's - virtual address space. 
A task may define a specific policy for a range - of its virtual address space. See the MEMORY POLICIES APIS section, - below, for an overview of the mbind() system call used to set a VMA - policy. - - A VMA policy will govern the allocation of pages that back - this region ofthe address space. Any regions of the task's - address space that don't have an explicit VMA policy will fall - back to the task policy, which may itself fall back to the - System Default Policy. - - VMA policies have a few complicating details: - - * VMA policy applies ONLY to anonymous pages. These include - pages allocated for anonymous segments, such as the task - stack and heap, and any regions of the address space - mmap()ed with the MAP_ANONYMOUS flag. If a VMA policy is - applied to a file mapping, it will be ignored if the mapping - used the MAP_SHARED flag. If the file mapping used the - MAP_PRIVATE flag, the VMA policy will only be applied when - an anonymous page is allocated on an attempt to write to the - mapping-- i.e., at Copy-On-Write. - - * VMA policies are shared between all tasks that share a - virtual address space--a.k.a. threads--independent of when - the policy is installed; and they are inherited across - fork(). However, because VMA policies refer to a specific - region of a task's address space, and because the address - space is discarded and recreated on exec*(), VMA policies - are NOT inheritable across exec(). Thus, only NUMA-aware - applications may use VMA policies. - - * A task may install a new VMA policy on a sub-range of a - previously mmap()ed region. When this happens, Linux splits - the existing virtual memory area into 2 or 3 VMAs, each with - it's own policy. - - * By default, VMA policy applies only to pages allocated after - the policy is installed. Any pages already faulted into the - VMA range remain where they were allocated based on the - policy at the time they were allocated. However, since - 2.6.16, Linux supports page migration via the mbind() system - call, so that page contents can be moved to match a newly - installed policy. - -Shared Policy - Conceptually, shared policies apply to "memory objects" mapped - shared into one or more tasks' distinct address spaces. An - application installs a shared policies the same way as VMA - policies--using the mbind() system call specifying a range of - virtual addresses that map the shared object. However, unlike - VMA policies, which can be considered to be an attribute of a - range of a task's address space, shared policies apply - directly to the shared object. Thus, all tasks that attach to - the object share the policy, and all pages allocated for the - shared object, by any task, will obey the shared policy. - - As of 2.6.22, only shared memory segments, created by shmget() or - mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared - policy support was added to Linux, the associated data structures were - added to hugetlbfs shmem segments. At the time, hugetlbfs did not - support allocation at fault time--a.k.a lazy allocation--so hugetlbfs - shmem segments were never "hooked up" to the shared policy support. - Although hugetlbfs segments now support lazy allocation, their support - for shared policy has not been completed. - - As mentioned above :ref:`VMA policies `, - allocations of page cache pages for regular files mmap()ed - with MAP_SHARED ignore any VMA policy installed on the virtual - address range backed by the shared file mapping. 
Rather, - shared page cache pages, including pages backing private - mappings that have not yet been written by the task, follow - task policy, if any, else System Default Policy. - - The shared policy infrastructure supports different policies on subset - ranges of the shared object. However, Linux still splits the VMA of - the task that installs the policy for each range of distinct policy. - Thus, different tasks that attach to a shared memory segment can have - different VMA configurations mapping that one shared object. This - can be seen by examining the /proc//numa_maps of tasks sharing - a shared memory region, when one task has installed shared policy on - one or more ranges of the region. - -Components of Memory Policies ------------------------------ - -A Linux memory policy consists of a "mode", optional mode flags, and -an optional set of nodes. The mode determines the behavior of the -policy, the optional mode flags determine the behavior of the mode, -and the optional set of nodes can be viewed as the arguments to the -policy behavior. - -Internally, memory policies are implemented by a reference counted -structure, struct mempolicy. Details of this structure will be -discussed in context, below, as required to explain the behavior. - -Linux memory policy supports the following 4 behavioral modes: - -Default Mode--MPOL_DEFAULT - This mode is only used in the memory policy APIs. Internally, - MPOL_DEFAULT is converted to the NULL memory policy in all - policy scopes. Any existing non-default policy will simply be - removed when MPOL_DEFAULT is specified. As a result, - MPOL_DEFAULT means "fall back to the next most specific policy - scope." - - For example, a NULL or default task policy will fall back to the - system default policy. A NULL or default vma policy will fall - back to the task policy. - - When specified in one of the memory policy APIs, the Default mode - does not use the optional set of nodes. - - It is an error for the set of nodes specified for this policy to - be non-empty. - -MPOL_BIND - This mode specifies that memory must come from the set of - nodes specified by the policy. Memory will be allocated from - the node in the set with sufficient free memory that is - closest to the node where the allocation takes place. - -MPOL_PREFERRED - This mode specifies that the allocation should be attempted - from the single node specified in the policy. If that - allocation fails, the kernel will search other nodes, in order - of increasing distance from the preferred node based on - information provided by the platform firmware. - - Internally, the Preferred policy uses a single node--the - preferred_node member of struct mempolicy. When the internal - mode flag MPOL_F_LOCAL is set, the preferred_node is ignored - and the policy is interpreted as local allocation. "Local" - allocation policy can be viewed as a Preferred policy that - starts at the node containing the cpu where the allocation - takes place. - - It is possible for the user to specify that local allocation - is always preferred by passing an empty nodemask with this - mode. If an empty nodemask is passed, the policy cannot use - the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags - described below. - -MPOL_INTERLEAVED - This mode specifies that page allocations be interleaved, on a - page granularity, across the nodes specified in the policy. 
- This mode also behaves slightly differently, based on the - context where it is used: - - For allocation of anonymous pages and shared memory pages, - Interleave mode indexes the set of nodes specified by the - policy using the page offset of the faulting address into the - segment [VMA] containing the address modulo the number of - nodes specified by the policy. It then attempts to allocate a - page, starting at the selected node, as if the node had been - specified by a Preferred policy or had been selected by a - local allocation. That is, allocation will follow the per - node zonelist. - - For allocation of page cache pages, Interleave mode indexes - the set of nodes specified by the policy using a node counter - maintained per task. This counter wraps around to the lowest - specified node after it reaches the highest specified node. - This will tend to spread the pages out over the nodes - specified by the policy based on the order in which they are - allocated, rather than based on any page offset into an - address range or file. During system boot up, the temporary - interleaved system default policy works in this mode. - -Linux memory policy supports the following optional mode flags: - -MPOL_F_STATIC_NODES - This flag specifies that the nodemask passed by - the user should not be remapped if the task or VMA's set of allowed - nodes changes after the memory policy has been defined. - - Without this flag, anytime a mempolicy is rebound because of a - change in the set of allowed nodes, the node (Preferred) or - nodemask (Bind, Interleave) is remapped to the new set of - allowed nodes. This may result in nodes being used that were - previously undesired. - - With this flag, if the user-specified nodes overlap with the - nodes allowed by the task's cpuset, then the memory policy is - applied to their intersection. If the two sets of nodes do not - overlap, the Default policy is used. - - For example, consider a task that is attached to a cpuset with - mems 1-3 that sets an Interleave policy over the same set. If - the cpuset's mems change to 3-5, the Interleave will now occur - over nodes 3, 4, and 5. With this flag, however, since only node - 3 is allowed from the user's nodemask, the "interleave" only - occurs over that node. If no nodes from the user's nodemask are - now allowed, the Default behavior is used. - - MPOL_F_STATIC_NODES cannot be combined with the - MPOL_F_RELATIVE_NODES flag. It also cannot be used for - MPOL_PREFERRED policies that were created with an empty nodemask - (local allocation). - -MPOL_F_RELATIVE_NODES - This flag specifies that the nodemask passed - by the user will be mapped relative to the set of the task or VMA's - set of allowed nodes. The kernel stores the user-passed nodemask, - and if the allowed nodes changes, then that original nodemask will - be remapped relative to the new set of allowed nodes. - - Without this flag (and without MPOL_F_STATIC_NODES), anytime a - mempolicy is rebound because of a change in the set of allowed - nodes, the node (Preferred) or nodemask (Bind, Interleave) is - remapped to the new set of allowed nodes. That remap may not - preserve the relative nature of the user's passed nodemask to its - set of allowed nodes upon successive rebinds: a nodemask of - 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of - allowed nodes is restored to its original state. - - With this flag, the remap is done so that the node numbers from - the user's passed nodemask are relative to the set of allowed - nodes. 
In other words, if nodes 0, 2, and 4 are set in the user's - nodemask, the policy will be effected over the first (and in the - Bind or Interleave case, the third and fifth) nodes in the set of - allowed nodes. The nodemask passed by the user represents nodes - relative to task or VMA's set of allowed nodes. - - If the user's nodemask includes nodes that are outside the range - of the new set of allowed nodes (for example, node 5 is set in - the user's nodemask when the set of allowed nodes is only 0-3), - then the remap wraps around to the beginning of the nodemask and, - if not already set, sets the node in the mempolicy nodemask. - - For example, consider a task that is attached to a cpuset with - mems 2-5 that sets an Interleave policy over the same set with - MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the - interleave now occurs over nodes 3,5-7. If the cpuset's mems - then change to 0,2-3,5, then the interleave occurs over nodes - 0,2-3,5. - - Thanks to the consistent remapping, applications preparing - nodemasks to specify memory policies using this flag should - disregard their current, actual cpuset imposed memory placement - and prepare the nodemask as if they were always located on - memory nodes 0 to N-1, where N is the number of memory nodes the - policy is intended to manage. Let the kernel then remap to the - set of memory nodes allowed by the task's cpuset, as that may - change over time. - - MPOL_F_RELATIVE_NODES cannot be combined with the - MPOL_F_STATIC_NODES flag. It also cannot be used for - MPOL_PREFERRED policies that were created with an empty nodemask - (local allocation). - -Memory Policy Reference Counting -================================ - -To resolve use/free races, struct mempolicy contains an atomic reference -count field. Internal interfaces, mpol_get()/mpol_put() increment and -decrement this reference count, respectively. mpol_put() will only free -the structure back to the mempolicy kmem cache when the reference count -goes to zero. - -When a new memory policy is allocated, its reference count is initialized -to '1', representing the reference held by the task that is installing the -new policy. When a pointer to a memory policy structure is stored in another -structure, another reference is added, as the task's reference will be dropped -on completion of the policy installation. - -During run-time "usage" of the policy, we attempt to minimize atomic operations -on the reference count, as this can lead to cache lines bouncing between cpus -and NUMA nodes. "Usage" here means one of the following: - -1) querying of the policy, either by the task itself [using the get_mempolicy() - API discussed below] or by another task using the /proc//numa_maps - interface. - -2) examination of the policy to determine the policy mode and associated node - or node lists, if any, for page allocation. This is considered a "hot - path". Note that for MPOL_BIND, the "usage" extends across the entire - allocation process, which may sleep during page reclaimation, because the - BIND policy nodemask is used, by reference, to filter ineligible nodes. - -We can avoid taking an extra reference during the usages listed above as -follows: - -1) we never need to get/free the system default policy as this is never - changed nor freed, once the system is up and running. 
- -2) for querying the policy, we do not need to take an extra reference on the - target task's task policy nor vma policies because we always acquire the - task's mm's mmap_sem for read during the query. The set_mempolicy() and - mbind() APIs [see below] always acquire the mmap_sem for write when - installing or replacing task or vma policies. Thus, there is no possibility - of a task or thread freeing a policy while another task or thread is - querying it. - -3) Page allocation usage of task or vma policy occurs in the fault path where - we hold them mmap_sem for read. Again, because replacing the task or vma - policy requires that the mmap_sem be held for write, the policy can't be - freed out from under us while we're using it for page allocation. - -4) Shared policies require special consideration. One task can replace a - shared memory policy while another task, with a distinct mmap_sem, is - querying or allocating a page based on the policy. To resolve this - potential race, the shared policy infrastructure adds an extra reference - to the shared policy during lookup while holding a spin lock on the shared - policy management structure. This requires that we drop this extra - reference when we're finished "using" the policy. We must drop the - extra reference on shared policies in the same query/allocation paths - used for non-shared policies. For this reason, shared policies are marked - as such, and the extra reference is dropped "conditionally"--i.e., only - for shared policies. - - Because of this extra reference counting, and because we must lookup - shared policies in a tree structure under spinlock, shared policies are - more expensive to use in the page allocation path. This is especially - true for shared policies on shared memory regions shared by tasks running - on different NUMA nodes. This extra overhead can be avoided by always - falling back to task or system default policy for shared memory regions, - or by prefaulting the entire shared memory region into memory and locking - it down. However, this might not be appropriate for all applications. - -Memory Policy APIs - -Linux supports 3 system calls for controlling memory policy. These APIS -always affect only the calling task, the calling task's address space, or -some shared object mapped into the calling task's address space. - -.. note:: - the headers that define these APIs and the parameter data types for - user space applications reside in a package that is not part of the - Linux kernel. The kernel system call interfaces, with the 'sys\_' - prefix, are defined in ; the mode and flag - definitions are defined in . - -Set [Task] Memory Policy:: - - long set_mempolicy(int mode, const unsigned long *nmask, - unsigned long maxnode); - -Set's the calling task's "task/process memory policy" to mode -specified by the 'mode' argument and the set of nodes defined by -'nmask'. 'nmask' points to a bit mask of node ids containing at least -'maxnode' ids. Optional mode flags may be passed by combining the -'mode' argument with the flag (for example: MPOL_INTERLEAVE | -MPOL_F_STATIC_NODES). - -See the set_mempolicy(2) man page for more details - - -Get [Task] Memory Policy or Related Information:: - - long get_mempolicy(int *mode, - const unsigned long *nmask, unsigned long maxnode, - void *addr, int flags); - -Queries the "task/process memory policy" of the calling task, or the -policy or location of a specified virtual address, depending on the -'flags' argument. 
- -See the get_mempolicy(2) man page for more details - - -Install VMA/Shared Policy for a Range of Task's Address Space:: - - long mbind(void *start, unsigned long len, int mode, - const unsigned long *nmask, unsigned long maxnode, - unsigned flags); - -mbind() installs the policy specified by (mode, nmask, maxnodes) as a -VMA policy for the range of the calling task's address space specified -by the 'start' and 'len' arguments. Additional actions may be -requested via the 'flags' argument. - -See the mbind(2) man page for more details. - -Memory Policy Command Line Interface -==================================== - -Although not strictly part of the Linux implementation of memory policy, -a command line tool, numactl(8), exists that allows one to: - -+ set the task policy for a specified program via set_mempolicy(2), fork(2) and - exec(2) - -+ set the shared policy for a shared memory segment via mbind(2) - -The numactl(8) tool is packaged with the run-time version of the library -containing the memory policy system call wrappers. Some distributions -package the headers and compile-time libraries in a separate development -package. - -.. _mem_pol_and_cpusets: - -Memory Policies and cpusets -=========================== - -Memory policies work within cpusets as described above. For memory policies -that require a node or set of nodes, the nodes are restricted to the set of -nodes whose memories are allowed by the cpuset constraints. If the nodemask -specified for the policy contains nodes that are not allowed by the cpuset and -MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes -specified for the policy and the set of nodes with memory is used. If the -result is the empty set, the policy is considered invalid and cannot be -installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped -onto and folded into the task's set of allowed nodes as previously described. - -The interaction of memory policies and cpusets can be problematic when tasks -in two cpusets share access to a memory region, such as shared memory segments -created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and -any of the tasks install shared policy on the region, only nodes whose -memories are allowed in both cpusets may be used in the policies. Obtaining -this information requires "stepping outside" the memory policy APIs to use the -cpuset information and requires that one know in what cpusets other task might -be attaching to the shared region. Furthermore, if the cpusets' allowed -memory sets are disjoint, "local" allocation is the only valid policy. diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting deleted file mode 100644 index 0dd54bbe4afa..000000000000 --- a/Documentation/vm/overcommit-accounting +++ /dev/null @@ -1,87 +0,0 @@ -.. _overcommit_accounting: - -===================== -Overcommit Accounting -===================== - -The Linux kernel supports the following overcommit handling modes - -0 - Heuristic overcommit handling. Obvious overcommits of address - space are refused. Used for a typical system. It ensures a - seriously wild allocation fails while allowing overcommit to - reduce swap usage. root is allowed to allocate slightly more - memory in this mode. This is the default. - -1 - Always overcommit. Appropriate for some scientific - applications. Classic example is code using sparse arrays and - just relying on the virtual memory consisting almost entirely - of zero pages. - -2 - Don't overcommit. 
The total address space commit for the - system is not permitted to exceed swap + a configurable amount - (default is 50%) of physical RAM. Depending on the amount you - use, in most situations this means a process will not be - killed while accessing pages but will receive errors on memory - allocation as appropriate. - - Useful for applications that want to guarantee their memory - allocations will be available in the future without having to - initialize every page. - -The overcommit policy is set via the sysctl ``vm.overcommit_memory``. - -The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) -or ``vm.overcommit_kbytes`` (absolute value). - -The current overcommit limit and amount committed are viewable in -``/proc/meminfo`` as CommitLimit and Committed_AS respectively. - -Gotchas -======= - -The C language stack growth does an implicit mremap. If you want absolute -guarantees and run close to the edge you MUST mmap your stack for the -largest size you think you will need. For typical stack usage this does -not matter much but it's a corner case if you really really care - -In mode 2 the MAP_NORESERVE flag is ignored. - - -How It Works -============ - -The overcommit is based on the following rules - -For a file backed map - | SHARED or READ-only - 0 cost (the file is the map not swap) - | PRIVATE WRITABLE - size of mapping per instance - -For an anonymous or ``/dev/zero`` map - | SHARED - size of mapping - | PRIVATE READ-only - 0 cost (but of little use) - | PRIVATE WRITABLE - size of mapping per instance - -Additional accounting - | Pages made writable copies by mmap - | shmfs memory drawn from the same pool - -Status -====== - -* We account mmap memory mappings -* We account mprotect changes in commit -* We account mremap changes in size -* We account brk -* We account munmap -* We report the commit status in /proc -* Account and check on fork -* Review stack handling/building on exec -* SHMfs accounting -* Implement actual limit enforcement - -To Do -===== -* Account ptrace pages (this is hard) diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst new file mode 100644 index 000000000000..0dd54bbe4afa --- /dev/null +++ b/Documentation/vm/overcommit-accounting.rst @@ -0,0 +1,87 @@ +.. _overcommit_accounting: + +===================== +Overcommit Accounting +===================== + +The Linux kernel supports the following overcommit handling modes + +0 + Heuristic overcommit handling. Obvious overcommits of address + space are refused. Used for a typical system. It ensures a + seriously wild allocation fails while allowing overcommit to + reduce swap usage. root is allowed to allocate slightly more + memory in this mode. This is the default. + +1 + Always overcommit. Appropriate for some scientific + applications. Classic example is code using sparse arrays and + just relying on the virtual memory consisting almost entirely + of zero pages. + +2 + Don't overcommit. The total address space commit for the + system is not permitted to exceed swap + a configurable amount + (default is 50%) of physical RAM. Depending on the amount you + use, in most situations this means a process will not be + killed while accessing pages but will receive errors on memory + allocation as appropriate. + + Useful for applications that want to guarantee their memory + allocations will be available in the future without having to + initialize every page. + +The overcommit policy is set via the sysctl ``vm.overcommit_memory``. 
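+
+For example, a test harness that wants deterministic commit-limit behavior
+might switch to mode 2 before running. A minimal sketch (writing these
+files requires root; the ratio knob it touches is described just below)::
+
+	#include <stdio.h>
+
+	/* Write a value to a sysctl's /proc/sys file; returns 0 on success. */
+	static int write_sysctl(const char *path, const char *val)
+	{
+		FILE *f = fopen(path, "w");
+
+		if (!f)
+			return -1;
+		fputs(val, f);
+		return fclose(f);
+	}
+
+	int main(void)
+	{
+		/* Mode 2: don't overcommit; limit = swap + ratio% of RAM. */
+		write_sysctl("/proc/sys/vm/overcommit_memory", "2");
+		write_sysctl("/proc/sys/vm/overcommit_ratio", "75");
+		return 0;
+	}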
+ +The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) +or ``vm.overcommit_kbytes`` (absolute value). + +The current overcommit limit and amount committed are viewable in +``/proc/meminfo`` as CommitLimit and Committed_AS respectively. + +Gotchas +======= + +The C language stack growth does an implicit mremap. If you want absolute +guarantees and run close to the edge you MUST mmap your stack for the +largest size you think you will need. For typical stack usage this does +not matter much but it's a corner case if you really really care + +In mode 2 the MAP_NORESERVE flag is ignored. + + +How It Works +============ + +The overcommit is based on the following rules + +For a file backed map + | SHARED or READ-only - 0 cost (the file is the map not swap) + | PRIVATE WRITABLE - size of mapping per instance + +For an anonymous or ``/dev/zero`` map + | SHARED - size of mapping + | PRIVATE READ-only - 0 cost (but of little use) + | PRIVATE WRITABLE - size of mapping per instance + +Additional accounting + | Pages made writable copies by mmap + | shmfs memory drawn from the same pool + +Status +====== + +* We account mmap memory mappings +* We account mprotect changes in commit +* We account mremap changes in size +* We account brk +* We account munmap +* We report the commit status in /proc +* Account and check on fork +* Review stack handling/building on exec +* SHMfs accounting +* Implement actual limit enforcement + +To Do +===== +* Account ptrace pages (this is hard) diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags deleted file mode 100644 index 637cc49d1b2f..000000000000 --- a/Documentation/vm/page_frags +++ /dev/null @@ -1,45 +0,0 @@ -.. _page_frags: - -============== -Page fragments -============== - -A page fragment is an arbitrary-length arbitrary-offset area of memory -which resides within a 0 or higher order compound page. Multiple -fragments within that page are individually refcounted, in the page's -reference counter. - -The page_frag functions, page_frag_alloc and page_frag_free, provide a -simple allocation framework for page fragments. This is used by the -network stack and network device drivers to provide a backing region of -memory for use as either an sk_buff->head, or to be used in the "frags" -portion of skb_shared_info. - -In order to make use of the page fragment APIs a backing page fragment -cache is needed. This provides a central point for the fragment allocation -and tracks allows multiple calls to make use of a cached page. The -advantage to doing this is that multiple calls to get_page can be avoided -which can be expensive at allocation time. However due to the nature of -this caching it is required that any calls to the cache be protected by -either a per-cpu limitation, or a per-cpu limitation and forcing interrupts -to be disabled when executing the fragment allocation. - -The network stack uses two separate caches per CPU to handle fragment -allocation. The netdev_alloc_cache is used by callers making use of the -__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is -used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The -main difference between these two calls is the context in which they may be -called. The "netdev" prefixed functions are usable in any context as these -functions will disable interrupts, while the "napi" prefixed functions are -only usable within the softirq context. 
- -Many network device drivers use a similar methodology for allocating page -fragments, but the page fragments are cached at the ring or descriptor -level. In order to enable these cases it is necessary to provide a generic -way of tearing down a page cache. For this reason __page_frag_cache_drain -was implemented. It allows for freeing multiple references from a single -page via a single call. The advantage to doing this is that it allows for -cleaning up the multiple references that were added to a page in order to -avoid calling get_page per allocation. - -Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst new file mode 100644 index 000000000000..637cc49d1b2f --- /dev/null +++ b/Documentation/vm/page_frags.rst @@ -0,0 +1,45 @@ +.. _page_frags: + +============== +Page fragments +============== + +A page fragment is an arbitrary-length arbitrary-offset area of memory +which resides within a 0 or higher order compound page. Multiple +fragments within that page are individually refcounted, in the page's +reference counter. + +The page_frag functions, page_frag_alloc and page_frag_free, provide a +simple allocation framework for page fragments. This is used by the +network stack and network device drivers to provide a backing region of +memory for use as either an sk_buff->head, or to be used in the "frags" +portion of skb_shared_info. + +In order to make use of the page fragment APIs a backing page fragment +cache is needed. This provides a central point for the fragment allocation +and tracks allows multiple calls to make use of a cached page. The +advantage to doing this is that multiple calls to get_page can be avoided +which can be expensive at allocation time. However due to the nature of +this caching it is required that any calls to the cache be protected by +either a per-cpu limitation, or a per-cpu limitation and forcing interrupts +to be disabled when executing the fragment allocation. + +The network stack uses two separate caches per CPU to handle fragment +allocation. The netdev_alloc_cache is used by callers making use of the +__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is +used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +main difference between these two calls is the context in which they may be +called. The "netdev" prefixed functions are usable in any context as these +functions will disable interrupts, while the "napi" prefixed functions are +only usable within the softirq context. + +Many network device drivers use a similar methodology for allocating page +fragments, but the page fragments are cached at the ring or descriptor +level. In order to enable these cases it is necessary to provide a generic +way of tearing down a page cache. For this reason __page_frag_cache_drain +was implemented. It allows for freeing multiple references from a single +page via a single call. The advantage to doing this is that it allows for +cleaning up the multiple references that were added to a page in order to +avoid calling get_page per allocation. + +Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration deleted file mode 100644 index 07b67a821a12..000000000000 --- a/Documentation/vm/page_migration +++ /dev/null @@ -1,257 +0,0 @@ -.. 
_page_migration: - -============== -Page migration -============== - -Page migration allows the moving of the physical location of pages between -nodes in a numa system while the process is running. This means that the -virtual addresses that the process sees do not change. However, the -system rearranges the physical location of those pages. - -The main intend of page migration is to reduce the latency of memory access -by moving pages near to the processor where the process accessing that memory -is running. - -Page migration allows a process to manually relocate the node on which its -pages are located through the MF_MOVE and MF_MOVE_ALL options while setting -a new memory policy via mbind(). The pages of process can also be relocated -from another process using the sys_migrate_pages() function call. The -migrate_pages function call takes two sets of nodes and moves pages of a -process that are located on the from nodes to the destination nodes. -Page migration functions are provided by the numactl package by Andi Kleen -(a version later than 0.9.3 is required. Get it from -ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma -which provides an interface similar to other numa functionality for page -migration. cat ``/proc//numa_maps`` allows an easy review of where the -pages of a process are located. See also the numa_maps documentation in the -proc(5) man page. - -Manual migration is useful if for example the scheduler has relocated -a process to a processor on a distant node. A batch scheduler or an -administrator may detect the situation and move the pages of the process -nearer to the new processor. The kernel itself does only provide -manual page migration support. Automatic page migration may be implemented -through user space processes that move pages. A special function call -"move_pages" allows the moving of individual pages within a process. -A NUMA profiler may f.e. obtain a log showing frequent off node -accesses and may use the result to move pages to more advantageous -locations. - -Larger installations usually partition the system using cpusets into -sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See -Documentation/cgroup-v1/cpusets.txt). -Cpusets allows the automation of process locality. If a task is moved to -a new cpuset then also all its pages are moved with it so that the -performance of the process does not sink dramatically. Also the pages -of processes in a cpuset are moved if the allowed memory nodes of a -cpuset are changed. - -Page migration allows the preservation of the relative location of pages -within a group of nodes for all migration techniques which will preserve a -particular memory allocation pattern generated even after migrating a -process. This is necessary in order to preserve the memory latencies. -Processes will run with similar performance after migration. - -Page migration occurs in several steps. First a high level -description for those trying to use migrate_pages() from the kernel -(for userspace usage see the Andi Kleen's numactl package mentioned above) -and then a low level description of how the low level details work. - -In kernel use of migrate_pages() -================================ - -1. Remove pages from the LRU. - - Lists of pages to be migrated are generated by scanning over - pages and moving them into lists. This is done by - calling isolate_lru_page(). 
- Calling isolate_lru_page increases the references to the page - so that it cannot vanish while the page migration occurs. - It also prevents the swapper or other scans to encounter - the page. - -2. We need to have a function of type new_page_t that can be - passed to migrate_pages(). This function should figure out - how to allocate the correct new page given the old page. - -3. The migrate_pages() function is called which attempts - to do the migration. It will call the function to allocate - the new page for each page that is considered for - moving. - -How migrate_pages() works -========================= - -migrate_pages() does several passes over its list of pages. A page is moved -if all references to a page are removable at the time. The page has -already been removed from the LRU via isolate_lru_page() and the refcount -is increased so that the page cannot be freed while page migration occurs. - -Steps: - -1. Lock the page to be migrated - -2. Insure that writeback is complete. - -3. Lock the new page that we want to move to. It is locked so that accesses to - this (not yet uptodate) page immediately lock while the move is in progress. - -4. All the page table references to the page are converted to migration - entries. This decreases the mapcount of a page. If the resulting - mapcount is not zero then we do not migrate the page. All user space - processes that attempt to access the page will now wait on the page lock. - -5. The radix tree lock is taken. This will cause all processes trying - to access the page via the mapping to block on the radix tree spinlock. - -6. The refcount of the page is examined and we back out if references remain - otherwise we know that we are the only one referencing this page. - -7. The radix tree is checked and if it does not contain the pointer to this - page then we back out because someone else modified the radix tree. - -8. The new page is prepped with some settings from the old page so that - accesses to the new page will discover a page with the correct settings. - -9. The radix tree is changed to point to the new page. - -10. The reference count of the old page is dropped because the radix tree - reference is gone. A reference to the new page is established because - the new page is referenced to by the radix tree. - -11. The radix tree lock is dropped. With that lookups in the mapping - become possible again. Processes will move from spinning on the tree_lock - to sleeping on the locked new page. - -12. The page contents are copied to the new page. - -13. The remaining page flags are copied to the new page. - -14. The old page flags are cleared to indicate that the page does - not provide any information anymore. - -15. Queued up writeback on the new page is triggered. - -16. If migration entries were page then replace them with real ptes. Doing - so will enable access for user space processes not already waiting for - the page lock. - -19. The page locks are dropped from the old and new page. - Processes waiting on the page lock will redo their page faults - and will reach the new page. - -20. The new page is moved to the LRU and can be scanned by the swapper - etc again. - -Non-LRU page migration -====================== - -Although original migration aimed for reducing the latency of memory access -for NUMA, compaction who want to create high-order page is also main customer. - -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. 
However, there are potential non-lru pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add own specific hooks in migration path. - -To overclome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -migration path. - -If a driver want to make own pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects on isolate_page function of driver is to return *true* - if driver isolates page successfully. On returing true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in that fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage of driver with isolated page. - The function of migratepage is to move content of the old page to new page - and set up fields of struct page newpage. Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returns - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporal migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without retrying - in this time. - - Driver shouldn't touch page.lru field VM using in the functions. - -3. ``void (*putback_page)(struct page *);`` - - If migration fails on isolated page, VM should return the isolated page - to the driver so VM calls driver's putback_page with migration failed page. - In this function, driver should put the isolated page back to the own data - structure. - -4. non-lru movable page flags - - There are two page flags for supporting non-lru movable page. - - * PG_movable - - Driver should use the below function to make page movable under page_lock:: - - void __SetPageMovable(struct page *page, struct address_space *mapping) - - It needs argument of address_space for registering migration - family functions which will be called by VM. Exactly speaking, - PG_movable is not a real flag of struct page. Rather than, VM - reuses page->mapping's lower bits to represent it. - -:: - #define PAGE_MAPPING_MOVABLE 0x2 - page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; - - so driver shouldn't access page->mapping directly. Instead, driver should - use page_mapping which mask off the low two bits of page->mapping under - page lock so it can get right struct address_space. - - For testing of non-lru movable page, VM supports __PageMovable function. - However, it doesn't guarantee to identify non-lru movable page because - page->mapping field is unified with other variables in struct page. 
-   As well, if driver releases the page after isolation by VM, page->mapping
-   doesn't have stable value although it has PAGE_MAPPING_MOVABLE
-   (Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
-   page is LRU or non-lru movable once the page has been isolated. Because
-   LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
-   good for just peeking to test non-lru movable pages before more expensive
-   checking with lock_page in pfn scanning to select victim.
-
-   For guaranteeing non-lru movable page, VM provides PageMovable function.
-   Unlike __PageMovable, PageMovable functions validates page->mapping and
-   mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
-   destroying of page->mapping.
-
-   Driver using __SetPageMovable should clear the flag via __ClearMovablePage
-   under page_lock before the releasing the page.
-
-   * PG_isolated
-
-   To prevent concurrent isolation among several CPUs, VM marks isolated page
-   as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
-   movable page, it can skip it. Driver doesn't need to manipulate the flag
-   because VM will set/clear it automatically. Keep in mind that if driver
-   sees PG_isolated page, it means the page have been isolated by VM so it
-   shouldn't touch page.lru field.
-   PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
-   for own purpose.
-
-Christoph Lameter, May 8, 2006.
-Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
new file mode 100644
index 000000000000..07b67a821a12
--- /dev/null
+++ b/Documentation/vm/page_migration.rst
@@ -0,0 +1,257 @@
+.. _page_migration:
+
+==============
+Page migration
+==============
+
+Page migration allows the moving of the physical location of pages between
+nodes in a NUMA system while the process is running. This means that the
+virtual addresses that the process sees do not change. However, the
+system rearranges the physical location of those pages.
+
+The main intent of page migration is to reduce the latency of memory access
+by moving pages near to the processor where the process accessing that memory
+is running.
+
+Page migration allows a process to manually relocate the node on which its
+pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
+a new memory policy via mbind(). The pages of a process can also be relocated
+from another process using the sys_migrate_pages() function call. The
+migrate_pages function call takes two sets of nodes and moves pages of a
+process that are located on the source nodes to the destination nodes.
+Page migration functions are provided by the numactl package by Andi Kleen
+(a version later than 0.9.3 is required. Get it from
+ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma,
+which provides an interface similar to other NUMA functionality for page
+migration. cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
+pages of a process are located. See also the numa_maps documentation in the
+proc(5) man page.
+
+Manual migration is useful if, for example, the scheduler has relocated
+a process to a processor on a distant node. A batch scheduler or an
+administrator may detect the situation and move the pages of the process
+nearer to the new processor. The kernel itself only provides
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+A NUMA profiler may, for example, obtain a log showing frequent off-node
+accesses and may use the result to move pages to more advantageous
+locations.
+
+Larger installations usually partition the system using cpusets into
+sections of nodes. Paul Jackson has equipped cpusets with the ability to
+move pages when a task is moved to another cpuset (see
+Documentation/cgroup-v1/cpusets.txt).
+Cpusets allow the automation of process locality. If a task is moved to
+a new cpuset then all its pages are moved with it so that the
+performance of the process does not drop dramatically. The pages
+of processes in a cpuset are also moved if the allowed memory nodes of a
+cpuset are changed.
+
+Page migration allows the preservation of the relative location of pages
+within a group of nodes for all migration techniques, which preserves a
+particular memory allocation pattern even after migrating a
+process. This is necessary in order to preserve the memory latencies.
+Processes will run with similar performance after migration.
+
+Page migration occurs in several steps. First comes a high level
+description for those trying to use migrate_pages() from the kernel
+(for userspace usage see Andi Kleen's numactl package mentioned above),
+followed by a low level description of how the details work.
+
+In kernel use of migrate_pages()
+================================
+
+1. Remove pages from the LRU.
+
+   Lists of pages to be migrated are generated by scanning over
+   pages and moving them into lists. This is done by
+   calling isolate_lru_page().
+   Calling isolate_lru_page increases the references to the page
+   so that it cannot vanish while the page migration occurs.
+   It also prevents the swapper or other scans from encountering
+   the page.
+
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
+
+3. The migrate_pages() function is called which attempts
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
+
+How migrate_pages() works
+=========================
+
+migrate_pages() does several passes over its list of pages. A page is moved
+if all references to a page are removable at the time. The page has
+already been removed from the LRU via isolate_lru_page() and the refcount
+is increased so that the page cannot be freed while page migration occurs.
+
+Steps:
+
+1. Lock the page to be migrated.
+
+2. Ensure that writeback is complete.
+
+3. Lock the new page that we want to move to. It is locked so that accesses to
+   this (not yet uptodate) page immediately block while the move is in progress.
+
+4. All the page table references to the page are converted to migration
+   entries. This decreases the mapcount of a page. If the resulting
+   mapcount is not zero then we do not migrate the page. All user space
+   processes that attempt to access the page will now wait on the page lock.
+
+5. The radix tree lock is taken. This will cause all processes trying
+   to access the page via the mapping to block on the radix tree spinlock.
+
+6. The refcount of the page is examined and we back out if references remain;
+   otherwise, we know that we are the only one referencing this page.
+
+7. The radix tree is checked and if it does not contain the pointer to this
+   page then we back out because someone else modified the radix tree.
+
+8. The new page is prepped with some settings from the old page so that
+   accesses to the new page will discover a page with the correct settings.
+
+9. The radix tree is changed to point to the new page.
+
+10. The reference count of the old page is dropped because the radix tree
+    reference is gone. A reference to the new page is established because
+    the new page is referenced by the radix tree.
+
+11. The radix tree lock is dropped. With that, lookups in the mapping
+    become possible again. Processes will move from spinning on the tree_lock
+    to sleeping on the locked new page.
+
+12. The page contents are copied to the new page.
+
+13. The remaining page flags are copied to the new page.
+
+14. The old page flags are cleared to indicate that the page does
+    not provide any information anymore.
+
+15. Queued up writeback on the new page is triggered.
+
+16. If migration entries were inserted then replace them with real ptes. Doing
+    so will enable access for user space processes not already waiting for
+    the page lock.
+
+17. The page locks are dropped from the old and new page.
+    Processes waiting on the page lock will redo their page faults
+    and will reach the new page.
+
+18. The new page is moved to the LRU and can be scanned by the swapper
+    etc again.
+
+Non-LRU page migration
+======================
+
+Although migration originally aimed at reducing the latency of memory access
+for NUMA, compaction, which wants to create high-order pages, is also a main
+customer.
+
+The current problem with the implementation is that it is designed to migrate
+only *LRU* pages. However, there are potential non-lru pages which can be
+migrated in drivers, for example, zsmalloc and virtio-balloon pages.
+
+For virtio-balloon pages, some parts of the migration code path have been
+hooked up and virtio-balloon specific functions added to intercept the
+migration logic. This is too specific to one driver, so other drivers that
+want to make their pages movable would have to add their own specific hooks
+in the migration path.
+
+To overcome the problem, VM supports non-LRU page migration, which provides
+generic functions for non-LRU movable pages without driver specific hooks
+in the migration path.
+
+If a driver wants to make its own pages movable, it should define three
+functions, which are function pointers of struct address_space_operations.
+
+1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
+
+   What VM expects of the driver's isolate_page function is to return *true*
+   if the driver isolates the page successfully. On returning true, VM marks
+   the page as PG_isolated so concurrent isolation on several CPUs skips the
+   page. If a driver cannot isolate the page, it should return *false*.
+
+   Once the page is successfully isolated, VM uses the page.lru fields, so the
+   driver shouldn't expect to preserve values in those fields.
+
+2. ``int (*migratepage) (struct address_space *mapping,``
+|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
+
+   After isolation, VM calls the driver's migratepage with the isolated page.
+   The function of migratepage is to move the content of the old page to the
+   new page and set up the fields of struct page newpage. Keep in mind that
+   you should indicate to the VM that the oldpage is no longer movable via
+   __ClearPageMovable() under page_lock if you migrated the oldpage
+   successfully and return MIGRATEPAGE_SUCCESS. If the driver cannot migrate
+   the page at the moment, it can return -EAGAIN. On -EAGAIN, VM will retry
+   page migration after a short time because VM interprets -EAGAIN as
+   "temporary migration failure". On returning any error except -EAGAIN, VM
+   will give up on migrating the page without retrying.
+
+   The driver shouldn't touch the page.lru field that VM is using in these
+   functions.
+
+3. ``void (*putback_page)(struct page *);``
+
+   If migration fails on the isolated page, VM should return the isolated
+   page to the driver, so VM calls the driver's putback_page with the page
+   whose migration failed. In this function, the driver should put the
+   isolated page back into its own data structure.
+
+4. non-lru movable page flags
+
+   There are two page flags for supporting non-lru movable pages.
+
+   * PG_movable
+
+   The driver should use the function below to make a page movable under
+   page_lock::
+
+	void __SetPageMovable(struct page *page, struct address_space *mapping)
+
+   It needs the address_space argument for registering the migration
+   family of functions, which will be called by VM. Strictly speaking,
+   PG_movable is not a real flag of struct page. Rather, VM
+   reuses the lower bits of page->mapping to represent it::
+
+	#define PAGE_MAPPING_MOVABLE 0x2
+	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
+
+   so the driver shouldn't access page->mapping directly. Instead, the driver
+   should use page_mapping, which masks off the low two bits of page->mapping
+   under the page lock to get the right struct address_space.
+
+   For testing a non-lru movable page, VM supports the __PageMovable function.
+   However, it doesn't guarantee identifying a non-lru movable page because
+   the page->mapping field is unified with other variables in struct page.
+   As well, if the driver releases the page after isolation by VM,
+   page->mapping doesn't have a stable value although it has
+   PAGE_MAPPING_MOVABLE (look at __ClearPageMovable). But __PageMovable is a
+   cheap way to tell whether a page is LRU or non-lru movable once the page
+   has been isolated, because LRU pages can never have PAGE_MAPPING_MOVABLE
+   in page->mapping. It is also good for just peeking to test non-lru movable
+   pages before the more expensive checking with lock_page in pfn scanning to
+   select a victim.
+
+   For a guaranteed test of a non-lru movable page, VM provides the
+   PageMovable function. Unlike __PageMovable, PageMovable validates
+   page->mapping and mapping->a_ops->isolate_page under lock_page. The
+   lock_page prevents sudden destruction of page->mapping.
+
+   A driver using __SetPageMovable should clear the flag via
+   __ClearPageMovable under page_lock before releasing the page.
+
+   * PG_isolated
+
+   To prevent concurrent isolation among several CPUs, VM marks an isolated
+   page as PG_isolated under lock_page. So if a CPU encounters a PG_isolated
+   non-lru movable page, it can skip it. The driver doesn't need to manipulate
+   the flag because VM will set/clear it automatically. Keep in mind that if
+   the driver sees a PG_isolated page, it means the page has been isolated by
+   VM, so it shouldn't touch the page.lru field.
+   PG_isolated is an alias of the PG_reclaim flag, so the driver shouldn't use
+   the flag for its own purposes.
+
+Christoph Lameter, May 8, 2006.
+Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
new file mode 100644
index 000000000000..0ed5ab8c7ab4
--- /dev/null
+++ b/Documentation/vm/page_owner.rst
@@ -0,0 +1,90 @@
+.. _page_owner:
+
+==================================================
+page owner: Tracking about who allocated each page
+==================================================
+
+Introduction
+============
+
+page owner is for tracking who allocated each page.
+It can be used to debug memory leaks or to find a memory hog.
+When an allocation happens, information about the allocation, such as the
+call stack and the order of the pages, is stored in dedicated storage for
+each page.
+When we need to know the status of all pages, we can get and analyze
+this information.
+
+Although we already have tracepoints for tracing page allocation/free,
+using them to analyze who allocated each page is rather complex. We need
+to enlarge the trace buffer to prevent it from overflowing before a
+userspace program is launched, and the launched program would continually
+dump out the trace buffer for later analysis, which is more likely to
+change system behaviour than just keeping the information in memory, so it
+is bad for debugging.
+
+page owner can also be used for various other purposes. For example,
+accurate fragmentation statistics can be obtained through the gfp flag
+information of each page. This is already implemented and activated if
+page owner is enabled. Other usages are more than welcome.
+
+page owner is disabled by default. So, if you'd like to use it, you need
+to add "page_owner=on" to your boot cmdline. If the kernel is built
+with page owner and page owner is disabled at runtime because the boot
+option was not given, the runtime overhead is marginal. If disabled at
+runtime, it doesn't require memory to store owner information, so there is
+no runtime memory overhead. And, page owner inserts just two unlikely
+branches into the page allocator hotpath; if not enabled, allocation is
+done just as in a kernel without page owner. These two unlikely branches
+should not affect allocation performance, especially if the static keys
+jump label patching functionality is available. Following is the kernel's
+code size change due to this facility.
+
+- Without page owner::
+
+   text    data     bss     dec     hex filename
+  40662    1493     644   42799    a72f mm/page_alloc.o
+
+- With page owner::
+
+   text    data     bss     dec     hex filename
+  40892    1493     644   43029    a815 mm/page_alloc.o
+   1427      24       8    1459     5b3 mm/page_ext.o
+   2722      50       0    2772     ad4 mm/page_owner.o
+
+Although roughly 4 KB of code is added in total, page_alloc.o increases by
+230 bytes and only half of it is in the hotpath. Building the kernel with
+page owner and turning it on if needed would be a great option for
+debugging kernel memory problems.
+
+There is one caveat caused by an implementation detail. page owner
+stores information into the memory from struct page extension. This memory
+is initialized some time later than the page allocator starts on sparse
+memory systems, so, until initialization, many pages can be allocated and
+they would have no owner information. To fix this up, these early allocated
+pages are investigated and marked as allocated in the initialization phase.
+Although it doesn't mean that they have the right owner information,
+at least, we can tell whether the page is allocated or not,
+more accurately. On a 2GB memory x86-64 VM box, 13343 early allocated pages
+are caught and marked, although they are mostly allocated from the struct
+page extension feature. Anyway, after that, no page is left in an
+untracked state.
+
+Usage
+=====
+
+1) Build user-space helper::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) Enable page owner: add "page_owner=on" to boot cmdline.
+ +3) Do the job what you want to debug + +4) Analyze information from page owner:: + + cat /sys/kernel/debug/page_owner > page_owner_full.txt + grep -v ^PFN page_owner_full.txt > page_owner.txt + ./page_owner_sort page_owner.txt sorted_page_owner.txt + + See the result about who allocated each page + in the ``sorted_page_owner.txt``. diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt deleted file mode 100644 index 0ed5ab8c7ab4..000000000000 --- a/Documentation/vm/page_owner.txt +++ /dev/null @@ -1,90 +0,0 @@ -.. _page_owner: - -================================================== -page owner: Tracking about who allocated each page -================================================== - -Introduction -============ - -page owner is for the tracking about who allocated each page. -It can be used to debug memory leak or to find a memory hogger. -When allocation happens, information about allocation such as call stack -and order of pages is stored into certain storage for each page. -When we need to know about status of all pages, we can get and analyze -this information. - -Although we already have tracepoint for tracing page allocation/free, -using it for analyzing who allocate each page is rather complex. We need -to enlarge the trace buffer for preventing overlapping until userspace -program launched. And, launched program continually dump out the trace -buffer for later analysis and it would change system behviour with more -possibility rather than just keeping it in memory, so bad for debugging. - -page owner can also be used for various purposes. For example, accurate -fragmentation statistics can be obtained through gfp flag information of -each page. It is already implemented and activated if page owner is -enabled. Other usages are more than welcome. - -page owner is disabled in default. So, if you'd like to use it, you need -to add "page_owner=on" into your boot cmdline. If the kernel is built -with page owner and page owner is disabled in runtime due to no enabling -boot option, runtime overhead is marginal. If disabled in runtime, it -doesn't require memory to store owner information, so there is no runtime -memory overhead. And, page owner inserts just two unlikely branches into -the page allocator hotpath and if not enabled, then allocation is done -like as the kernel without page owner. These two unlikely branches should -not affect to allocation performance, especially if the static keys jump -label patching functionality is available. Following is the kernel's code -size change due to this facility. - -- Without page owner:: - - text data bss dec hex filename - 40662 1493 644 42799 a72f mm/page_alloc.o - -- With page owner:: - - text data bss dec hex filename - 40892 1493 644 43029 a815 mm/page_alloc.o - 1427 24 8 1459 5b3 mm/page_ext.o - 2722 50 0 2772 ad4 mm/page_owner.o - -Although, roughly, 4 KB code is added in total, page_alloc.o increase by -230 bytes and only half of it is in hotpath. Building the kernel with -page owner and turning it on if needed would be great option to debug -kernel memory problem. - -There is one notice that is caused by implementation detail. page owner -stores information into the memory from struct page extension. This memory -is initialized some time later than that page allocator starts in sparse -memory system, so, until initialization, many pages can be allocated and -they would have no owner information. To fix it up, these early allocated -pages are investigated and marked as allocated in initialization phase. 
-Although it doesn't mean that they have the right owner information,
-at least, we can tell whether the page is allocated or not,
-more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages
-are catched and marked, although they are mostly allocated from struct
-page extension feature. Anyway, after that, no page is left in
-un-tracking state.
-
-Usage
-=====
-
-1) Build user-space helper::
-
-	cd tools/vm
-	make page_owner_sort
-
-2) Enable page owner: add "page_owner=on" to boot cmdline.
-
-3) Do the job what you want to debug
-
-4) Analyze information from page owner::
-
-	cat /sys/kernel/debug/page_owner > page_owner_full.txt
-	grep -v ^PFN page_owner_full.txt > page_owner.txt
-	./page_owner_sort page_owner.txt sorted_page_owner.txt
-
-   See the result about who allocated each page
-   in the ``sorted_page_owner.txt``.
diff --git a/Documentation/vm/pagemap.rst b/Documentation/vm/pagemap.rst
new file mode 100644
index 000000000000..d54b4bfd3043
--- /dev/null
+++ b/Documentation/vm/pagemap.rst
@@ -0,0 +1,197 @@
+.. _pagemap:
+
+======================================
+pagemap from the Userspace Perspective
+======================================
+
+pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
+userspace programs to examine the page tables and related information by
+reading files in ``/proc``.
+
+There are four components to pagemap:
+
+ * ``/proc/pid/pagemap``. This file lets a userspace process find out which
+   physical frame each virtual page is mapped to. It contains one 64-bit
+   value for each virtual page, containing the following data (from
+   fs/proc/task_mmu.c, above pagemap_read):
+
+    * Bits 0-54  page frame number (PFN) if present
+    * Bits 0-4   swap type if swapped
+    * Bits 5-54  swap offset if swapped
+    * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.rst)
+    * Bit  56    page exclusively mapped (since 4.2)
+    * Bits 57-60 zero
+    * Bit  61    page is file-page or shared-anon (since 3.5)
+    * Bit  62    page swapped
+    * Bit  63    page present
+
+   Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
+   In 4.0 and 4.1 opens by unprivileged users fail with -EPERM. Starting from
+   4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
+   Reason: information about PFNs helps in exploiting the Rowhammer
+   vulnerability.
+
+   If the page is not present but in swap, then the PFN contains an
+   encoding of the swap file number and the page's offset into the
+   swap. Unmapped pages return a null PFN. This allows determining
+   precisely which pages are mapped (or in swap) and comparing mapped
+   pages between processes.
+
+   Efficient users of this interface will use /proc/pid/maps to
+   determine which areas of memory are actually mapped and llseek to
+   skip over unmapped regions.
+
+ * ``/proc/kpagecount``. This file contains a 64-bit count of the number of
+   times each page is mapped, indexed by PFN.
+
+ * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
+   page, indexed by PFN.
+
+   The flags are (from ``fs/proc/page.c``, above kpageflags_read):
+
+    0. LOCKED
+    1. ERROR
+    2. REFERENCED
+    3. UPTODATE
+    4. DIRTY
+    5. LRU
+    6. ACTIVE
+    7. SLAB
+    8. WRITEBACK
+    9. RECLAIM
+    10. BUDDY
+    11. MMAP
+    12. ANON
+    13. SWAPCACHE
+    14. SWAPBACKED
+    15. COMPOUND_HEAD
+    16. COMPOUND_TAIL
+    17. HUGE
+    18. UNEVICTABLE
+    19. HWPOISON
+    20. NOPAGE
+    21. KSM
+    22. THP
+    23. BALLOON
+    24. ZERO_PAGE
+    25. IDLE
+
+ * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
+
+Short descriptions of the page flags
+====================================
+
+0 - LOCKED
+   page is being locked for exclusive access, eg. by undergoing read/write IO
+7 - SLAB
+   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
+   When compound page is used, SLUB/SLQB will only set this flag on the head
+   page; SLOB will not flag it at all.
+10 - BUDDY
+   a free memory block managed by the buddy system allocator
+   The buddy system organizes free memory in blocks of various orders.
+   An order N block has 2^N physically contiguous pages, with the BUDDY flag
+   set for and _only_ for the first page.
+15 - COMPOUND_HEAD
+   A compound page with order N consists of 2^N physically contiguous pages.
+   A compound page with order 2 takes the form of "HTTT", where H denotes its
+   head page and T denotes its tail page(s). The major consumers of compound
+   pages are hugeTLB pages (Documentation/vm/hugetlbpage.rst), the SLUB etc.
+   memory allocators and various device drivers. However in this interface,
+   only huge/giga pages are made visible to end users.
+16 - COMPOUND_TAIL
+   A compound page tail (see description above).
+17 - HUGE
+   this is an integral part of a HugeTLB page
+19 - HWPOISON
+   hardware detected memory corruption on this page: don't touch the data!
+20 - NOPAGE
+   no page frame exists at the requested address
+21 - KSM
+   identical memory pages dynamically shared between one or more processes
+22 - THP
+   contiguous pages which construct transparent hugepages
+23 - BALLOON
+   balloon compaction page
+24 - ZERO_PAGE
+   zero page for pfn_zero or huge_zero page
+25 - IDLE
+   page has not been accessed since it was marked idle (see
+   Documentation/vm/idle_page_tracking.rst). Note that this flag may be
+   stale in case the page was accessed via a PTE. To make sure the flag
+   is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first.
+
+IO related page flags
+---------------------
+
+1 - ERROR
+   IO error occurred
+3 - UPTODATE
+   page has up-to-date data
+   ie. for file backed page: (in-memory data revision >= on-disk one)
+4 - DIRTY
+   page has been written to, hence contains new data
+   ie. for file backed page: (in-memory data revision > on-disk one)
+8 - WRITEBACK
+   page is being synced to disk
+
+LRU related page flags
+----------------------
+
+5 - LRU
+   page is in one of the LRU lists
+6 - ACTIVE
+   page is in the active LRU list
+18 - UNEVICTABLE
+   page is in the unevictable (non-)LRU list. It is somehow pinned and
+   not a candidate for LRU page reclaims, eg. ramfs pages,
+   shmctl(SHM_LOCK) and mlock() memory segments
+2 - REFERENCED
+   page has been referenced since last LRU list enqueue/requeue
+9 - RECLAIM
+   page will be reclaimed soon after its pageout IO completed
+11 - MMAP
+   a memory mapped page
+12 - ANON
+   a memory mapped page that is not part of a file
+13 - SWAPCACHE
+   page is mapped to swap space, ie. has an associated swap entry
+14 - SWAPBACKED
+   page is backed by swap/RAM
+
+The page-types tool in the tools/vm directory can be used to query the
+above flags.
+
+Using pagemap to do something useful
+====================================
+
+The general procedure for using pagemap to find out about a process' memory
+usage goes like this:
+
+ 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are
+    mapped to what.
+ 2. Select the maps you are interested in -- all of them, or a particular
+    library, or the stack or the heap, etc.
+ 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine.
+ 4. Read a u64 for each page from pagemap.
+ 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``. For each PFN you
+    just read, seek to that entry in the file, and read the data you want.
+
+For example, to find the "unique set size" (USS), which is the amount of
+memory that a process is using that is not shared with any other process,
+you can go through every map in the process, find the PFNs, look those up
+in kpagecount, and tally up the number of pages that are only referenced
+once.
+
+Other notes
+===========
+
+Reading from any of the files will return -EINVAL if you are not starting
+the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
+into the file), or if the size of the read is not a multiple of 8 bytes.
+
+Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
+always 12 on most architectures). Since Linux 3.11 their meaning changes
+after first clear of soft-dirty bits. Since Linux 4.2 they are used for
+flags unconditionally.
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
deleted file mode 100644
index bd6d71740c88..000000000000
--- a/Documentation/vm/pagemap.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-.. _pagemap:
-
-======================================
-pagemap from the Userspace Perspective
-======================================
-
-pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
-userspace programs to examine the page tables and related information by
-reading files in ``/proc``.
-
-There are four components to pagemap:
-
- * ``/proc/pid/pagemap``. This file lets a userspace process find out which
-   physical frame each virtual page is mapped to. It contains one 64-bit
-   value for each virtual page, containing the following data (from
-   fs/proc/task_mmu.c, above pagemap_read):
-
-    * Bits 0-54  page frame number (PFN) if present
-    * Bits 0-4   swap type if swapped
-    * Bits 5-54  swap offset if swapped
-    * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
-    * Bit  56    page exclusively mapped (since 4.2)
-    * Bits 57-60 zero
-    * Bit  61    page is file-page or shared-anon (since 3.5)
-    * Bit  62    page swapped
-    * Bit  63    page present
-
-   Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
-   In 4.0 and 4.1 opens by unprivileged fail with -EPERM. Starting from
-   4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
-   Reason: information about PFNs helps in exploiting Rowhammer vulnerability.
-
-   If the page is not present but in swap, then the PFN contains an
-   encoding of the swap file number and the page's offset into the
-   swap. Unmapped pages return a null PFN. This allows determining
-   precisely which pages are mapped (or in swap) and comparing mapped
-   pages between processes.
-
-   Efficient users of this interface will use /proc/pid/maps to
-   determine which areas of memory are actually mapped and llseek to
-   skip over unmapped regions.
-
- * ``/proc/kpagecount``. This file contains a 64-bit count of the number of
-   times each page is mapped, indexed by PFN.
-
- * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
-   page, indexed by PFN.
-
-   The flags are (from ``fs/proc/page.c``, above kpageflags_read):
-
-    0. LOCKED
-    1. ERROR
-    2. REFERENCED
-    3. UPTODATE
-    4. DIRTY
-    5. LRU
-    6. ACTIVE
-    7. SLAB
-    8. WRITEBACK
-    9. RECLAIM
-    10. BUDDY
-    11. MMAP
-    12. ANON
-    13. SWAPCACHE
-    14. SWAPBACKED
-    15. COMPOUND_HEAD
-    16. COMPOUND_TAIL
-    17. HUGE
-    18. UNEVICTABLE
-    19. HWPOISON
-    20. NOPAGE
-    21. KSM
-    22. THP
-    23. BALLOON
-    24. ZERO_PAGE
-    25. IDLE
-
- * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
-   memory cgroup each page is charged to, indexed by PFN. Only available when
-   CONFIG_MEMCG is set.
-
-Short descriptions to the page flags:
-=====================================
-
-0 - LOCKED
-   page is being locked for exclusive access, eg. by undergoing read/write IO
-7 - SLAB
-   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
-   When compound page is used, SLUB/SLQB will only set this flag on the head
-   page; SLOB will not flag it at all.
-10 - BUDDY
-   a free memory block managed by the buddy system allocator
-   The buddy system organizes free memory in blocks of various orders.
-   An order N block has 2^N physically contiguous pages, with the BUDDY flag
-   set for and _only_ for the first page.
-15 - COMPOUND_HEAD
-   A compound page with order N consists of 2^N physically contiguous pages.
-   A compound page with order 2 takes the form of "HTTT", where H donates its
-   head page and T donates its tail page(s). The major consumers of compound
-   pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc.
-   memory allocators and various device drivers. However in this interface,
-   only huge/giga pages are made visible to end users.
-16 - COMPOUND_TAIL
-   A compound page tail (see description above).
-17 - HUGE
-   this is an integral part of a HugeTLB page
-19 - HWPOISON
-   hardware detected memory corruption on this page: don't touch the data!
-20 - NOPAGE
-   no page frame exists at the requested address
-21 - KSM
-   identical memory pages dynamically shared between one or more processes
-22 - THP
-   contiguous pages which construct transparent hugepages
-23 - BALLOON
-   balloon compaction page
-24 - ZERO_PAGE
-   zero page for pfn_zero or huge_zero page
-25 - IDLE
-   page has not been accessed since it was marked idle (see
-   Documentation/vm/idle_page_tracking.txt). Note that this flag may be
-   stale in case the page was accessed via a PTE. To make sure the flag
-   is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first.
-
-IO related page flags
----------------------
-
-1 - ERROR
-   IO error occurred
-3 - UPTODATE
-   page has up-to-date data
-   ie. for file backed page: (in-memory data revision >= on-disk one)
-4 - DIRTY
-   page has been written to, hence contains new data
-   ie. for file backed page: (in-memory data revision > on-disk one)
-8 - WRITEBACK
-   page is being synced to disk
-
-LRU related page flags
-----------------------
-
-5 - LRU
-   page is in one of the LRU lists
-6 - ACTIVE
-   page is in the active LRU list
-18 - UNEVICTABLE
-   page is in the unevictable (non-)LRU list It is somehow pinned and
-   not a candidate for LRU page reclaims, eg. ramfs pages,
-   shmctl(SHM_LOCK) and mlock() memory segments
-2 - REFERENCED
-   page has been referenced since last LRU list enqueue/requeue
-9 - RECLAIM
-   page will be reclaimed soon after its pageout IO completed
-11 - MMAP
-   a memory mapped page
-12 - ANON
-   a memory mapped page that is not part of a file
-13 - SWAPCACHE
-   page is mapped to swap space, ie. has an associated swap entry
-14 - SWAPBACKED
-   page is backed by swap/RAM
-
-The page-types tool in the tools/vm directory can be used to query the
-above flags.
-
-Using pagemap to do something useful
-====================================
-
-The general procedure for using pagemap to find out about a process' memory
-usage goes like this:
-
- 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are
-    mapped to what.
- 2. Select the maps you are interested in -- all of them, or a particular
-    library, or the stack or the heap, etc.
- 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine.
- 4. Read a u64 for each page from pagemap.
- 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``. For each PFN you
-    just read, seek to that entry in the file, and read the data you want.
-
-For example, to find the "unique set size" (USS), which is the amount of
-memory that a process is using that is not shared with any other process,
-you can go through every map in the process, find the PFNs, look those up
-in kpagecount, and tally up the number of pages that are only referenced
-once.
-
-Other notes
-===========
-
-Reading from any of the files will return -EINVAL if you are not starting
-the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
-into the file), or if the size of the read is not a multiple of 8 bytes.
-
-Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
-always 12 at most architectures). Since Linux 3.11 their meaning changes
-after first clear of soft-dirty bits. Since Linux 4.2 they are used for
-flags unconditionally.
diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/vm/remap_file_pages.rst
new file mode 100644
index 000000000000..7bef6718e3a9
--- /dev/null
+++ b/Documentation/vm/remap_file_pages.rst
@@ -0,0 +1,33 @@
+.. _remap_file_pages:
+
+==============================
+remap_file_pages() system call
+==============================
+
+The remap_file_pages() system call is used to create a nonlinear mapping,
+that is, a mapping in which the pages of the file are mapped into a
+nonsequential order in memory. The advantage of using remap_file_pages()
+over using repeated calls to mmap(2) is that the former approach does not
+require the kernel to create additional VMA (Virtual Memory Area) data
+structures.
+
+Supporting nonlinear mappings requires a significant amount of non-trivial
+code in the kernel virtual memory subsystem, including hot paths. Also, to
+make nonlinear mappings work, the kernel needs a way to distinguish normal
+page table entries from entries with a file offset (pte_file). The kernel
+reserves a flag in the PTE for this purpose. PTE flags are a scarce
+resource, especially on some CPU architectures. It would be nice to free up
+the flag for other usage.
+
+Fortunately, there are not many users of remap_file_pages() in the wild.
+It's only known that one enterprise RDBMS implementation uses the syscall
+on 32-bit systems to map files bigger than can linearly fit into 32-bit
+virtual address space. This use-case is not critical anymore since 64-bit
+systems are widely available.
+
+The syscall is deprecated and has been replaced with an emulation now. The
+emulation creates new VMAs instead of nonlinear mappings. It's going to
+work slower for rare users of remap_file_pages() but the ABI is preserved.
+
+One side effect of the emulation (apart from performance) is that the user
+can hit the vm.max_map_count limit more easily due to the additional VMAs.
+See the comment for DEFAULT_MAX_MAP_COUNT for more details on the limit.
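+
+As an illustration of what the emulation does, the sketch below builds a
+small "nonlinear" view of a file with plain mmap(2) calls. It is only a
+hedged example (the file name is hypothetical); each MAP_FIXED mapping
+creates its own VMA, which is exactly the per-mapping cost discussed
+above::
+
+	/* Map the first two pages of a file in reversed order. */
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <sys/mman.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		long psz = sysconf(_SC_PAGESIZE);
+		int fd = open("datafile", O_RDONLY);
+		char *base;
+
+		if (fd < 0)
+			return 1;
+		/* Reserve a two-page window backed by the file. */
+		base = mmap(NULL, 2 * psz, PROT_READ, MAP_PRIVATE, fd, 0);
+		if (base == MAP_FAILED)
+			return 1;
+		/* File page 1 first, then file page 0: two extra VMAs. */
+		mmap(base, psz, PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, psz);
+		mmap(base + psz, psz, PROT_READ, MAP_PRIVATE | MAP_FIXED,
+		     fd, 0);
+		printf("reversed file view at %p\n", base);
+		return 0;
+	}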
diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt deleted file mode 100644 index 7bef6718e3a9..000000000000 --- a/Documentation/vm/remap_file_pages.txt +++ /dev/null @@ -1,33 +0,0 @@ -.. _remap_file_pages: - -============================== -remap_file_pages() system call -============================== - -The remap_file_pages() system call is used to create a nonlinear mapping, -that is, a mapping in which the pages of the file are mapped into a -nonsequential order in memory. The advantage of using remap_file_pages() -over using repeated calls to mmap(2) is that the former approach does not -require the kernel to create additional VMA (Virtual Memory Area) data -structures. - -Supporting of nonlinear mapping requires significant amount of non-trivial -code in kernel virtual memory subsystem including hot paths. Also to get -nonlinear mapping work kernel need a way to distinguish normal page table -entries from entries with file offset (pte_file). Kernel reserves flag in -PTE for this purpose. PTE flags are scarce resource especially on some CPU -architectures. It would be nice to free up the flag for other usage. - -Fortunately, there are not many users of remap_file_pages() in the wild. -It's only known that one enterprise RDBMS implementation uses the syscall -on 32-bit systems to map files bigger than can linearly fit into 32-bit -virtual address space. This use-case is not critical anymore since 64-bit -systems are widely available. - -The syscall is deprecated and replaced it with an emulation now. The -emulation creates new VMAs instead of nonlinear mappings. It's going to -work slower for rare users of remap_file_pages() but ABI is preserved. - -One side effect of emulation (apart from performance) is that user can hit -vm.max_map_count limit more easily due to additional VMAs. See comment for -DEFAULT_MAX_MAP_COUNT for more details on the limit. diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst new file mode 100644 index 000000000000..3a775fd64e2d --- /dev/null +++ b/Documentation/vm/slub.rst @@ -0,0 +1,361 @@ +.. _slub: + +========================== +Short users guide for SLUB +========================== + +The basic philosophy of SLUB is very different from SLAB. SLAB +requires rebuilding the kernel to activate debug options for all +slab caches. SLUB always includes full debugging but it is off by default. +SLUB can enable debugging only for selected slabs in order to avoid +an impact on overall system performance which may make a bug more +difficult to find. + +In order to switch debugging on one can add an option ``slub_debug`` +to the kernel command line. That will enable full debugging for +all slabs. + +Typically one would then use the ``slabinfo`` command to get statistical +data and perform operation on the slabs. By default ``slabinfo`` only lists +slabs that have data in them. See "slabinfo -h" for more options when +running the command. ``slabinfo`` can be compiled with +:: + + gcc -o slabinfo tools/vm/slabinfo.c + +Some of the modes of operation of ``slabinfo`` require that slub debugging +be enabled on the command line. F.e. no tracking information will be +available without debugging on and validation can only partially +be performed if debugging was not switched on. + +Some more sophisticated uses of slub_debug: +------------------------------------------- + +Parameters may be given to ``slub_debug``. If none is specified then full +debugging is enabled. 
+Format:
+
+slub_debug=<Debug-Options>
+	Enable options for all slabs
+slub_debug=<Debug-Options>,<slab name>
+	Enable options only for select slabs
+
+
+Possible debug options are::
+
+	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
+			Sorry SLAB legacy issues)
+	Z		Red zoning
+	P		Poisoning (object and padding)
+	U		User tracking (free and alloc)
+	T		Trace (please only use on single slabs)
+	A		Toggle failslab filter mark for the cache
+	O		Switch debugging off for caches that would have
+			caused higher minimum slab orders
+	-		Switch all debugging off (useful if the kernel is
+			configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify::
+
+	slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try::
+
+	slub_debug=,dentry
+
+to only enable debugging on the dentry cache.
+
+Red zoning and tracking may realign the slab. We can just apply sanity checks
+to the dentry cache with::
+
+	slub_debug=F,dentry
+
+Debugging options may require the minimum possible slab order to increase as
+a result of storing the metadata (for example, caches with PAGE_SIZE object
+sizes). This has a higher likelihood of resulting in slab allocation errors
+in low memory situations or if there's high fragmentation of memory. To
+switch off debugging for such caches by default, use::
+
+	slub_debug=O
+
+In case you forgot to enable debugging on the kernel command line: It is
+possible to enable debugging manually when the kernel is up. Look at the
+contents of::
+
+	/sys/kernel/slab/<slab name>/
+
+Look at the writable files. Writing 1 to them will enable the
+corresponding debug option (see the sketch below). All options can be set
+on a slab that does not contain objects. If the slab already contains
+objects then sanity checks and tracing may only be enabled. The other
+options may cause the realignment of objects.
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
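+A hedged C sketch of flipping such an option at runtime; the cache and
+attribute names used here (``dentry``, ``sanity_checks``) are illustrative,
+pick real ones from the listing of /sys/kernel/slab/::
+
+	#include <stdio.h>
+
+	/* Equivalent of: echo 1 > /sys/kernel/slab/<slab name>/<option> */
+	static int slub_enable_option(const char *slab, const char *opt)
+	{
+		char path[256];
+		FILE *f;
+
+		snprintf(path, sizeof(path), "/sys/kernel/slab/%s/%s", slab, opt);
+		f = fopen(path, "w");
+		if (!f)
+			return -1;	/* unknown cache/option or no permission */
+		fputs("1\n", f);
+		return fclose(f);
+	}
+
+	int main(void)
+	{
+		/* Sanity checks can be enabled even on a populated cache. */
+		return slub_enable_option("dentry", "sanity_checks") ? 1 : 0;
+	}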
+Slab merging
+============
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+``slabinfo -a`` displays which slabs were merged together.
+
+Slab validation
+===============
+
+SLUB can validate all objects if the kernel was booted with slub_debug. In
+order to do so you must have the ``slabinfo`` tool. Then you can do
+::
+
+	slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if boot was without slab debug.
+In that case ``slabinfo -v`` simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non-debug situation.
+
+Getting more performance
+========================
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+.. slub_min_objects=x		(default 4)
+.. slub_min_order=x		(default 0)
+.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
+
+``slub_min_objects``
+	allows you to specify how many objects must at least fit into one
+	slab in order for the allocation order to be acceptable. In
+	general slub will be able to perform this number of
+	allocations on a slab without consulting centralized resources
+	(list_lock) where contention may occur.
+
+``slub_min_order``
+	specifies a minimum order of slabs. It has a similar effect to
+	``slub_min_objects``.
+
+``slub_max_order``
+	specifies the order at which ``slub_min_objects`` should no
+	longer be checked. This is useful to avoid SLUB trying to
+	generate super large order pages to fit ``slub_min_objects``
+	of a slab cache with large object sizes into one high order
+	page. Setting the command line parameter
+	``debug_guardpage_minorder=N`` (N > 0) forces ``slub_max_order``
+	to 0, which causes slabs to be allocated at the minimum
+	possible order.
+
+SLUB Debug output
+=================
+
+Here is a sample of slub debug output::
+
+ ====================================================================
+ BUG kmalloc-8: Redzone overwritten
+ --------------------------------------------------------------------
+
+ INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+ INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+ INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+ INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+ Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+ Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005
+ Redzone 0xc90f6d28: 00 cc cc cc .
+ Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
+
+ [] dump_trace+0x63/0x1eb
+ [] show_trace_log_lvl+0x1a/0x2f
+ [] show_trace+0x12/0x14
+ [] dump_stack+0x16/0x18
+ [] object_err+0x143/0x14b
+ [] check_object+0x66/0x234
+ [] __slab_free+0x239/0x384
+ [] kfree+0xa6/0xc6
+ [] get_modalias+0xb9/0xf5
+ [] dmi_dev_uevent+0x27/0x3c
+ [] dev_uevent+0x1ad/0x1da
+ [] kobject_uevent_env+0x20a/0x45b
+ [] kobject_uevent+0xa/0xf
+ [] store_uevent+0x4f/0x58
+ [] dev_attr_store+0x29/0x2f
+ [] sysfs_write_file+0x16e/0x19c
+ [] vfs_write+0xd1/0x15a
+ [] sys_write+0x3d/0x72
+ [] sysenter_past_esp+0x5f/0x99
+ [] 0xb7f7b410
+ =======================
+
+ FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+   This will be a message in the system log starting with::
+
+     ===============================================
+     BUG <slab cache affected>: <What went wrong>
+     -----------------------------------------------
+
+     INFO: <corruption start>-<corruption_end> <more info>
+     INFO: Slab <address> <slab information>
+     INFO: Object <address> <object information>
+     INFO: Allocated in <kernel function> age=<jiffies> cpu=<cpu> pid=<pid>
+     INFO: Freed in <kernel function> age=<jiffies> cpu=<cpu>
+	pid=<pid>
+
+   (Object allocation / free information is only available if SLAB_STORE_USER is
+   set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+   Various types of lines can follow the BUG SLUB line:
+
+   Bytes b4 <address> : <bytes>
+	Shows a few bytes before the object where the problem was detected.
+	Can be useful if the corruption does not stop with the start of the
+	object.
+
+   Object <address> : <bytes>
+	The bytes of the object. If the object is inactive then the bytes
+	typically contain poison values. Any non-poison value shows a
+	corruption by a write after free.
+
+   Redzone <address> : <bytes>
+	The Redzone following the object. The Redzone is used to detect
+	writes after the object. All bytes should always have the same
+	value. If there is any deviation then it is due to a write after
+	the object boundary.
+
+	(Redzone information is only available if SLAB_RED_ZONE is set.
+	slub_debug sets that option)
+
+   Padding <address> : <unused data>
+	Unused data to fill up the space in order to get the next object
+	properly aligned. In the debug case we make sure that there are
+	at least 4 bytes of padding. This allows the detection of writes
+	before the object.
+
+3. A stackdump
+
+   The stackdump describes the location where the error was detected. The cause
+   of the corruption is more likely to be found by looking at the function that
+   allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+   operation of the system.
+
+   These are messages in the system log beginning with::
+
+     FIX <slab cache affected>: <corrective action taken>
+
+   In the above sample SLUB found that the Redzone of an active object has
+   been overwritten. Here a string of 8 characters was written into a slab
+   object that has a length of 8 characters. However, an 8 character string
+   needs a terminating 0. That zero has overwritten the first byte of the
+   Redzone field. After reporting the details of the issue encountered the
+   FIX SLUB message tells us that SLUB has restored the Redzone to its proper
+   value and then system operations continue.
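+The sample above is the classic off-by-one. A hedged kernel-style fragment
+(the function is made up; the string mirrors the report) of the kind of bug
+that produces exactly this output::
+
+	#include <linux/slab.h>
+	#include <linux/string.h>
+
+	static void buggy(void)
+	{
+		/* 8-byte object, but "1019.005" needs 9 bytes with its NUL. */
+		char *buf = kmalloc(8, GFP_KERNEL);
+
+		if (!buf)
+			return;
+		strcpy(buf, "1019.005");  /* the '\0' lands in the Redzone */
+		kfree(buf);               /* SLUB reports and repairs it here */
+	}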
+Emergency operations
+====================
+
+Minimal debugging (sanity checks alone) can be enabled by booting with::
+
+	slub_debug=F
+
+This will generally be enough to enable the resiliency features of slub
+which will keep the system running even if a bad kernel component keeps
+corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache.
+
+I.e.::
+
+	slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects::
+
+	slub_debug=FZ,dentry
+
+Extended slabinfo mode and plotting
+===================================
+
+The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
+ - Slabcache Totals
+ - Slabs sorted by size (up to -N slabs, default 1)
+ - Slabs sorted by loss (up to -N slabs, default 1)
+
+Additionally, in this mode ``slabinfo`` does not dynamically scale
+sizes (G/M/K) and reports everything in bytes (this functionality is
+also available to other slabinfo modes via the '-B' option) which makes
+reporting more precise and accurate. Moreover, in some sense the '-X'
+mode also simplifies the analysis of slabs' behaviour, because its
+output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
+pushes the analysis from looking through the numbers (tons of numbers)
+to something easier -- visual analysis.
+
+To generate plots:
+
+a) collect slabinfo extended records, for example::
+
+	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
+
+b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
+
+	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
+
+   The ``slabinfo-gnuplot.sh`` script will pre-process the collected records
+   and generate 3 png files (and 3 pre-processing cache files) per STATS
+   file:
+   - Slabcache Totals: FOO_STATS-totals.png
+   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
+   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
+
+Another use case where ``slabinfo-gnuplot.sh`` can be useful is when you
+need to compare slabs' behaviour "prior to" and "after" some code
+modification. To help you out there, the ``slabinfo-gnuplot.sh`` script
+can 'merge' the `Slabcache Totals` sections from different
+measurements. To visually compare N plots:
+
+a) Collect as many STATS1, STATS2, .. STATSN files as you need::
+
+	while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done
+
+b) Pre-process those STATS files::
+
+	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
+
+c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
+   generated pre-processed \*-totals::
+
+	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
+
+   This will produce a single plot (png file).
+
+   Plots, expectedly, can be large so some fluctuations or small spikes
+   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
+   options to 'zoom-in'/'zoom-out':
+
+   a) ``-s %d,%d`` -- overwrites the default image width and height
+   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
+      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
+      40,60`` range will plot only samples collected between 40th and
+      60th seconds).
+
+Christoph Lameter, May 30, 2007
+Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
deleted file mode 100644
index 3a775fd64e2d..000000000000
--- a/Documentation/vm/slub.txt
+++ /dev/null
@@ -1,361 +0,0 @@
-.. _slub:
-
-==========================
-Short users guide for SLUB
-==========================
-
-The basic philosophy of SLUB is very different from SLAB. SLAB
-requires rebuilding the kernel to activate debug options for all
-slab caches. SLUB always includes full debugging but it is off by default.
-SLUB can enable debugging only for selected slabs in order to avoid
-an impact on overall system performance which may make a bug more
-difficult to find.
-
-In order to switch debugging on one can add an option ``slub_debug``
-to the kernel command line. That will enable full debugging for
-all slabs.
-
-Typically one would then use the ``slabinfo`` command to get statistical
-data and perform operation on the slabs. By default ``slabinfo`` only lists
-slabs that have data in them. See "slabinfo -h" for more options when
-running the command. ``slabinfo`` can be compiled with
-::
-
-	gcc -o slabinfo tools/vm/slabinfo.c
-
-Some of the modes of operation of ``slabinfo`` require that slub debugging
-be enabled on the command line. F.e. no tracking information will be
-available without debugging on and validation can only partially
-be performed if debugging was not switched on.
-
-Some more sophisticated uses of slub_debug:
--------------------------------------------
-
-Parameters may be given to ``slub_debug``. If none is specified then full
-debugging is enabled.
Format: - -slub_debug= - Enable options for all slabs -slub_debug=, - Enable options only for select slabs - - -Possible debug options are:: - - F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS - Sorry SLAB legacy issues) - Z Red zoning - P Poisoning (object and padding) - U User tracking (free and alloc) - T Trace (please only use on single slabs) - A Toggle failslab filter mark for the cache - O Switch debugging off for caches that would have - caused higher minimum slab orders - - Switch all debugging off (useful if the kernel is - configured with CONFIG_SLUB_DEBUG_ON) - -F.e. in order to boot just with sanity checks and red zoning one would specify:: - - slub_debug=FZ - -Trying to find an issue in the dentry cache? Try:: - - slub_debug=,dentry - -to only enable debugging on the dentry cache. - -Red zoning and tracking may realign the slab. We can just apply sanity checks -to the dentry cache with:: - - slub_debug=F,dentry - -Debugging options may require the minimum possible slab order to increase as -a result of storing the metadata (for example, caches with PAGE_SIZE object -sizes). This has a higher liklihood of resulting in slab allocation errors -in low memory situations or if there's high fragmentation of memory. To -switch off debugging for such caches by default, use:: - - slub_debug=O - -In case you forgot to enable debugging on the kernel command line: It is -possible to enable debugging manually when the kernel is up. Look at the -contents of:: - - /sys/kernel/slab// - -Look at the writable files. Writing 1 to them will enable the -corresponding debug option. All options can be set on a slab that does -not contain objects. If the slab already contains objects then sanity checks -and tracing may only be enabled. The other options may cause the realignment -of objects. - -Careful with tracing: It may spew out lots of information and never stop if -used on the wrong slab. - -Slab merging -============ - -If no debug options are specified then SLUB may merge similar slabs together -in order to reduce overhead and increase cache hotness of objects. -``slabinfo -a`` displays which slabs were merged together. - -Slab validation -=============== - -SLUB can validate all object if the kernel was booted with slub_debug. In -order to do so you must have the ``slabinfo`` tool. Then you can do -:: - - slabinfo -v - -which will test all objects. Output will be generated to the syslog. - -This also works in a more limited way if boot was without slab debug. -In that case ``slabinfo -v`` simply tests all reachable objects. Usually -these are in the cpu slabs and the partial slabs. Full slabs are not -tracked by SLUB in a non debug situation. - -Getting more performance -======================== - -To some degree SLUB's performance is limited by the need to take the -list_lock once in a while to deal with partial slabs. That overhead is -governed by the order of the allocation for each slab. The allocations -can be influenced by kernel parameters: - -.. slub_min_objects=x (default 4) -.. slub_min_order=x (default 0) -.. slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER)) - -``slub_min_objects`` - allows to specify how many objects must at least fit into one - slab in order for the allocation order to be acceptable. In - general slub will be able to perform this number of - allocations on a slab without consulting centralized resources - (list_lock) where contention may occur. - -``slub_min_order`` - specifies a minim order of slabs. A similar effect like - ``slub_min_objects``. 
- -``slub_max_order`` - specified the order at which ``slub_min_objects`` should no - longer be checked. This is useful to avoid SLUB trying to - generate super large order pages to fit ``slub_min_objects`` - of a slab cache with large object sizes into one high order - page. Setting command line parameter - ``debug_guardpage_minorder=N`` (N > 0), forces setting - ``slub_max_order`` to 0, what cause minimum possible order of - slabs allocation. - -SLUB Debug output -================= - -Here is a sample of slub debug output:: - - ==================================================================== - BUG kmalloc-8: Redzone overwritten - -------------------------------------------------------------------- - - INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc - INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58 - INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 - INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 - - Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ - Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005 - Redzone 0xc90f6d28: 00 cc cc cc . - Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ - - [] dump_trace+0x63/0x1eb - [] show_trace_log_lvl+0x1a/0x2f - [] show_trace+0x12/0x14 - [] dump_stack+0x16/0x18 - [] object_err+0x143/0x14b - [] check_object+0x66/0x234 - [] __slab_free+0x239/0x384 - [] kfree+0xa6/0xc6 - [] get_modalias+0xb9/0xf5 - [] dmi_dev_uevent+0x27/0x3c - [] dev_uevent+0x1ad/0x1da - [] kobject_uevent_env+0x20a/0x45b - [] kobject_uevent+0xa/0xf - [] store_uevent+0x4f/0x58 - [] dev_attr_store+0x29/0x2f - [] sysfs_write_file+0x16e/0x19c - [] vfs_write+0xd1/0x15a - [] sys_write+0x3d/0x72 - [] sysenter_past_esp+0x5f/0x99 - [] 0xb7f7b410 - ======================= - - FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc - -If SLUB encounters a corrupted object (full detection requires the kernel -to be booted with slub_debug) then the following output will be dumped -into the syslog: - -1. Description of the problem encountered - - This will be a message in the system log starting with:: - - =============================================== - BUG : - ----------------------------------------------- - - INFO: - - INFO: Slab
- INFO: Object
- INFO: Allocated in age= cpu= pid= - INFO: Freed in age= cpu= - pid= - - (Object allocation / free information is only available if SLAB_STORE_USER is - set for the slab. slub_debug sets that option) - -2. The object contents if an object was involved. - - Various types of lines can follow the BUG SLUB line: - - Bytes b4
: - Shows a few bytes before the object where the problem was detected. - Can be useful if the corruption does not stop with the start of the - object. - - Object
: - The bytes of the object. If the object is inactive then the bytes - typically contain poison values. Any non-poison value shows a - corruption by a write after free. - - Redzone
: - The Redzone following the object. The Redzone is used to detect - writes after the object. All bytes should always have the same - value. If there is any deviation then it is due to a write after - the object boundary. - - (Redzone information is only available if SLAB_RED_ZONE is set. - slub_debug sets that option) - - Padding
: - Unused data to fill up the space in order to get the next object - properly aligned. In the debug case we make sure that there are - at least 4 bytes of padding. This allows the detection of writes - before the object. - -3. A stackdump - - The stackdump describes the location where the error was detected. The cause - of the corruption is may be more likely found by looking at the function that - allocated or freed the object. - -4. Report on how the problem was dealt with in order to ensure the continued - operation of the system. - - These are messages in the system log beginning with:: - - FIX : - - In the above sample SLUB found that the Redzone of an active object has - been overwritten. Here a string of 8 characters was written into a slab that - has the length of 8 characters. However, a 8 character string needs a - terminating 0. That zero has overwritten the first byte of the Redzone field. - After reporting the details of the issue encountered the FIX SLUB message - tells us that SLUB has restored the Redzone to its proper value and then - system operations continue. - -Emergency operations -==================== - -Minimal debugging (sanity checks alone) can be enabled by booting with:: - - slub_debug=F - -This will be generally be enough to enable the resiliency features of slub -which will keep the system running even if a bad kernel component will -keep corrupting objects. This may be important for production systems. -Performance will be impacted by the sanity checks and there will be a -continual stream of error messages to the syslog but no additional memory -will be used (unlike full debugging). - -No guarantees. The kernel component still needs to be fixed. Performance -may be optimized further by locating the slab that experiences corruption -and enabling debugging only for that cache - -I.e.:: - - slub_debug=F,dentry - -If the corruption occurs by writing after the end of the object then it -may be advisable to enable a Redzone to avoid corrupting the beginning -of other objects:: - - slub_debug=FZ,dentry - -Extended slabinfo mode and plotting -=================================== - -The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes: - - Slabcache Totals - - Slabs sorted by size (up to -N slabs, default 1) - - Slabs sorted by loss (up to -N slabs, default 1) - -Additionally, in this mode ``slabinfo`` does not dynamically scale -sizes (G/M/K) and reports everything in bytes (this functionality is -also available to other slabinfo modes via '-B' option) which makes -reporting more precise and accurate. Moreover, in some sense the `-X' -mode also simplifies the analysis of slabs' behaviour, because its -output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it -pushes the analysis from looking through the numbers (tons of numbers) -to something easier -- visual analysis. - -To generate plots: - -a) collect slabinfo extended records, for example:: - - while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done - -b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script:: - - slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. 
-	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
-
-   The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
-   and generates 3 png files (and 3 pre-processing cache files) per STATS
-   file:
-   - Slabcache Totals: FOO_STATS-totals.png
-   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
-   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
-
-Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
-need to compare slabs' behaviour "prior to" and "after" some code
-modification. To help you out there, ``slabinfo-gnuplot.sh`` script
-can 'merge' the `Slabcache Totals` sections from different
-measurements. To visually compare N plots:
-
-a) Collect as many STATS1, STATS2, .. STATSN files as you need::
-
-	while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done
-
-b) Pre-process those STATS files::
-
-	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
-
-c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
-   generated pre-processed \*-totals::
-
-	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
-
-   This will produce a single plot (png file).
-
-   Plots, expectedly, can be large so some fluctuations or small spikes
-   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
-   options to 'zoom-in'/'zoom-out':
-
-   a) ``-s %d,%d`` -- overwrites the default image width and heigh
-   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
-      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
-      40,60`` range will plot only samples collected between 40th and
-      60th seconds).
-
-Christoph Lameter, May 30, 2007
-Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/soft-dirty.rst b/Documentation/vm/soft-dirty.rst
new file mode 100644
index 000000000000..cb0cfd6672fa
--- /dev/null
+++ b/Documentation/vm/soft-dirty.rst
@@ -0,0 +1,47 @@
+.. _soft_dirty:
+
+===============
+Soft-Dirty PTEs
+===============
+
+Soft-dirty is a bit on a PTE which helps to track which pages a task
+writes to. In order to do this tracking one should
+
+ 1. Clear soft-dirty bits from the task's PTEs.
+
+    This is done by writing "4" into the ``/proc/PID/clear_refs`` file of the
+    task in question.
+
+ 2. Wait some time.
+
+ 3. Read soft-dirty bits from the PTEs.
+
+    This is done by reading from ``/proc/PID/pagemap``. Bit 55 of each
+    64-bit qword is the soft-dirty one. If set, the respective PTE was
+    written to since step 1.
+
+
+Internally, to do this tracking, the writable bit is cleared from PTEs
+when the soft-dirty bit is cleared. So, after this, when the task tries to
+modify a page at some virtual address the #PF occurs and the kernel sets
+the soft-dirty bit on the respective PTE.
+
+Note that although all the task's address space is marked as r/o after the
+soft-dirty bits are cleared, the #PF-s that occur after that are processed
+quickly. This is because the pages are still mapped to physical memory, and
+thus all the kernel does is find this out and put both the writable and
+soft-dirty bits on the PTE.
+
+While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario where we can lose soft-dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft-dirty bits. To notify user space applications about such
+memory region renewals the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
+
+This feature is actively used by the checkpoint-restore project.
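+For concreteness, a hedged userspace sketch of steps 1 and 3 for a process
+inspecting one of its own pages (paths and the bit number are exactly those
+documented above; error handling is trimmed)::
+
+	#define _GNU_SOURCE
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <unistd.h>
+
+	/* Step 1: clear soft-dirty bits for the whole task. */
+	static int clear_soft_dirty(void)
+	{
+		int fd = open("/proc/self/clear_refs", O_WRONLY);
+		int ret = (fd >= 0 && write(fd, "4", 1) == 1) ? 0 : -1;
+
+		if (fd >= 0)
+			close(fd);
+		return ret;
+	}
+
+	/* Step 3: read bit 55 of the pagemap entry covering 'addr'. */
+	static int page_soft_dirty(void *addr)
+	{
+		long psize = sysconf(_SC_PAGESIZE);
+		uint64_t ent = 0;
+		int fd = open("/proc/self/pagemap", O_RDONLY);
+
+		if (fd < 0)
+			return -1;
+		pread(fd, &ent, sizeof(ent),
+		      (uintptr_t)addr / psize * sizeof(ent));
+		close(fd);
+		return (ent >> 55) & 1;
+	}
+
+CRIU, the checkpoint-restore tool referenced above, drives the same two
+files at much larger scale.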
You +can find more details about it on http://criu.org + + +-- Pavel Emelyanov, Apr 9, 2013 diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt deleted file mode 100644 index cb0cfd6672fa..000000000000 --- a/Documentation/vm/soft-dirty.txt +++ /dev/null @@ -1,47 +0,0 @@ -.. _soft_dirty: - -=============== -Soft-Dirty PTEs -=============== - -The soft-dirty is a bit on a PTE which helps to track which pages a task -writes to. In order to do this tracking one should - - 1. Clear soft-dirty bits from the task's PTEs. - - This is done by writing "4" into the ``/proc/PID/clear_refs`` file of the - task in question. - - 2. Wait some time. - - 3. Read soft-dirty bits from the PTEs. - - This is done by reading from the ``/proc/PID/pagemap``. The bit 55 of the - 64-bit qword is the soft-dirty one. If set, the respective PTE was - written to since step 1. - - -Internally, to do this tracking, the writable bit is cleared from PTEs -when the soft-dirty bit is cleared. So, after this, when the task tries to -modify a page at some virtual address the #PF occurs and the kernel sets -the soft-dirty bit on the respective PTE. - -Note, that although all the task's address space is marked as r/o after the -soft-dirty bits clear, the #PF-s that occur after that are processed fast. -This is so, since the pages are still mapped to physical memory, and thus all -the kernel does is finds this fact out and puts both writable and soft-dirty -bits on the PTE. - -While in most cases tracking memory changes by #PF-s is more than enough -there is still a scenario when we can lose soft dirty bits -- a task -unmaps a previously mapped memory region and then maps a new one at exactly -the same place. When unmap is called, the kernel internally clears PTE values -including soft dirty bits. To notify user space application about such -memory region renewal the kernel always marks new memory regions (and -expanded regions) as soft dirty. - -This feature is actively used by the checkpoint-restore project. You -can find more details about it on http://criu.org - - --- Pavel Emelyanov, Apr 9, 2013 diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock deleted file mode 100644 index 889b00be469f..000000000000 --- a/Documentation/vm/split_page_table_lock +++ /dev/null @@ -1,100 +0,0 @@ -.. _split_page_table_lock: - -===================== -Split page table lock -===================== - -Originally, mm->page_table_lock spinlock protected all page tables of the -mm_struct. But this approach leads to poor page fault scalability of -multi-threaded applications due high contention on the lock. To improve -scalability, split page table lock was introduced. - -With split page table lock we have separate per-table lock to serialize -access to the table. At the moment we use split lock for PTE and PMD -tables. Access to higher level tables protected by mm->page_table_lock. 
- -There are helpers to lock/unlock a table and other accessor functions: - - - pte_offset_map_lock() - maps pte and takes PTE table lock, returns pointer to the taken - lock; - - pte_unmap_unlock() - unlocks and unmaps PTE table; - - pte_alloc_map_lock() - allocates PTE table if needed and take the lock, returns pointer - to taken lock or NULL if allocation failed; - - pte_lockptr() - returns pointer to PTE table lock; - - pmd_lock() - takes PMD table lock, returns pointer to taken lock; - - pmd_lockptr() - returns pointer to PMD table lock; - -Split page table lock for PTE tables is enabled compile-time if -CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS. -If split lock is disabled, all tables guaded by mm->page_table_lock. - -Split page table lock for PMD tables is enabled, if it's enabled for PTE -tables and the architecture supports it (see below). - -Hugetlb and split page table lock -================================= - -Hugetlb can support several page sizes. We use split lock only for PMD -level, but not for PUD. - -Hugetlb-specific helpers: - - - huge_pte_lock() - takes pmd split lock for PMD_SIZE page, mm->page_table_lock - otherwise; - - huge_pte_lockptr() - returns pointer to table lock; - -Support of split page table lock by an architecture -=================================================== - -There's no need in special enabling of PTE split page table lock: -everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), -which must be called on PTE table allocation / freeing. - -Make sure the architecture doesn't use slab allocator for page table -allocation: slab uses page->slab_cache for its pages. -This field shares storage with page->ptl. - -PMD split lock only makes sense if you have more than two page table -levels. - -PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table -allocation and pgtable_pmd_page_dtor() on freeing. - -Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and -pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing -paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). - -With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. - -NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must -be handled properly. - -page->ptl -========= - -page->ptl is used to access split page table lock, where 'page' is struct -page of page containing the table. It shares storage with page->private -(and few other fields in union). - -To avoid increasing size of struct page and have best performance, we use a -trick: - - - if spinlock_t fits into long, we use page->ptr as spinlock, so we - can avoid indirect access and save a cache line. - - if size of spinlock_t is bigger then size of long, we use page->ptl as - pointer to spinlock_t and allocate it dynamically. This allows to use - split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs - one more cache line for indirect access; - -The spinlock_t allocated in pgtable_page_ctor() for PTE table and in -pgtable_pmd_page_ctor() for PMD table. - -Please, never access page->ptl directly -- use appropriate helper. diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst new file mode 100644 index 000000000000..889b00be469f --- /dev/null +++ b/Documentation/vm/split_page_table_lock.rst @@ -0,0 +1,100 @@ +.. 
+_split_page_table_lock:
+
+=====================
+Split page table lock
+=====================
+
+Originally, the mm->page_table_lock spinlock protected all page tables of the
+mm_struct. But this approach leads to poor page fault scalability of
+multi-threaded applications due to high contention on the lock. To improve
+scalability, split page table lock was introduced.
+
+With split page table lock we have a separate per-table lock to serialize
+access to the table. At the moment we use split lock for PTE and PMD
+tables. Access to higher level tables is protected by mm->page_table_lock.
+
+There are helpers to lock/unlock a table and other accessor functions:
+
+ - pte_offset_map_lock()
+	maps pte and takes PTE table lock, returns pointer to the taken
+	lock;
+ - pte_unmap_unlock()
+	unlocks and unmaps PTE table;
+ - pte_alloc_map_lock()
+	allocates PTE table if needed and takes the lock, returns pointer
+	to taken lock or NULL if allocation failed;
+ - pte_lockptr()
+	returns pointer to PTE table lock;
+ - pmd_lock()
+	takes PMD table lock, returns pointer to taken lock;
+ - pmd_lockptr()
+	returns pointer to PMD table lock;
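+A hedged sketch of how the PTE helpers are typically used inside a
+page-table walk (a fragment, not a complete function; assumes ``pmd`` points
+at a PTE table and ``mm``/``addr`` are in scope)::
+
+	spinlock_t *ptl;
+	pte_t *pte;
+
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	/* map + lock */
+	if (pte_present(*pte)) {
+		/* inspect or update the entry under the per-table lock */
+	}
+	pte_unmap_unlock(pte, ptl);			/* unlock + unmap */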
+Split page table lock for PTE tables is enabled compile-time if
+CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less than or equal to NR_CPUS.
+If split lock is disabled, all tables are guarded by mm->page_table_lock.
+
+Split page table lock for PMD tables is enabled, if it's enabled for PTE
+tables and the architecture supports it (see below).
+
+Hugetlb and split page table lock
+=================================
+
+Hugetlb can support several page sizes. We use split lock only for PMD
+level, but not for PUD.
+
+Hugetlb-specific helpers:
+
+ - huge_pte_lock()
+	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
+	otherwise;
+ - huge_pte_lockptr()
+	returns pointer to table lock;
+
+Support of split page table lock by an architecture
+===================================================
+
+There's no need to specially enable PTE split page table lock:
+everything required is done by pgtable_page_ctor() and pgtable_page_dtor(),
+which must be called on PTE table allocation / freeing.
+
+Make sure the architecture doesn't use the slab allocator for page table
+allocation: slab uses page->slab_cache for its pages.
+This field shares storage with page->ptl.
+
+PMD split lock only makes sense if you have more than two page table
+levels.
+
+PMD split lock enabling requires a pgtable_pmd_page_ctor() call on PMD table
+allocation and pgtable_pmd_page_dtor() on freeing.
+
+Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
+pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
+paths: e.g., X86_PAE preallocates a few PMDs on pgd_alloc().
+
+With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
+
+NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- this must
+be handled properly.
+
+page->ptl
+=========
+
+page->ptl is used to access the split page table lock, where 'page' is the
+struct page of the page containing the table. It shares storage with
+page->private (and a few other fields in the union).
+
+To avoid increasing the size of struct page and to get the best performance,
+we use a trick:
+
+ - if spinlock_t fits into long, we use page->ptl as a spinlock, so we
+   can avoid indirect access and save a cache line.
+ - if the size of spinlock_t is bigger than the size of long, we use
+   page->ptl as a pointer to spinlock_t and allocate it dynamically. This
+   allows using the split lock with DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC
+   enabled, but costs one more cache line for the indirect access;
+
+The spinlock_t is allocated in pgtable_page_ctor() for PTE tables and in
+pgtable_pmd_page_ctor() for PMD tables.
+
+Please never access page->ptl directly -- use the appropriate helper.
diff --git a/Documentation/vm/swap_numa.rst b/Documentation/vm/swap_numa.rst
new file mode 100644
index 000000000000..e0466f2db8fa
--- /dev/null
+++ b/Documentation/vm/swap_numa.rst
@@ -0,0 +1,80 @@
+.. _swap_numa:
+
+===========================================
+Automatically bind swap device to numa node
+===========================================
+
+If the system has more than one swap device and the swap devices have node
+information, we can make use of this information to decide which swap
+device to use in get_swap_pages() to get better performance.
+
+
+How to use this feature
+=======================
+
+Each swap device has a priority, which decides the order in which it is
+used. To make use of automatic binding, there is no need to manipulate
+priority settings for swap devices. e.g. on a 2 node machine, assume 2 swap
+devices swapA and swapB, with swapA attached to node 0 and swapB attached
+to node 1, are going to be swapped on. Simply swap them on::
+
+	# swapon /dev/swapA
+	# swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB and
+node 1 will use the two swap devices in the order of swapB then swapA. Note
+that the order in which they are swapped on doesn't matter.
+
+A more complex example on a 4 node machine. Assume 6 swap devices are going to
+be swapped on: swapA and swapB are attached to node 0, swapC is attached to
+node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3.
+The way to swap them on is the same as above::
+
+	# swapon /dev/swapA
+	# swapon /dev/swapB
+	# swapon /dev/swapC
+	# swapon /dev/swapD
+	# swapon /dev/swapE
+	# swapon /dev/swapF
+
+Then node 0 will use them in the order of::
+
+	swapA/swapB -> swapC -> swapD -> swapE -> swapF
+
+swapA and swapB will be used in a round robin mode before any other swap device.
+
+node 1 will use them in the order of::
+
+	swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of::
+
+	swapD/swapE -> swapA -> swapB -> swapC -> swapF
+
+Similarly, swapD and swapE will be used in a round robin mode before any
+other swap devices.
+
+node 3 will use them in the order of::
+
+	swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+======================
+
+The current code uses a priority based list, swap_avail_list, to decide
+which swap device to use; if multiple swap devices share the same
+priority, they are used round robin. This change replaces the single
+global swap_avail_list with a per-numa-node list, i.e., each numa node
+sees its own priority-based list of available swap devices. A swap
+device's priority can be promoted on its matching node's swap_avail_list.
+
+A swap device's priority is set as follows: the user can set a value >= 0,
+or the system will pick one starting from -1 and going downwards. The
+priority value in the swap_avail_list is the negated value of the swap
+device's priority, due to plist being sorted from low to high. The new
+policy doesn't change the semantics for the priority >= 0 cases; the
+previous "starting from -1 then downwards" now becomes "starting from -2
+then downwards", and -1 is reserved as the promoted value.
So if multiple swap devices are attached to the same +node, they will all be promoted to priority -1 on that node's plist and will +be used round robin before any other swap devices. diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt deleted file mode 100644 index e0466f2db8fa..000000000000 --- a/Documentation/vm/swap_numa.txt +++ /dev/null @@ -1,80 +0,0 @@ -.. _swap_numa: - -=========================================== -Automatically bind swap device to numa node -=========================================== - -If the system has more than one swap device and swap device has the node -information, we can make use of this information to decide which swap -device to use in get_swap_pages() to get better performance. - - -How to use this feature -======================= - -Swap device has priority and that decides the order of it to be used. To make -use of automatically binding, there is no need to manipulate priority settings -for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and -swapB, with swapA attached to node 0 and swapB attached to node 1, are going -to be swapped on. Simply swapping them on by doing:: - - # swapon /dev/swapA - # swapon /dev/swapB - -Then node 0 will use the two swap devices in the order of swapA then swapB and -node 1 will use the two swap devices in the order of swapB then swapA. Note -that the order of them being swapped on doesn't matter. - -A more complex example on a 4 node machine. Assume 6 swap devices are going to -be swapped on: swapA and swapB are attached to node 0, swapC is attached to -node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. -The way to swap them on is the same as above:: - - # swapon /dev/swapA - # swapon /dev/swapB - # swapon /dev/swapC - # swapon /dev/swapD - # swapon /dev/swapE - # swapon /dev/swapF - -Then node 0 will use them in the order of:: - - swapA/swapB -> swapC -> swapD -> swapE -> swapF - -swapA and swapB will be used in a round robin mode before any other swap device. - -node 1 will use them in the order of:: - - swapC -> swapA -> swapB -> swapD -> swapE -> swapF - -node 2 will use them in the order of:: - - swapD/swapE -> swapA -> swapB -> swapC -> swapF - -Similaly, swapD and swapE will be used in a round robin mode before any -other swap devices. - -node 3 will use them in the order of:: - - swapF -> swapA -> swapB -> swapC -> swapD -> swapE - - -Implementation details -====================== - -The current code uses a priority based list, swap_avail_list, to decide -which swap device to use and if multiple swap devices share the same -priority, they are used round robin. This change here replaces the single -global swap_avail_list with a per-numa-node list, i.e. for each numa node, -it sees its own priority based list of available swap devices. Swap -device's priority can be promoted on its matching node's swap_avail_list. - -The current swap device's priority is set as: user can set a >=0 value, -or the system will pick one starting from -1 then downwards. The priority -value in the swap_avail_list is the negated value of the swap device's -due to plist being sorted from low to high. The new policy doesn't change -the semantics for priority >=0 cases, the previous starting from -1 then -downwards now becomes starting from -2 then downwards and -1 is reserved -as the promoted value. 
So if multiple swap devices are attached to the same
-node, they will all be promoted to priority -1 on that node's plist and will
-be used round robin before any other swap devices.
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
new file mode 100644
index 000000000000..569d182cc973
--- /dev/null
+++ b/Documentation/vm/transhuge.rst
@@ -0,0 +1,573 @@
+.. _transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+Objective
+=========
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent Hugepage Support is an alternative means of using
+huge pages for the backing of virtual memory, one that supports the
+automatic promotion and demotion of page sizes and avoids the
+shortcomings of hugetlbfs.
+
+Currently it only works for anonymous memory mappings and tmpfs/shmem.
+But in the future it can expand to other filesystems.
+
+Applications run faster because of two factors. The first factor consists
+in taking a single page fault for each 2M virtual region touched by
+userland (thus reducing the enter/exit kernel frequency by a factor of
+512). It is almost completely irrelevant and not of significant interest,
+because it only matters the first time the memory is accessed for the
+lifetime of a memory mapping, and it also has the downside of requiring
+larger clear-page/copy-page operations in page faults, which is a
+potentially negative effect. The second, long lasting and much more
+important factor affects all subsequent accesses to the memory for the
+whole runtime of the application. It consists of two components: 1) the
+TLB miss will run faster (especially with virtualization using nested
+pagetables but almost always also on bare metal without virtualization)
+and 2) a single TLB entry will be mapping a much larger amount of virtual
+memory, in turn reducing the number of TLB misses. With virtualization and
+nested pagetables, TLB entries of larger size can be used only if both KVM
+and the Linux guest are using hugepages, but a significant speedup already
+happens if only one of the two is using hugepages, just because the TLB
+miss runs faster.
+
+Design
+======
+
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking huge pmd mapping into table of ptes and,
+  if necessary, split a transparent hugepage. Therefore these components
+  can continue working on the regular pages or regular pte mappings.
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to prevent unmovable pages from fragmenting all the memory, but such a
+  tweak is not specific to transparent hugepage support and it's a
+  generic feature that applies to all dynamic high order allocations in
+  the kernel)
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+if compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable)
+entities. It doesn't require reservation to prevent hugepage
+allocation failures from being noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, just as they have been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by no means mandatory and khugepaged already can take care of long
+lived page allocations even for hugepage unaware applications that deal
+with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, applications
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it; in that case a 2M page might
+be allocated instead of a 4k page for no good reason. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that get a lot of benefit from hugepages and that don't
+risk losing memory by using hugepages should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
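+A hedged sketch of that advice (the helper name is made up, and the 2M
+constant assumes x86-64; a portable program would read hpage_pmd_size,
+shown in the sysfs section below)::
+
+	#define _DEFAULT_SOURCE
+	#include <stdlib.h>
+	#include <sys/mman.h>
+
+	#define HPAGE_SIZE (2UL << 20)	/* assumption: PMD page size is 2M */
+
+	static void *thp_alloc(size_t bytes)
+	{
+		void *p;
+
+		/* Natural alignment lets the first fault map a huge page. */
+		if (posix_memalign(&p, HPAGE_SIZE, bytes))
+			return NULL;
+		/* Advisory: enables THP for this range even in madvise mode. */
+		madvise(p, bytes, MADV_HUGEPAGE);
+		return p;
+	}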
+sysfs
+=====
+
+Transparent Hugepage Support for anonymous memory can be entirely disabled
+(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
+regions (to avoid the risk of consuming more memory resources) or enabled
+system wide. This can be achieved with one of::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/enabled
+	echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+	echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit defrag efforts in the VM to generate
+anonymous hugepages, in case they're not immediately free, to madvise
+regions, or to never try to defrag memory and simply fall back to regular
+pages unless hugepages are immediately available. Clearly if we spend CPU
+time to defrag memory, we would expect to gain even more by the fact we
+use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely in case the allocation is for a
+MADV_HUGEPAGE region.
+
+::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+always
+	means that an application requesting THP will stall on
+	allocation failure and directly reclaim pages and compact
+	memory in an effort to allocate a THP immediately. This may be
+	desirable for virtual machines that benefit heavily from THP
+	use and are willing to delay the VM start to utilise them.
+
+defer
+	means that an application will wake kswapd in the background
+	to reclaim pages and wake kcompactd to compact memory so that
+	THP is available in the near future. It's the responsibility
+	of khugepaged to then install the THP pages later.
+
+defer+madvise
+	will enter direct reclaim and compaction like ``always``, but
+	only for regions that have used madvise(MADV_HUGEPAGE); all
+	other regions will wake kswapd in the background to reclaim
+	pages and wake kcompactd to compact memory so that THP is
+	available in the near future.
+
+madvise
+	will enter direct reclaim like ``always`` but only for regions
+	that have used madvise(MADV_HUGEPAGE). This is the default
+	behaviour.
+
+never
+	should be self-explanatory.
+
+By default the kernel tries to use the huge zero page on read page faults
+to anonymous mappings. It's possible to disable the huge zero page by
+writing 0 or enable it back by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+	echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+
+Some userspace (such as a test program, or an optimized memory allocation
+library) may want to know the size (in bytes) of a transparent hugepage::
+
+	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
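+A minimal sketch of reading that knob from a program (no assumptions beyond
+the sysfs file named above)::
+
+	#include <stdio.h>
+
+	static unsigned long thp_pmd_size(void)
+	{
+		unsigned long sz = 0;
+		FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/"
+				"hpage_pmd_size", "r");
+
+		if (f) {
+			if (fscanf(f, "%lu", &sz) != 1)
+				sz = 0;
+			fclose(f);
+		}
+		return sz;	/* e.g. 2097152 on x86-64; 0 if unavailable */
+	}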
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise", and it'll
+be automatically shut down if it's set to "never".
+
+khugepaged usually runs at low frequency so while one may not want to
+invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged by writing 0 or enable
+defrag in khugepaged by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+	echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core)::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's a hugepage
+allocation failure to throttle the next allocation attempt::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+for each pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+``max_ptes_none`` specifies how many extra small pages (that are
+not already mapped) can be allocated when collapsing a group
+of small pages into one large page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
+
+A higher value leads to programs using additional memory; a lower value
+leads to less THP performance gain. The effect of max_ptes_none on CPU
+time is negligible, so you can ignore it in that respect.
+
+``max_ptes_swap`` specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting in fewer pages being collapsed into
+THPs, and lower memory access performance.
+Boot parameter
+==============
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter ``transparent_hugepage=always`` or
+``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
+to the kernel command line.
+
+Hugepages in tmpfs/shmem
+========================
+
+You can control hugepage allocation policy in tmpfs with the mount option
+``huge=``. It can have the following values:
+
+always
+	Attempt to allocate huge pages every time we need a new page;
+
+never
+	Do not allocate huge pages;
+
+within_size
+	Only allocate huge page if it will be fully within i_size.
+	Also respect fadvise()/madvise() hints;
+
+advise
+	Only allocate huge pages if requested with fadvise()/madvise();
+
+The default policy is ``never``.
+
+``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
+``huge=never`` will not attempt to break up huge pages at all, just stop more
+from being allocated.
+
+There's also a sysfs knob to control hugepage allocation policy for the
+internal shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The
+mount is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
+MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
+
+In addition to the policies listed above, shmem_enabled allows two further
+values:
+
+deny
+	For use in emergencies, to force the huge option off from
+	all mounts;
+force
+	Force the huge option on for all - very useful for testing;
+
+Need of application restart
+===========================
+
+The transparent_hugepage/enabled values and tmpfs mount option only affect
+future behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to the
+regions registered in khugepaged.
+
+Monitoring usage
+================
+
+The number of anonymous transparent huge pages currently used by the
+system is available by reading the AnonHugePages field in ``/proc/meminfo``.
+To identify what applications are using anonymous transparent huge pages,
+it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
+for each mapping.
+
+The number of file transparent huge pages mapped to userspace is available
+by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
+To identify what applications are mapping file transparent huge pages, it
+is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
+for each mapping.
+
+Note that reading the smaps file is expensive and reading it
+frequently will incur overhead.
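+A hedged sketch of the smaps accounting for a process inspecting itself
+(field name as documented above; values are in kB)::
+
+	#include <stdio.h>
+
+	/* Sum all AnonHugePages: fields of /proc/self/smaps. */
+	static long anon_huge_kb(void)
+	{
+		char line[256];
+		long kb, total = 0;
+		FILE *f = fopen("/proc/self/smaps", "r");
+
+		if (!f)
+			return -1;
+		while (fgets(line, sizeof(line), f))
+			if (sscanf(line, "AnonHugePages: %ld kB", &kb) == 1)
+				total += kb;
+		fclose(f);
+		return total;
+	}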
+There are a number of counters in ``/proc/vmstat`` that may be used to
+monitor how successfully the system is providing huge pages for use.
+
+thp_fault_alloc
+	is incremented every time a huge page is successfully
+	allocated to handle a page fault. This applies to both the
+	first time a page is faulted and for COW faults.
+
+thp_collapse_alloc
+	is incremented by khugepaged when it has found
+	a range of pages to collapse into one huge page and has
+	successfully allocated a new huge page to store the data.
+
+thp_fault_fallback
+	is incremented if a page fault fails to allocate
+	a huge page and instead falls back to using small pages.
+
+thp_collapse_alloc_failed
+	is incremented if khugepaged found a range
+	of pages that should be collapsed into one huge page but failed
+	the allocation.
+
+thp_file_alloc
+	is incremented every time a file huge page is successfully
+	allocated.
+
+thp_file_mapped
+	is incremented every time a file huge page is mapped into
+	user address space.
+
+thp_split_page
+	is incremented every time a huge page is split into base
+	pages. This can happen for a variety of reasons but a common
+	reason is that a huge page is old and is being reclaimed.
+	This action implies splitting all PMDs the page was mapped with.
+
+thp_split_page_failed
+	is incremented if the kernel fails to split a huge
+	page. This can happen if the page was pinned by somebody.
+
+thp_deferred_split_page
+	is incremented when a huge page is put onto the split
+	queue. This happens when a huge page is partially unmapped and
+	splitting it would free up some memory. Pages on the split queue
+	are going to be split under memory pressure.
+
+thp_split_pmd
+	is incremented every time a PMD is split into a table of PTEs.
+	This can happen, for instance, when an application calls mprotect()
+	or munmap() on part of a huge page. It doesn't split the huge page,
+	only the page table entry.
+
+thp_zero_page_alloc
+	is incremented every time a huge zero page is
+	successfully allocated. It includes allocations which were
+	dropped due to a race with another allocation. Note, it doesn't
+	count every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed
+	is incremented if the kernel fails to allocate a
+	huge zero page and falls back to using small pages.
+
+As the system ages, allocating huge pages may be expensive as the
+system uses memory compaction to copy data around memory to free a
+huge page for use. There are some counters in ``/proc/vmstat`` to help
+monitor this overhead.
+
+compact_stall
+    is incremented every time a process stalls to run
+    memory compaction so that a huge page is free for use.
+
+compact_success
+    is incremented if the system compacted memory and
+    freed a huge page for use.
+
+compact_fail
+    is incremented if the system tries to compact memory
+    but fails.
+
+compact_pages_moved
+    is incremented each time a page is moved. If
+    this value is increasing rapidly, it implies that the system
+    is copying a lot of data to satisfy the huge page allocation.
+    It is possible that the cost of copying exceeds any savings
+    from reduced TLB misses.
+
+compact_pagemigrate_failed
+    is incremented when the underlying mechanism
+    for moving a page failed.
+
+compact_blocks_moved
+    is incremented each time memory compaction examines
+    a huge page aligned range of pages.
+
+It is possible to establish how long the stalls were using the function
+tracer to record how long was spent in __alloc_pages_nodemask and
+using the mm_page_alloc tracepoint to identify which allocations were
+for huge pages.
+
+get_user_pages and follow_page
+==============================
+
+get_user_pages and follow_page, if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most GUP users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to poke at the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to
+check the head page instead. Taking a reference on any head/tail page
+would prevent the page from being split by anyone.
+
+.. note::
+   these aren't new constraints to the GUP API, and they match the
+   same constraints that apply to hugetlbfs too, so any driver capable
+   of handling GUP on hugetlbfs will also work fine on transparent
+   hugepage backed mappings.
+
+If you can't handle compound pages returned by follow_page, the
+FOLL_SPLIT bit can be specified as a parameter to follow_page, so that
+it will split the hugepages before returning them. Migration, for
+example, passes FOLL_SPLIT as a parameter to follow_page because it's
+not hugepage aware and in fact it can't work at all on hugetlbfs (but
+it instead works fine on transparent hugepages thanks to FOLL_SPLIT).
+Migration simply can't deal with hugepages being returned (as it's not
+only checking the pfn of the page and pinning it during the copy, but
+it pretends to migrate the memory in regular page sizes and with
+regular pte/pmd mappings).
+
+Optimizing the applications
+===========================
+
+To be guaranteed that the kernel will map a 2M page immediately in any
+memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
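+
+For instance, an application might reserve an aligned region and mark it
+for hugepage backing as follows (an illustrative userspace sketch; the
+2M constant assumes x86_64 - the authoritative value is reported by
+``hpage_pmd_size``)::
+
+    #include <stdlib.h>
+    #include <string.h>
+    #include <sys/mman.h>
+
+    #define HPAGE_SIZE (2UL * 1024 * 1024)
+
+    int main(void)
+    {
+            void *buf;
+
+            /* naturally aligned, so the kernel can use huge pages */
+            if (posix_memalign(&buf, HPAGE_SIZE, 16 * HPAGE_SIZE))
+                    return 1;
+
+            /* request THP backing even under the "madvise" policy */
+            if (madvise(buf, 16 * HPAGE_SIZE, MADV_HUGEPAGE))
+                    return 1;
+
+            /* first touch faults the region in, huge pages permitting */
+            memset(buf, 0, 16 * HPAGE_SIZE);
+            free(buf);
+            return 0;
+    }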
+
+Hugetlbfs
+=========
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
+
+Graceful fallback
+=================
+
+Code walking pagetables but unaware of huge pmds can simply call
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one-liner change, you can avoid writing
+hundreds if not thousands of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but run into a physical hugepage
+that you can't handle natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swap out the hugepage, for example. split_huge_page() can
+fail if the page is pinned and you must handle this correctly.
+
+Example to make mremap.c transparent hugepage aware with a one-liner
+change::
+
+    diff --git a/mm/mremap.c b/mm/mremap.c
+    --- a/mm/mremap.c
+    +++ b/mm/mremap.c
+    @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+                    return NULL;
+
+            pmd = pmd_offset(pud, addr);
+    +       split_huge_pmd(vma, pmd, addr);
+            if (pmd_none_or_clear_bad(pmd))
+                    return NULL;
+
+Locking in hugepage aware code
+==============================
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure a huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fall back to the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd from being converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page table lock and fall back to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
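+
+Sketched as code, a walker following these rules might look like this
+(an illustrative sketch only, not a copy of any in-kernel walker;
+``walk_one_pmd`` is a made-up name)::
+
+    /* Caller holds mmap_sem in read (or write) mode. */
+    static void walk_one_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+                             unsigned long addr)
+    {
+            spinlock_t *ptl;
+
+            if (pmd_trans_huge(*pmd)) {
+                    ptl = pmd_lock(vma->vm_mm, pmd);
+                    if (pmd_trans_huge(*pmd)) {
+                            /* still huge: process it natively... */
+                            spin_unlock(ptl);
+                            return;
+                    }
+                    /* split under us: fall through to the pte path */
+                    spin_unlock(ptl);
+            }
+            /* ... regular pte-level processing ... */
+    }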
+
+Refcounts and transparent huge pages
+====================================
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on the head page's ->_refcount.
+
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of pages with a PTE entry increments/decrements ->_mapcount
+    on the relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted in compound_mapcount
+    (stored in the first tail page). For file huge pages, we also increment
+    ->_mapcount of all sub-pages in order to have race-free detection of
+    the last unmap of subpages.
+
+PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
+
+For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
+subpages is offset up by one. This additional reference is required to
+get race-free detection of unmap of subpages when we have them mapped with
+both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+For anonymous pages, we set PG_double_map when a PMD of the page is split
+for the first time while the page still has a PMD mapping. The additional
+references go away with the last compound_mapcount.
+
+File pages get PG_double_map set on the first map of the page with a PTE;
+the flag goes away when the page is evicted from the page cache.
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (e.g. from get_user_pages). split_huge_page() fails any
+request to split a pinned huge page: it expects the page count to be equal
+to the sum of the mapcounts of all sub-pages plus one (the split_huge_page
+caller must hold a reference to the head page).
+
+split_huge_page uses migration entries to stabilize page->_refcount and
+page->_mapcount of anonymous pages. File pages are just unmapped.
+
+We are safe against physical memory scanners too: the only legitimate way
+a scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to the tail page up to that point. After
+the atomic_add() we don't care about the ->_refcount value. We already
+know how many references should be uncharged from the head page.
+
+For the head page get_page_unless_zero() will succeed and we don't mind.
+It's clear where the reference should go after the split: it will stay on
+the head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+the pmd can be split at any point and the operation never fails.
+
+Partial unmap and deferred_split_huge_page()
+============================================
+
+Unmapping part of a THP (with munmap() or another way) is not going to
+free memory immediately. Instead, we detect that a subpage of the THP is
+not in use in page_remove_rmap() and queue the THP for splitting if memory
+pressure comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It also might be
+counterproductive, since in many cases partial unmap happens during
+exit(2) if a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for
+splitting. The splitting itself will happen when we get memory pressure
+via the shrinker interface.
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
deleted file mode 100644
index 569d182cc973..000000000000
--- a/Documentation/vm/transhuge.txt
+++ /dev/null
@@ -1,573 +0,0 @@
-.. _transhuge:
-
-============================
-Transparent Hugepage Support
-============================
-
-Objective
-=========
-
-Performance critical computing applications dealing with large memory
-working sets are already running on top of libhugetlbfs and in turn
-hugetlbfs. Transparent Hugepage Support is an alternative means of
-using huge pages for the backing of virtual memory with huge pages
-that supports the automatic promotion and demotion of page sizes and
-without the shortcomings of hugetlbfs.
-
-Currently it only works for anonymous memory mappings and tmpfs/shmem.
-But in the future it can expand to other filesystems.
-
-The reason applications are running faster is because of two
-factors. The first factor is almost completely irrelevant and it's not
-of significant interest because it'll also have the downside of
-requiring larger clear-page copy-page in page faults which is a
-potentially negative effect.
The first factor consists in taking a -single page fault for each 2M virtual region touched by userland (so -reducing the enter/exit kernel frequency by a 512 times factor). This -only matters the first time the memory is accessed for the lifetime of -a memory mapping. The second long lasting and much more important -factor will affect all subsequent accesses to the memory for the whole -runtime of the application. The second factor consist of two -components: 1) the TLB miss will run faster (especially with -virtualization using nested pagetables but almost always also on bare -metal without virtualization) and 2) a single TLB entry will be -mapping a much larger amount of virtual memory in turn reducing the -number of TLB misses. With virtualization and nested pagetables the -TLB can be mapped of larger size only if both KVM and the Linux guest -are using hugepages but a significant speedup already happens if only -one of the two is using hugepages just because of the fact the TLB -miss is going to run faster. - -Design -====== - -- "graceful fallback": mm components which don't have transparent hugepage - knowledge fall back to breaking huge pmd mapping into table of ptes and, - if necessary, split a transparent hugepage. Therefore these components - can continue working on the regular pages or regular pte mappings. - -- if a hugepage allocation fails because of memory fragmentation, - regular pages should be gracefully allocated instead and mixed in - the same vma without any failure or significant delay and without - userland noticing - -- if some task quits and more hugepages become available (either - immediately in the buddy or through the VM), guest physical memory - backed by regular pages should be relocated on hugepages - automatically (with khugepaged) - -- it doesn't require memory reservation and in turn it uses hugepages - whenever possible (the only possible reservation here is kernelcore= - to avoid unmovable pages to fragment all the memory but such a tweak - is not specific to transparent hugepage support and it's a generic - feature that applies to all dynamic high order allocations in the - kernel) - -Transparent Hugepage Support maximizes the usefulness of free memory -if compared to the reservation approach of hugetlbfs by allowing all -unused memory to be used as cache or other movable (or even unmovable -entities). It doesn't require reservation to prevent hugepage -allocation failures to be noticeable from userland. It allows paging -and all other advanced VM features to be available on the -hugepages. It requires no modifications for applications to take -advantage of it. - -Applications however can be further optimized to take advantage of -this feature, like for example they've been optimized before to avoid -a flood of mmap system calls for every malloc(4k). Optimizing userland -is by far not mandatory and khugepaged already can take care of long -lived page allocations even for hugepage unaware applications that -deals with large amounts of memory. - -In certain cases when hugepages are enabled system wide, application -may end up allocating more memory resources. An application may mmap a -large region but only touch 1 byte of it, in that case a 2M page might -be allocated instead of a 4k page for no good. This is why it's -possible to disable hugepages system-wide and to only have them inside -MADV_HUGEPAGE madvise regions. 
- -Embedded systems should enable hugepages only inside madvise regions -to eliminate any risk of wasting any precious byte of memory and to -only run faster. - -Applications that gets a lot of benefit from hugepages and that don't -risk to lose memory by using hugepages, should use -madvise(MADV_HUGEPAGE) on their critical mmapped regions. - -sysfs -===== - -Transparent Hugepage Support for anonymous memory can be entirely disabled -(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE -regions (to avoid the risk of consuming more memory resources) or enabled -system wide. This can be achieved with one of:: - - echo always >/sys/kernel/mm/transparent_hugepage/enabled - echo madvise >/sys/kernel/mm/transparent_hugepage/enabled - echo never >/sys/kernel/mm/transparent_hugepage/enabled - -It's also possible to limit defrag efforts in the VM to generate -anonymous hugepages in case they're not immediately free to madvise -regions or to never try to defrag memory and simply fallback to regular -pages unless hugepages are immediately available. Clearly if we spend CPU -time to defrag memory, we would expect to gain even more by the fact we -use hugepages later instead of regular pages. This isn't always -guaranteed, but it may be more likely in case the allocation is for a -MADV_HUGEPAGE region. - -:: - - echo always >/sys/kernel/mm/transparent_hugepage/defrag - echo defer >/sys/kernel/mm/transparent_hugepage/defrag - echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag - echo madvise >/sys/kernel/mm/transparent_hugepage/defrag - echo never >/sys/kernel/mm/transparent_hugepage/defrag - -always - means that an application requesting THP will stall on - allocation failure and directly reclaim pages and compact - memory in an effort to allocate a THP immediately. This may be - desirable for virtual machines that benefit heavily from THP - use and are willing to delay the VM start to utilise them. - -defer - means that an application will wake kswapd in the background - to reclaim pages and wake kcompactd to compact memory so that - THP is available in the near future. It's the responsibility - of khugepaged to then install the THP pages later. - -defer+madvise - will enter direct reclaim and compaction like ``always``, but - only for regions that have used madvise(MADV_HUGEPAGE); all - other regions will wake kswapd in the background to reclaim - pages and wake kcompactd to compact memory so that THP is - available in the near future. - -madvise - will enter direct reclaim like ``always`` but only for regions - that are have used madvise(MADV_HUGEPAGE). This is the default - behaviour. - -never - should be self-explanatory. - -By default kernel tries to use huge zero page on read page fault to -anonymous mapping. It's possible to disable huge zero page by writing 0 -or enable it back by writing 1:: - - echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page - echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page - -Some userspace (such as a test program, or an optimized memory allocation -library) may want to know the size (in bytes) of a transparent hugepage:: - - cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size - -khugepaged will be automatically started when -transparent_hugepage/enabled is set to "always" or "madvise, and it'll -be automatically shutdown if it's set to "never". 
- -khugepaged runs usually at low frequency so while one may not want to -invoke defrag algorithms synchronously during the page faults, it -should be worth invoking defrag at least in khugepaged. However it's -also possible to disable defrag in khugepaged by writing 0 or enable -defrag in khugepaged by writing 1:: - - echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag - echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag - -You can also control how many pages khugepaged should scan at each -pass:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan - -and how many milliseconds to wait in khugepaged between each pass (you -can set this to 0 to run khugepaged at 100% utilization of one core):: - - /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs - -and how many milliseconds to wait in khugepaged if there's an hugepage -allocation failure to throttle the next allocation attempt:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs - -The khugepaged progress can be seen in the number of pages collapsed:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed - -for each pass:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/full_scans - -``max_ptes_none`` specifies how many extra small pages (that are -not already mapped) can be allocated when collapsing a group -of small pages into one large page:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none - -A higher value leads to use additional memory for programs. -A lower value leads to gain less thp performance. Value of -max_ptes_none can waste cpu time very little, you can -ignore it. - -``max_ptes_swap`` specifies how many pages can be brought in from -swap when collapsing a group of pages into a transparent huge page:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap - -A higher value can cause excessive swap IO and waste -memory. A lower value can prevent THPs from being -collapsed, resulting fewer pages being collapsed into -THPs, and lower memory access performance. - -Boot parameter -============== - -You can change the sysfs boot time defaults of Transparent Hugepage -Support by passing the parameter ``transparent_hugepage=always`` or -``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` -to the kernel command line. - -Hugepages in tmpfs/shmem -======================== - -You can control hugepage allocation policy in tmpfs with mount option -``huge=``. It can have following values: - -always - Attempt to allocate huge pages every time we need a new page; - -never - Do not allocate huge pages; - -within_size - Only allocate huge page if it will be fully within i_size. - Also respect fadvise()/madvise() hints; - -advise - Only allocate huge pages if requested with fadvise()/madvise(); - -The default policy is ``never``. - -``mount -o remount,huge= /mountpoint`` works fine after mount: remounting -``huge=never`` will not attempt to break up huge pages at all, just stop more -from being allocated. - -There's also sysfs knob to control hugepage allocation policy for internal -shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount -is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or -MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. 
- -In addition to policies listed above, shmem_enabled allows two further -values: - -deny - For use in emergencies, to force the huge option off from - all mounts; -force - Force the huge option on for all - very useful for testing; - -Need of application restart -=========================== - -The transparent_hugepage/enabled values and tmpfs mount option only affect -future behavior. So to make them effective you need to restart any -application that could have been using hugepages. This also applies to the -regions registered in khugepaged. - -Monitoring usage -================ - -The number of anonymous transparent huge pages currently used by the -system is available by reading the AnonHugePages field in ``/proc/meminfo``. -To identify what applications are using anonymous transparent huge pages, -it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields -for each mapping. - -The number of file transparent huge pages mapped to userspace is available -by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. -To identify what applications are mapping file transparent huge pages, it -is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields -for each mapping. - -Note that reading the smaps file is expensive and reading it -frequently will incur overhead. - -There are a number of counters in ``/proc/vmstat`` that may be used to -monitor how successfully the system is providing huge pages for use. - -thp_fault_alloc - is incremented every time a huge page is successfully - allocated to handle a page fault. This applies to both the - first time a page is faulted and for COW faults. - -thp_collapse_alloc - is incremented by khugepaged when it has found - a range of pages to collapse into one huge page and has - successfully allocated a new huge page to store the data. - -thp_fault_fallback - is incremented if a page fault fails to allocate - a huge page and instead falls back to using small pages. - -thp_collapse_alloc_failed - is incremented if khugepaged found a range - of pages that should be collapsed into one huge page but failed - the allocation. - -thp_file_alloc - is incremented every time a file huge page is successfully - allocated. - -thp_file_mapped - is incremented every time a file huge page is mapped into - user address space. - -thp_split_page - is incremented every time a huge page is split into base - pages. This can happen for a variety of reasons but a common - reason is that a huge page is old and is being reclaimed. - This action implies splitting all PMD the page mapped with. - -thp_split_page_failed - is incremented if kernel fails to split huge - page. This can happen if the page was pinned by somebody. - -thp_deferred_split_page - is incremented when a huge page is put onto split - queue. This happens when a huge page is partially unmapped and - splitting it would free up some memory. Pages on split queue are - going to be split under memory pressure. - -thp_split_pmd - is incremented every time a PMD split into table of PTEs. - This can happen, for instance, when application calls mprotect() or - munmap() on part of huge page. It doesn't split huge page, only - page table entry. - -thp_zero_page_alloc - is incremented every time a huge zero page is - successfully allocated. It includes allocations which where - dropped due race with other allocation. Note, it doesn't count - every map of the huge zero page, only its allocation. 
- -thp_zero_page_alloc_failed - is incremented if kernel fails to allocate - huge zero page and falls back to using small pages. - -As the system ages, allocating huge pages may be expensive as the -system uses memory compaction to copy data around memory to free a -huge page for use. There are some counters in ``/proc/vmstat`` to help -monitor this overhead. - -compact_stall - is incremented every time a process stalls to run - memory compaction so that a huge page is free for use. - -compact_success - is incremented if the system compacted memory and - freed a huge page for use. - -compact_fail - is incremented if the system tries to compact memory - but failed. - -compact_pages_moved - is incremented each time a page is moved. If - this value is increasing rapidly, it implies that the system - is copying a lot of data to satisfy the huge page allocation. - It is possible that the cost of copying exceeds any savings - from reduced TLB misses. - -compact_pagemigrate_failed - is incremented when the underlying mechanism - for moving a page failed. - -compact_blocks_moved - is incremented each time memory compaction examines - a huge page aligned range of pages. - -It is possible to establish how long the stalls were using the function -tracer to record how long was spent in __alloc_pages_nodemask and -using the mm_page_alloc tracepoint to identify which allocations were -for huge pages. - -get_user_pages and follow_page -============================== - -get_user_pages and follow_page if run on a hugepage, will return the -head or tail pages as usual (exactly as they would do on -hugetlbfs). Most gup users will only care about the actual physical -address of the page and its temporary pinning to release after the I/O -is complete, so they won't ever notice the fact the page is huge. But -if any driver is going to mangle over the page structure of the tail -page (like for checking page->mapping or other bits that are relevant -for the head page and not the tail page), it should be updated to jump -to check head page instead. Taking reference on any head/tail page would -prevent page from being split by anyone. - -.. note:: - these aren't new constraints to the GUP API, and they match the - same constrains that applies to hugetlbfs too, so any driver capable - of handling GUP on hugetlbfs will also work fine on transparent - hugepage backed mappings. - -In case you can't handle compound pages if they're returned by -follow_page, the FOLL_SPLIT bit can be specified as parameter to -follow_page, so that it will split the hugepages before returning -them. Migration for example passes FOLL_SPLIT as parameter to -follow_page because it's not hugepage aware and in fact it can't work -at all on hugetlbfs (but it instead works fine on transparent -hugepages thanks to FOLL_SPLIT). migration simply can't deal with -hugepages being returned (as it's not only checking the pfn of the -page and pinning it during the copy but it pretends to migrate the -memory in regular page sizes and with regular pte/pmd mappings). - -Optimizing the applications -=========================== - -To be guaranteed that the kernel will map a 2M page immediately in any -memory region, the mmap region has to be hugepage naturally -aligned. posix_memalign() can provide that guarantee. - -Hugetlbfs -========= - -You can use hugetlbfs on a kernel that has transparent hugepage -support enabled just fine as always. No difference can be noted in -hugetlbfs other than there will be less overall fragmentation. 
All -usual features belonging to hugetlbfs are preserved and -unaffected. libhugetlbfs will also work fine as usual. - -Graceful fallback -================= - -Code walking pagetables but unaware about huge pmds can simply call -split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by -pmd_offset. It's trivial to make the code transparent hugepage aware -by just grepping for "pmd_offset" and adding split_huge_pmd where -missing after pmd_offset returns the pmd. Thanks to the graceful -fallback design, with a one liner change, you can avoid to write -hundred if not thousand of lines of complex code to make your code -hugepage aware. - -If you're not walking pagetables but you run into a physical hugepage -but you can't handle it natively in your code, you can split it by -calling split_huge_page(page). This is what the Linux VM does before -it tries to swapout the hugepage for example. split_huge_page() can fail -if the page is pinned and you must handle this correctly. - -Example to make mremap.c transparent hugepage aware with a one liner -change:: - - diff --git a/mm/mremap.c b/mm/mremap.c - --- a/mm/mremap.c - +++ b/mm/mremap.c - @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru - return NULL; - - pmd = pmd_offset(pud, addr); - + split_huge_pmd(vma, pmd, addr); - if (pmd_none_or_clear_bad(pmd)) - return NULL; - -Locking in hugepage aware code -============================== - -We want as much code as possible hugepage aware, as calling -split_huge_page() or split_huge_pmd() has a cost. - -To make pagetable walks huge pmd aware, all you need to do is to call -pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the -mmap_sem in read (or write) mode to be sure an huge pmd cannot be -created from under you by khugepaged (khugepaged collapse_huge_page -takes the mmap_sem in write mode in addition to the anon_vma lock). If -pmd_trans_huge returns false, you just fallback in the old code -paths. If instead pmd_trans_huge returns true, you have to take the -page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the -page table lock will prevent the huge pmd to be converted into a -regular pmd from under you (split_huge_pmd can run in parallel to the -pagetable walk). If the second pmd_trans_huge returns false, you -should just drop the page table lock and fallback to the old code as -before. Otherwise you can proceed to process the huge pmd and the -hugepage natively. Once finished you can drop the page table lock. - -Refcounts and transparent huge pages -==================================== - -Refcounting on THP is mostly consistent with refcounting on other compound -pages: - - - get_page()/put_page() and GUP operate in head page's ->_refcount. - - - ->_refcount in tail pages is always zero: get_page_unless_zero() never - succeed on tail pages. - - - map/unmap of the pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page. - - - map/unmap of the whole compound page accounted in compound_mapcount - (stored in first tail page). For file huge pages, we also increment - ->_mapcount of all sub-pages in order to have race-free detection of - last unmap of subpages. - -PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. - -For anonymous pages PageDoubleMap() also indicates ->_mapcount in all -subpages is offset up by one. This additional reference is required to -get race-free detection of unmap of subpages when we have them mapped with -both PMDs and PTEs. 
- -This is optimization required to lower overhead of per-subpage mapcount -tracking. The alternative is alter ->_mapcount in all subpages on each -map/unmap of the whole compound page. - -For anonymous pages, we set PG_double_map when a PMD of the page got split -for the first time, but still have PMD mapping. The additional references -go away with last compound_mapcount. - -File pages get PG_double_map set on first map of the page with PTE and -goes away when the page gets evicted from page cache. - -split_huge_page internally has to distribute the refcounts in the head -page to the tail pages before clearing all PG_head/tail bits from the page -structures. It can be done easily for refcounts taken by page table -entries. But we don't have enough information on how to distribute any -additional pins (i.e. from get_user_pages). split_huge_page() fails any -requests to split pinned huge page: it expects page count to be equal to -sum of mapcount of all sub-pages plus one (split_huge_page caller must -have reference for head page). - -split_huge_page uses migration entries to stabilize page->_refcount and -page->_mapcount of anonymous pages. File pages just got unmapped. - -We safe against physical memory scanners too: the only legitimate way -scanner can get reference to a page is get_page_unless_zero(). - -All tail pages have zero ->_refcount until atomic_add(). This prevents the -scanner from getting a reference to the tail page up to that point. After the -atomic_add() we don't care about the ->_refcount value. We already known how -many references should be uncharged from the head page. - -For head page get_page_unless_zero() will succeed and we don't mind. It's -clear where reference should go after split: it will stay on head page. - -Note that split_huge_pmd() doesn't have any limitation on refcounting: -pmd can be split at any point and never fails. - -Partial unmap and deferred_split_huge_page() -============================================ - -Unmapping part of THP (with munmap() or other way) is not going to free -memory immediately. Instead, we detect that a subpage of THP is not in use -in page_remove_rmap() and queue the THP for splitting if memory pressure -comes. Splitting will free up unused subpages. - -Splitting the page right away is not an option due to locking context in -the place where we can detect partial unmap. It's also might be -counterproductive since in many cases partial unmap happens during exit(2) if -a THP crosses a VMA boundary. - -Function deferred_split_huge_page() is used to queue page for splitting. -The splitting itself will happen when we get memory pressure via shrinker -interface. diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst new file mode 100644 index 000000000000..fdd84cb8d511 --- /dev/null +++ b/Documentation/vm/unevictable-lru.rst @@ -0,0 +1,614 @@ +.. _unevictable_lru: + +============================== +Unevictable LRU Infrastructure +============================== + +.. contents:: :local: + + +Introduction +============ + +This document describes the Linux memory manager's "Unevictable LRU" +infrastructure and the use of this to manage several types of "unevictable" +pages. + +The document attempts to provide the overall rationale behind this mechanism +and the rationale for some of the design decisions that drove the +implementation. The latter design rationale is discussed in the context of an +implementation description. 
Admittedly, one can obtain the implementation
+details - the "what does it do?" - by reading the code. One hopes that the
+descriptions below add value by providing the answer to "why does it do
+that?".
+
+
+
+The Unevictable LRU
+===================
+
+The Unevictable LRU facility adds an additional LRU list to track unevictable
+pages and to hide these pages from vmscan. This mechanism is based on a patch
+by Larry Woodman of Red Hat to address several scalability problems with page
+reclaim in Linux. The problems have been observed at customer sites on large
+memory x86_64 systems.
+
+To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
+main memory will have over 32 million 4k pages in a single zone. When a large
+fraction of these pages are not evictable for any reason [see below], vmscan
+will spend a lot of time scanning the LRU lists looking for the small fraction
+of pages that are evictable. This can result in a situation where all CPUs are
+spending 100% of their time in vmscan for hours or days on end, with the system
+completely unresponsive.
+
+The unevictable list addresses the following classes of unevictable pages:
+
+ * Those owned by ramfs.
+
+ * Those mapped into SHM_LOCK'd shared memory regions.
+
+ * Those mapped into VM_LOCKED [mlock()ed] VMAs.
+
+The infrastructure may also be able to handle other conditions that make pages
+unevictable, either by definition or by circumstance, in the future.
+
+
+The Unevictable Page List
+-------------------------
+
+The Unevictable LRU infrastructure consists of an additional, per-zone, LRU
+list called the "unevictable" list and an associated page flag,
+PG_unevictable, to indicate that the page is being managed on the unevictable
+list.
+
+The PG_unevictable flag is analogous to, and mutually exclusive with, the
+PG_active flag in that it indicates on which LRU list a page resides when
+PG_lru is set.
+
+The Unevictable LRU infrastructure maintains unevictable pages on an additional
+LRU list for a few reasons:
+
+ (1) We get to "treat unevictable pages just like we treat other pages in the
+     system - which means we get to use the same code to manipulate them, the
+     same code to isolate them (for migrate, etc.), the same code to keep track
+     of the statistics, etc..." [Rik van Riel]
+
+ (2) We want to be able to migrate unevictable pages between nodes for memory
+     defragmentation, workload management and memory hotplug. The Linux kernel
+     can only migrate pages that it can successfully isolate from the LRU
+     lists. If we were to maintain pages elsewhere than on an LRU-like list,
+     where they can be found by isolate_lru_page(), we would prevent their
+     migration, unless we reworked migration code to find the unevictable pages
+     itself.
+
+
+The unevictable list does not differentiate between file-backed and anonymous,
+swap-backed pages. This differentiation is only important while the pages are,
+in fact, evictable.
+
+The unevictable list benefits from the "arrayification" of the per-zone LRU
+lists and statistics originally proposed and posted by Christoph Lameter.
+
+The unevictable list does not use the LRU pagevec mechanism. Rather,
+unevictable pages are placed directly on the page's zone's unevictable list
+under the zone lru_lock. This allows us to prevent the stranding of pages on
+the unevictable list when one task has the page isolated from the LRU and other
+tasks are changing the "evictability" state of the page.
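+
+In code terms, the direct placement is roughly the following (an
+illustrative fragment, not a copy of the kernel's putback path; recent
+kernels keep the lru_lock in the node's pglist_data, so the node-era
+helpers are used here)::
+
+    struct pglist_data *pgdat = page_pgdat(page);
+    struct lruvec *lruvec;
+
+    spin_lock_irq(&pgdat->lru_lock);
+    lruvec = mem_cgroup_page_lruvec(page, pgdat);
+    if (!page_evictable(page)) {
+            SetPageUnevictable(page);
+            add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
+    } else {
+            /* the page became evictable while we weren't looking */
+            add_page_to_lru_list(page, lruvec, page_lru(page));
+    }
+    spin_unlock_irq(&pgdat->lru_lock);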
+
+
+Memory Control Group Interaction
+--------------------------------
+
+The unevictable LRU facility interacts with the memory control group [aka
+memory controller; see Documentation/cgroup-v1/memory.txt] by extending the
+lru_list enum.
+
+The memory controller data structure automatically gets a per-zone unevictable
+list as a result of the "arrayification" of the per-zone LRU lists (one per
+lru_list enum element). The memory controller tracks the movement of pages to
+and from the unevictable list.
+
+When a memory control group comes under memory pressure, the controller will
+not attempt to reclaim pages on the unevictable list. This has a couple of
+effects:
+
+ (1) Because the pages are "hidden" from reclaim on the unevictable list, the
+     reclaim process can be more efficient, dealing only with pages that have a
+     chance of being reclaimed.
+
+ (2) On the other hand, if too many of the pages charged to the control group
+     are unevictable, the evictable portion of the working set of the tasks in
+     the control group may not fit into the available memory. This can cause
+     the control group to thrash or to OOM-kill tasks.
+
+
+.. _mark_addr_space_unevict:
+
+Marking Address Spaces Unevictable
+----------------------------------
+
+For facilities such as ramfs none of the pages attached to the address space
+may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE
+address space flag is provided, and this can be manipulated by a filesystem
+using a number of wrapper functions:
+
+ * ``void mapping_set_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being completely unevictable.
+
+ * ``void mapping_clear_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being evictable.
+
+ * ``int mapping_unevictable(struct address_space *mapping);``
+
+	Query the address space, and return true if it is completely
+	unevictable.
+
+These are currently used in two places in the kernel:
+
+ (1) By ramfs to mark the address spaces of its inodes when they are created,
+     and this mark remains for the life of the inode.
+
+ (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
+
+     Note that SHM_LOCK is not required to page in the locked pages if they're
+     swapped out; the application must touch the pages manually if it wants to
+     ensure they're in memory.
+
+
+Detecting Unevictable Pages
+---------------------------
+
+The function page_evictable() in vmscan.c determines whether a page is
+evictable or not using the query function outlined above [see section
+:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
+to check the AS_UNEVICTABLE flag.
+
+For address spaces that are so marked after being populated (as SHM regions
+might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
+the page tables for the region as does, for example, mlock(), nor need it make
+any special effort to push any pages in the SHM_LOCK'd area to the unevictable
+list. Instead, vmscan will do this if and when it encounters the pages during
+a reclamation scan.
+
+On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must
+scan the pages in the region and "rescue" them from the unevictable list if
+no other condition is keeping them unevictable. If an unevictable region is
+destroyed, the pages are also "rescued" from the unevictable list in the
+process of freeing them.
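+
+Putting the pieces together, the test is conceptually just the following
+(an illustrative sketch; ``my_page_evictable`` is a made-up name, and the
+real page_evictable() in mm/vmscan.c is the authoritative version)::
+
+    static bool my_page_evictable(struct page *page)
+    {
+            struct address_space *mapping = page_mapping(page);
+
+            /* ramfs inodes and SHM_LOCK'd segments mark the mapping */
+            if (mapping && mapping_unevictable(mapping))
+                    return false;
+
+            /* mlocked pages, covered by the next paragraph */
+            if (PageMlocked(page))
+                    return false;
+
+            return true;
+    }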
+
+page_evictable() also checks for mlocked pages by testing an additional page
+flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
+faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
+
+
+Vmscan's Handling of Unevictable Pages
+--------------------------------------
+
+If unevictable pages are culled in the fault path, or moved to the unevictable
+list at mlock() or mmap() time, vmscan will not encounter the pages until they
+have become evictable again (via munlock() for example) and have been "rescued"
+from the unevictable list. However, there may be situations where we decide,
+for the sake of expediency, to leave an unevictable page on one of the regular
+active/inactive LRU lists for vmscan to deal with. vmscan checks for such
+pages in all of the shrink_{active|inactive|page}_list() functions and will
+"cull" such pages that it encounters: that is, it diverts those pages to the
+unevictable list for the zone being scanned.
+
+There may be situations where a page is mapped into a VM_LOCKED VMA, but the
+page is not marked as PG_mlocked. Such pages will make it all the way to
+shrink_page_list() where they will be detected when vmscan walks the reverse
+map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK,
+shrink_page_list() will cull the page at that point.
+
+To "cull" an unevictable page, vmscan simply puts the page back on the LRU
+list using putback_lru_page() - the inverse operation to isolate_lru_page() -
+after dropping the page lock. Because the condition which makes the page
+unevictable may change once the page is unlocked, putback_lru_page() will
+recheck the unevictable state of a page that it places on the unevictable
+list. If the page has become evictable again, putback_lru_page() removes it
+from the unevictable list and retries, including the page_evictable() test.
+Because such a race is a rare event and movement of pages onto the
+unevictable list should be rare, these extra evictability checks should not
+occur in the majority of calls to putback_lru_page().
+
+
+MLOCKED Pages
+=============
+
+The unevictable page list is also useful for mlock(), in addition to ramfs
+and SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations;
+in NOMMU situations, all mappings are effectively mlocked.
+
+
+History
+-------
+
+The "Unevictable mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph Lameter
+to achieve the same objective: hiding mlocked pages from vmscan.
+
+In Nick's patch, he used one of the struct page LRU list link fields as a
+count of VM_LOCKED VMAs that map the page. This use of the link field for a
+count prevented the management of the pages on an LRU list, and thus mlocked
+pages were not migratable as isolate_lru_page() could not find them, and the
+LRU list link field was not available to the migration subsystem.
+
+Nick resolved this by putting mlocked pages back on the lru list before
+attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When
+Nick's patch was integrated with the Unevictable LRU work, the count was
+replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
+mapped the page. More on this below.
+
+
+Basic Management
+----------------
+
+mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of
+unevictable pages.
When such a page has been "noticed" by the memory management subsystem, +the page is marked with the PG_mlocked flag. This can be manipulated using the +PageMlocked() functions. + +A PG_mlocked page will be placed on the unevictable list when it is added to +the LRU. Such pages can be "noticed" by memory management in several places: + + (1) in the mlock()/mlockall() system call handlers; + + (2) in the mmap() system call handler when mmapping a region with the + MAP_LOCKED flag; + + (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE + flag + + (4) in the fault path, if mlocked pages are "culled" in the fault path, + and when a VM_LOCKED stack segment is expanded; or + + (5) as mentioned above, in vmscan:shrink_page_list() when attempting to + reclaim a page in a VM_LOCKED VMA via try_to_unmap() + +all of which result in the VM_LOCKED flag being set for the VMA if it doesn't +already have it set. + +mlocked pages become unlocked and rescued from the unevictable list when: + + (1) mapped in a range unlocked via the munlock()/munlockall() system calls; + + (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including + unmapping at task exit; + + (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; + or + + (4) before a page is COW'd in a VM_LOCKED VMA. + + +mlock()/mlockall() System Call Handling +--------------------------------------- + +Both [do\_]mlock() and [do\_]mlockall() system call handlers call mlock_fixup() +for each VMA in the range specified by the call. In the case of mlockall(), +this is the entire active address space of the task. Note that mlock_fixup() +is used for both mlocking and munlocking a range of memory. A call to mlock() +an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is +treated as a no-op, and mlock_fixup() simply returns. + +If the VMA passes some filtering as described in "Filtering Special Vmas" +below, mlock_fixup() will attempt to merge the VMA with its neighbors or split +off a subset of the VMA if the range does not cover the entire VMA. Once the +VMA has been merged or split or neither, mlock_fixup() will call +populate_vma_page_range() to fault in the pages via get_user_pages() and to +mark the pages as mlocked via mlock_vma_page(). + +Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, +get_user_pages() will be unable to fault in the pages. That's okay. If pages +do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the +fault path or in vmscan. + +Also note that a page returned by get_user_pages() could be truncated or +migrated out from under us, while we're trying to mlock it. To detect this, +populate_vma_page_range() checks page_mapping() after acquiring the page lock. +If the page is still associated with its mapping, we'll go ahead and call +mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. +In the worst case, this will result in a page mapped in a VM_LOCKED VMA +remaining on a normal LRU list without being PageMlocked(). Again, vmscan will +detect and cull such pages. + +mlock_vma_page() will call TestSetPageMlocked() for each page returned by +get_user_pages(). We use TestSetPageMlocked() because the page might already +be mlocked by another task/VMA and we don't want to do extra work. We +especially do not want to count an mlocked page more than once in the +statistics. If the page was already mlocked, mlock_vma_page() need do nothing +more. 
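+
+The heart of this is roughly the following pattern (an illustrative
+sketch close to the logic described here; ``my_mlock_vma_page`` is a
+made-up name)::
+
+    static void my_mlock_vma_page(struct page *page)
+    {
+            /* returns the old value: nonzero if already mlocked */
+            if (TestSetPageMlocked(page))
+                    return;
+
+            mod_zone_page_state(page_zone(page), NR_MLOCK,
+                                hpage_nr_pages(page));
+            count_vm_event(UNEVICTABLE_PGMLOCKED);
+
+            /* try to divert the page to the unevictable list now */
+            if (!isolate_lru_page(page))
+                    putback_lru_page(page); /* sees PG_mlocked */
+    }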
+ +If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the +page from the LRU, as it is likely on the appropriate active or inactive list +at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put +back the page - by calling putback_lru_page() - which will notice that the page +is now mlocked and divert the page to the zone's unevictable list. If +mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle +it later if and when it attempts to reclaim the page. + + +Filtering Special VMAs +---------------------- + +mlock_fixup() filters several classes of "special" VMAs: + +1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind + these mappings are inherently pinned, so we don't need to mark them as + mlocked. In any case, most of the pages have no struct page in which to so + mark the page. Because of this, get_user_pages() will fail for these VMAs, + so there is no sense in attempting to visit them. + +2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We + neither need nor want to mlock() these pages. However, to preserve the + prior behavior of mlock() - before the unevictable/mlock changes - + mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to + allocate the huge pages and populate the ptes. + +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. These pages + are inherently unevictable and are not managed on the LRU lists. + mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls + make_pages_present() to populate the ptes. + +Note that for all of these special VMAs, mlock_fixup() does not set the +VM_LOCKED flag. Therefore, we won't have to deal with them later during +munlock(), munmap() or task exit. Neither does mlock_fixup() account these +VMAs against the task's "locked_vm". + +.. _munlock_munlockall_handling: + +munlock()/munlockall() System Call Handling +------------------------------------------- + +The munlock() and munlockall() system calls are handled by the same functions - +do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs +lock operation indicated by an argument. So, these system calls are also +handled by mlock_fixup(). Again, if called for an already munlocked VMA, +mlock_fixup() simply returns. Because of the VMA filtering discussed above, +VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be +ignored for munlock. + +If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the +specified range. The range is then munlocked via the function +populate_vma_page_range() - the same function used to mlock a VMA range - +passing a flag to indicate that munlock() is being performed. + +Because the VMA access protections could have been changed to PROT_NONE after +faulting in and mlocking pages, get_user_pages() was unreliable for visiting +these pages for munlocking. Because we don't want to leave pages mlocked, +get_user_pages() was enhanced to accept a flag to ignore the permissions when +fetching the pages - all of which should be resident as a result of previous +mlocking. + +For munlock(), populate_vma_page_range() unlocks individual pages by calling +munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked +flag using TestClearPageMlocked(). 
As with mlock_vma_page(),
+munlock_vma_page() uses the Test*PageMlocked() function to handle the case
+where the page might have already been unlocked by another task. If the page
+was mlocked, munlock_vma_page() updates the zone statistics for the number of
+mlocked pages. Note, however, that at this point we haven't checked whether
+the page is mapped by other VM_LOCKED VMAs.
+
+We can't call try_to_munlock(), the function that walks the reverse map to
+check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
+try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
+not be on an LRU list [more on these below]. However, the call to
+isolate_lru_page() could fail, in which case we can't call try_to_munlock().
+So, we go ahead and clear PG_mlocked up front, as this might be the only
+chance we have. If we can successfully isolate the page, we go ahead and call
+try_to_munlock(), which will restore the PG_mlocked flag and update the zone
+page statistics if it finds another VMA holding the page mlocked. If we fail
+to isolate the page, we'll have left a potentially mlocked page on the LRU.
+This is fine, because we'll catch it later if and when vmscan tries to reclaim
+the page. This should be relatively rare.
+
+
+Migrating MLOCKED Pages
+-----------------------
+
+A page that is being migrated has been isolated from the LRU lists and is held
+locked across unmapping of the page, updating the page's address space entry
+and copying the contents and state, until the page table entry has been
+replaced with an entry that refers to the new page. Linux supports migration
+of mlocked pages and other unevictable pages. This involves simply moving the
+PG_mlocked and PG_unevictable states from the old page to the new page.
+
+Note that page migration can race with mlocking or munlocking of the same page.
+This has been discussed from the mlock/munlock perspective in the respective
+sections above. Both processes (migration and m[un]locking) hold the page
+locked. This provides the first level of synchronization. Page migration
+zeros out the page_mapping of the old page before unlocking it, so m[un]lock
+can skip these pages by testing the page mapping under page lock.
+
+To complete page migration, we place the new and old pages back onto the LRU
+after dropping the page lock. The "unneeded" page - old page on success, new
+page on failure - will be freed when the reference count held by the migration
+process is released. To ensure that we don't strand pages on the unevictable
+list because of a race between munlock and migration, page migration uses the
+putback_lru_page() function to add migrated pages back to the LRU.
+
+
+Compacting MLOCKED Pages
+------------------------
+
+The unevictable LRU can be scanned for compactable regions and the default
+behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls
+this behavior (see Documentation/sysctl/vm.txt). Once scanning of the
+unevictable LRU is enabled, the work of compaction is mostly handled by
+the page migration code and the same work flow as described in "Migrating
+MLOCKED Pages" will apply.
+
+MLOCKING Transparent Huge Pages
+-------------------------------
+
+A transparent huge page is represented by a single entry on an LRU list.
+Therefore, we can only make unevictable an entire compound page, not
+individual subpages.
+
+If a user tries to mlock() part of a huge page, we want the rest of the
+page to be reclaimable.
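+
+For instance, the situation arises from something as simple as the
+following (a hypothetical userspace sketch; 2M assumes the x86_64 PMD
+size)::
+
+    #include <stdlib.h>
+    #include <string.h>
+    #include <sys/mman.h>
+
+    #define HPAGE_SIZE (2UL * 1024 * 1024)
+
+    int main(void)
+    {
+            void *buf;
+
+            if (posix_memalign(&buf, HPAGE_SIZE, HPAGE_SIZE))
+                    return 1;
+            madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);
+            memset(buf, 0, HPAGE_SIZE);     /* may fault in a THP */
+
+            /* lock only the first 4k of the huge page */
+            mlock(buf, 4096);
+            return 0;
+    }
+
+Here the single huge page backing ``buf`` must stay usable by reclaim
+apart from its first subpage; the paragraphs below describe how that is
+achieved.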
+
+We cannot just split the page on partial mlock() as split_huge_page() can
+fail, and a new intermittent failure mode for the syscall is undesirable.
+
+We handle this by keeping PTE-mapped huge pages on normal LRU lists: the
+PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
+
+This way the huge page is accessible for vmscan. Under memory pressure the
+page will be split; subpages which belong to VM_LOCKED VMAs will be moved
+to the unevictable LRU and the rest can be reclaimed.
+
+See also the comment in follow_trans_huge_pmd().
+
+mmap(MAP_LOCKED) System Call Handling
+-------------------------------------
+
+In addition to the mlock()/mlockall() system calls, an application can
+request that a region of memory be mlocked by supplying the MAP_LOCKED flag
+to the mmap() call. There is one important and subtle difference here,
+though. mmap() + mlock() will fail if the range cannot be faulted in (e.g.
+because mm_populate() fails), returning ENOMEM, while mmap(MAP_LOCKED) will
+not fail. The mmap()ed area will still have properties of the locked area -
+i.e. pages will not get swapped out - but major page faults to fault memory
+in might still happen.
+
+Furthermore, any mmap() or brk() call that expands the heap, made by a task
+that has previously called mlockall() with the MCL_FUTURE flag, will result
+in the newly mapped memory being mlocked. Before the unevictable/mlock
+changes, the kernel simply called make_pages_present() to allocate pages
+and populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure, the
+mmap() handler and task address space expansion functions call
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+The callers of populate_vma_page_range() will have already added the memory
+range to be mlocked to the task's "locked_vm". To account for filtered
+VMAs, populate_vma_page_range() returns the number of pages NOT mlocked.
+All of the callers then subtract a non-negative return value from the
+task's locked_vm. A negative return value represents an error - for
+example, from get_user_pages() attempting to fault in a VMA with PROT_NONE
+access. In this case, we leave the memory range accounted as locked_vm, as
+the protections could be changed later and pages allocated into that
+region.
+
+
+munmap()/exit()/exec() System Call Handling
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED VMA that maps the
+pages. Before the unevictable/mlock changes, mlocking did not mark the
+pages in any way, so unmapping them required no processing.
+
+To munlock a range of memory under the unevictable/mlock infrastructure,
+the munmap() handler and the task address space tear-down function call
+munlock_vma_pages_all(). The name reflects the observation that one always
+specifies the entire VMA range when munlock()ing during unmap of a region.
+Because of the VMA filtering when mlock()ing regions, only "normal" VMAs
+that actually contain mlocked pages will be passed to
+munlock_vma_pages_all().
+
+munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like
+mlock_fixup() for the munlock case, calls __munlock_vma_pages_range() to
+walk the page table for the VMA's memory range and munlock_vma_page() each
+resident page mapped by the VMA.
+
+Furthermore, any mmap() or brk() call that expands the address space of a
+task that has previously called mlockall() with the MCL_FUTURE flag will
+result in the newly mapped memory being mlocked. Before the unevictable/mlock
+changes, the kernel simply called make_pages_present() to allocate pages and
+populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure, the
+mmap() handler and task address space expansion functions call
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+The callers of populate_vma_page_range() will have already added the memory
+range to be mlocked to the task's "locked_vm". To account for filtered VMAs,
+populate_vma_page_range() returns the number of pages NOT mlocked. All of the
+callers then subtract a non-negative return value from the task's locked_vm. A
+negative return value represents an error - for example, from get_user_pages()
+attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
+memory range accounted as locked_vm, as the protections could be changed later
+and pages allocated into that region.
+
+
+munmap()/exit()/exec() System Call Handling
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
+Before the unevictable/mlock changes, mlocking did not mark the pages in any
+way, so unmapping them required no processing.
+
+To munlock a range of memory under the unevictable/mlock infrastructure, the
+munmap() handler and the task address space tear-down function call
+munlock_vma_pages_all(). The name reflects the observation that one always
+specifies the entire VMA range when munlock()ing during unmap of a region.
+Because of the VMA filtering when mlock()ing regions, only "normal" VMAs that
+actually contain mlocked pages will be passed to munlock_vma_pages_all().
+
+munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup()
+for the munlock case, calls __munlock_vma_pages_range() to walk the page table
+for the VMA's memory range and munlock_vma_page() each resident page mapped by
+the VMA. This effectively munlocks the page, but only if this is the last
+VM_LOCKED VMA that maps the page.
+
+
+try_to_unmap()
+--------------
+
+Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may
+have the VM_LOCKED flag set. It is possible for a page mapped into one or more
+VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one
+of the active or inactive LRU lists. This could happen if, for example, a task
+in the process of munlocking the page could not isolate the page from the LRU.
+As a result, vmscan/shrink_page_list() might encounter such a page as described
+in section "vmscan's handling of unevictable pages". To handle this situation,
+try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse
+map.
+
+try_to_unmap() is always called, by either vmscan for reclaim or for page
+migration, with the argument page locked and isolated from the LRU. Separate
+functions handle anonymous, mapped file and KSM pages, as these types of
+pages have different reverse map lookup mechanisms, with different locking.
+In each case, whether rmap_walk_anon(), rmap_walk_file() or rmap_walk_ksm(),
+the walk will call try_to_unmap_one() for every VMA which might contain the
+page.
+
+When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED
+VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it,
+and return SWAP_MLOCK to indicate that the page is unevictable; the scan
+stops there.
+
+mlock_vma_page() is called while holding the page table's lock (in addition
+to the page lock and the rmap lock) to serialize against concurrent mlock,
+munlock and munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
+holepunching, and truncation of file pages and their anonymous COWed pages.
+
+
+try_to_munlock() Reverse Map Scan
+---------------------------------
+
+.. warning::
+   [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
+   page_referenced() reverse map walker.
+
+When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call
+Handling <munlock_munlockall_handling>` above] tries to munlock a
+page, it needs to determine whether or not the page is mapped by any
+VM_LOCKED VMA without actually attempting to unmap all PTEs from the
+page. For this purpose, the unevictable/mlock infrastructure
+introduced a variant of try_to_unmap() called try_to_munlock().
+
+try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
+mapped file and KSM pages with a flag argument specifying unlock versus unmap
+processing. Again, these functions walk the respective reverse maps looking
+for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case,
+the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This
+undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page().
+
+Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
+reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
+However, the scan can terminate as soon as it encounters a VM_LOCKED VMA.
+Although try_to_munlock() might be called a great many times when munlocking a
+large region or tearing down a large address space that has been mlocked via
+mlockall(), overall this is a fairly rare event.
+
+
+Page Reclaim in shrink_*_list()
+-------------------------------
+
+shrink_active_list() culls any obviously unevictable pages - i.e.
+!page_evictable(page) - diverting these to the unevictable list.
+
+However, shrink_active_list() only sees unevictable pages that made it onto the
+active/inactive LRU lists. Note that these pages do not have PageUnevictable
+set - otherwise they would be on the unevictable list and shrink_active_list()
+would never see them.
+
+Some examples of these unevictable pages on the LRU lists are:
+
+ (1) ramfs pages that have been placed on the LRU lists when first allocated.
+
+ (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to
+     allocate or fault in the pages in the shared memory region. Allocation
+     happens when an application accesses a page for the first time after
+     SHM_LOCK'ing the segment.
+
+ (3) mlocked pages that could not be isolated from the LRU and moved to the
+     unevictable list in mlock_vma_page().
+
+shrink_inactive_list() also diverts any unevictable pages that it finds on the
+inactive lists to the appropriate zone's unevictable list.
+
+shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
+after shrink_active_list() had moved them to the inactive list, or pages mapped
+into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
+recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
+but will pass them on to shrink_page_list().
+
+shrink_page_list() again culls obviously unevictable pages that it could
+encounter, for similar reasons to shrink_inactive_list(). Pages mapped into
+VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
+try_to_unmap(). shrink_page_list() will divert them to the unevictable list
+when try_to_unmap() returns SWAP_MLOCK, as discussed above.
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
deleted file mode 100644
index fdd84cb8d511..000000000000
--- a/Documentation/vm/unevictable-lru.txt
+++ /dev/null
@@ -1,614 +0,0 @@
-.. _unevictable_lru:
-
-==============================
-Unevictable LRU Infrastructure
-==============================
-
-.. contents:: :local:
-
-
-Introduction
-============
-
-This document describes the Linux memory manager's "Unevictable LRU"
-infrastructure and the use of this to manage several types of "unevictable"
-pages.
-
-The document attempts to provide the overall rationale behind this mechanism
-and the rationale for some of the design decisions that drove the
-implementation. The latter design rationale is discussed in the context of an
-implementation description. Admittedly, one can obtain the implementation
-details - the "what does it do?" - by reading the code. One hopes that the
-descriptions below add value by provide the answer to "why does it do that?".
-
-
-
-The Unevictable LRU
-===================
-
-The Unevictable LRU facility adds an additional LRU list to track unevictable
-pages and to hide these pages from vmscan. This mechanism is based on a patch
-by Larry Woodman of Red Hat to address several scalability problems with page
-reclaim in Linux. The problems have been observed at customer sites on large
-memory x86_64 systems.
-
-To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
-main memory will have over 32 million 4k pages in a single zone. When a large
-fraction of these pages are not evictable for any reason [see below], vmscan
-will spend a lot of time scanning the LRU lists looking for the small fraction
-of pages that are evictable. This can result in a situation where all CPUs are
-spending 100% of their time in vmscan for hours or days on end, with the system
-completely unresponsive.
- -The unevictable list addresses the following classes of unevictable pages: - - * Those owned by ramfs. - - * Those mapped into SHM_LOCK'd shared memory regions. - - * Those mapped into VM_LOCKED [mlock()ed] VMAs. - -The infrastructure may also be able to handle other conditions that make pages -unevictable, either by definition or by circumstance, in the future. - - -The Unevictable Page List -------------------------- - -The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list -called the "unevictable" list and an associated page flag, PG_unevictable, to -indicate that the page is being managed on the unevictable list. - -The PG_unevictable flag is analogous to, and mutually exclusive with, the -PG_active flag in that it indicates on which LRU list a page resides when -PG_lru is set. - -The Unevictable LRU infrastructure maintains unevictable pages on an additional -LRU list for a few reasons: - - (1) We get to "treat unevictable pages just like we treat other pages in the - system - which means we get to use the same code to manipulate them, the - same code to isolate them (for migrate, etc.), the same code to keep track - of the statistics, etc..." [Rik van Riel] - - (2) We want to be able to migrate unevictable pages between nodes for memory - defragmentation, workload management and memory hotplug. The linux kernel - can only migrate pages that it can successfully isolate from the LRU - lists. If we were to maintain pages elsewhere than on an LRU-like list, - where they can be found by isolate_lru_page(), we would prevent their - migration, unless we reworked migration code to find the unevictable pages - itself. - - -The unevictable list does not differentiate between file-backed and anonymous, -swap-backed pages. This differentiation is only important while the pages are, -in fact, evictable. - -The unevictable list benefits from the "arrayification" of the per-zone LRU -lists and statistics originally proposed and posted by Christoph Lameter. - -The unevictable list does not use the LRU pagevec mechanism. Rather, -unevictable pages are placed directly on the page's zone's unevictable list -under the zone lru_lock. This allows us to prevent the stranding of pages on -the unevictable list when one task has the page isolated from the LRU and other -tasks are changing the "evictability" state of the page. - - -Memory Control Group Interaction --------------------------------- - -The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/cgroup-v1/memory.txt] by extending the -lru_list enum. - -The memory controller data structure automatically gets a per-zone unevictable -list as a result of the "arrayification" of the per-zone LRU lists (one per -lru_list enum element). The memory controller tracks the movement of pages to -and from the unevictable list. - -When a memory control group comes under memory pressure, the controller will -not attempt to reclaim pages on the unevictable list. This has a couple of -effects: - - (1) Because the pages are "hidden" from reclaim on the unevictable list, the - reclaim process can be more efficient, dealing only with pages that have a - chance of being reclaimed. - - (2) On the other hand, if too many of the pages charged to the control group - are unevictable, the evictable portion of the working set of the tasks in - the control group may not fit into the available memory. This can cause - the control group to thrash or to OOM-kill tasks. - - -.. 
_mark_addr_space_unevict: - -Marking Address Spaces Unevictable ----------------------------------- - -For facilities such as ramfs none of the pages attached to the address space -may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE -address space flag is provided, and this can be manipulated by a filesystem -using a number of wrapper functions: - - * ``void mapping_set_unevictable(struct address_space *mapping);`` - - Mark the address space as being completely unevictable. - - * ``void mapping_clear_unevictable(struct address_space *mapping);`` - - Mark the address space as being evictable. - - * ``int mapping_unevictable(struct address_space *mapping);`` - - Query the address space, and return true if it is completely - unevictable. - -These are currently used in two places in the kernel: - - (1) By ramfs to mark the address spaces of its inodes when they are created, - and this mark remains for the life of the inode. - - (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. - - Note that SHM_LOCK is not required to page in the locked pages if they're - swapped out; the application must touch the pages manually if it wants to - ensure they're in memory. - - -Detecting Unevictable Pages ---------------------------- - -The function page_evictable() in vmscan.c determines whether a page is -evictable or not using the query function outlined above [see section -:ref:`Marking address spaces unevictable `] -to check the AS_UNEVICTABLE flag. - -For address spaces that are so marked after being populated (as SHM regions -might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate -the page tables for the region as does, for example, mlock(), nor need it make -any special effort to push any pages in the SHM_LOCK'd area to the unevictable -list. Instead, vmscan will do this if and when it encounters the pages during -a reclamation scan. - -On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan -the pages in the region and "rescue" them from the unevictable list if no other -condition is keeping them unevictable. If an unevictable region is destroyed, -the pages are also "rescued" from the unevictable list in the process of -freeing them. - -page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is -faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED. - - -Vmscan's Handling of Unevictable Pages --------------------------------------- - -If unevictable pages are culled in the fault path, or moved to the unevictable -list at mlock() or mmap() time, vmscan will not encounter the pages until they -have become evictable again (via munlock() for example) and have been "rescued" -from the unevictable list. However, there may be situations where we decide, -for the sake of expediency, to leave a unevictable page on one of the regular -active/inactive LRU lists for vmscan to deal with. vmscan checks for such -pages in all of the shrink_{active|inactive|page}_list() functions and will -"cull" such pages that it encounters: that is, it diverts those pages to the -unevictable list for the zone being scanned. - -There may be situations where a page is mapped into a VM_LOCKED VMA, but the -page is not marked as PG_mlocked. Such pages will make it all the way to -shrink_page_list() where they will be detected when vmscan walks the reverse -map in try_to_unmap(). 
If try_to_unmap() returns SWAP_MLOCK, -shrink_page_list() will cull the page at that point. - -To "cull" an unevictable page, vmscan simply puts the page back on the LRU list -using putback_lru_page() - the inverse operation to isolate_lru_page() - after -dropping the page lock. Because the condition which makes the page unevictable -may change once the page is unlocked, putback_lru_page() will recheck the -unevictable state of a page that it places on the unevictable list. If the -page has become unevictable, putback_lru_page() removes it from the list and -retries, including the page_unevictable() test. Because such a race is a rare -event and movement of pages onto the unevictable list should be rare, these -extra evictabilty checks should not occur in the majority of calls to -putback_lru_page(). - - -MLOCKED Pages -============= - -The unevictable page list is also useful for mlock(), in addition to ramfs and -SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in -NOMMU situations, all mappings are effectively mlocked. - - -History -------- - -The "Unevictable mlocked Pages" infrastructure is based on work originally -posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". -Nick posted his patch as an alternative to a patch posted by Christoph Lameter -to achieve the same objective: hiding mlocked pages from vmscan. - -In Nick's patch, he used one of the struct page LRU list link fields as a count -of VM_LOCKED VMAs that map the page. This use of the link field for a count -prevented the management of the pages on an LRU list, and thus mlocked pages -were not migratable as isolate_lru_page() could not find them, and the LRU list -link field was not available to the migration subsystem. - -Nick resolved this by putting mlocked pages back on the lru list before -attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When -Nick's patch was integrated with the Unevictable LRU work, the count was -replaced by walking the reverse map to determine whether any VM_LOCKED VMAs -mapped the page. More on this below. - - -Basic Management ----------------- - -mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable -pages. When such a page has been "noticed" by the memory management subsystem, -the page is marked with the PG_mlocked flag. This can be manipulated using the -PageMlocked() functions. - -A PG_mlocked page will be placed on the unevictable list when it is added to -the LRU. Such pages can be "noticed" by memory management in several places: - - (1) in the mlock()/mlockall() system call handlers; - - (2) in the mmap() system call handler when mmapping a region with the - MAP_LOCKED flag; - - (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE - flag - - (4) in the fault path, if mlocked pages are "culled" in the fault path, - and when a VM_LOCKED stack segment is expanded; or - - (5) as mentioned above, in vmscan:shrink_page_list() when attempting to - reclaim a page in a VM_LOCKED VMA via try_to_unmap() - -all of which result in the VM_LOCKED flag being set for the VMA if it doesn't -already have it set. 
- -mlocked pages become unlocked and rescued from the unevictable list when: - - (1) mapped in a range unlocked via the munlock()/munlockall() system calls; - - (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including - unmapping at task exit; - - (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; - or - - (4) before a page is COW'd in a VM_LOCKED VMA. - - -mlock()/mlockall() System Call Handling ---------------------------------------- - -Both [do\_]mlock() and [do\_]mlockall() system call handlers call mlock_fixup() -for each VMA in the range specified by the call. In the case of mlockall(), -this is the entire active address space of the task. Note that mlock_fixup() -is used for both mlocking and munlocking a range of memory. A call to mlock() -an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is -treated as a no-op, and mlock_fixup() simply returns. - -If the VMA passes some filtering as described in "Filtering Special Vmas" -below, mlock_fixup() will attempt to merge the VMA with its neighbors or split -off a subset of the VMA if the range does not cover the entire VMA. Once the -VMA has been merged or split or neither, mlock_fixup() will call -populate_vma_page_range() to fault in the pages via get_user_pages() and to -mark the pages as mlocked via mlock_vma_page(). - -Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, -get_user_pages() will be unable to fault in the pages. That's okay. If pages -do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the -fault path or in vmscan. - -Also note that a page returned by get_user_pages() could be truncated or -migrated out from under us, while we're trying to mlock it. To detect this, -populate_vma_page_range() checks page_mapping() after acquiring the page lock. -If the page is still associated with its mapping, we'll go ahead and call -mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. -In the worst case, this will result in a page mapped in a VM_LOCKED VMA -remaining on a normal LRU list without being PageMlocked(). Again, vmscan will -detect and cull such pages. - -mlock_vma_page() will call TestSetPageMlocked() for each page returned by -get_user_pages(). We use TestSetPageMlocked() because the page might already -be mlocked by another task/VMA and we don't want to do extra work. We -especially do not want to count an mlocked page more than once in the -statistics. If the page was already mlocked, mlock_vma_page() need do nothing -more. - -If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the -page from the LRU, as it is likely on the appropriate active or inactive list -at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put -back the page - by calling putback_lru_page() - which will notice that the page -is now mlocked and divert the page to the zone's unevictable list. If -mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle -it later if and when it attempts to reclaim the page. - - -Filtering Special VMAs ----------------------- - -mlock_fixup() filters several classes of "special" VMAs: - -1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind - these mappings are inherently pinned, so we don't need to mark them as - mlocked. In any case, most of the pages have no struct page in which to so - mark the page. 
Because of this, get_user_pages() will fail for these VMAs, - so there is no sense in attempting to visit them. - -2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We - neither need nor want to mlock() these pages. However, to preserve the - prior behavior of mlock() - before the unevictable/mlock changes - - mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to - allocate the huge pages and populate the ptes. - -3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, - such as the VDSO page, relay channel pages, etc. These pages - are inherently unevictable and are not managed on the LRU lists. - mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls - make_pages_present() to populate the ptes. - -Note that for all of these special VMAs, mlock_fixup() does not set the -VM_LOCKED flag. Therefore, we won't have to deal with them later during -munlock(), munmap() or task exit. Neither does mlock_fixup() account these -VMAs against the task's "locked_vm". - -.. _munlock_munlockall_handling: - -munlock()/munlockall() System Call Handling -------------------------------------------- - -The munlock() and munlockall() system calls are handled by the same functions - -do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs -lock operation indicated by an argument. So, these system calls are also -handled by mlock_fixup(). Again, if called for an already munlocked VMA, -mlock_fixup() simply returns. Because of the VMA filtering discussed above, -VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be -ignored for munlock. - -If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the -specified range. The range is then munlocked via the function -populate_vma_page_range() - the same function used to mlock a VMA range - -passing a flag to indicate that munlock() is being performed. - -Because the VMA access protections could have been changed to PROT_NONE after -faulting in and mlocking pages, get_user_pages() was unreliable for visiting -these pages for munlocking. Because we don't want to leave pages mlocked, -get_user_pages() was enhanced to accept a flag to ignore the permissions when -fetching the pages - all of which should be resident as a result of previous -mlocking. - -For munlock(), populate_vma_page_range() unlocks individual pages by calling -munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked -flag using TestClearPageMlocked(). As with mlock_vma_page(), -munlock_vma_page() use the Test*PageMlocked() function to handle the case where -the page might have already been unlocked by another task. If the page was -mlocked, munlock_vma_page() updates that zone statistics for the number of -mlocked pages. Note, however, that at this point we haven't checked whether -the page is mapped by other VM_LOCKED VMAs. - -We can't call try_to_munlock(), the function that walks the reverse map to -check for other VM_LOCKED VMAs, without first isolating the page from the LRU. -try_to_munlock() is a variant of try_to_unmap() and thus requires that the page -not be on an LRU list [more on these below]. However, the call to -isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So, -we go ahead and clear PG_mlocked up front, as this might be the only chance we -have. 
If we can successfully isolate the page, we go ahead and -try_to_munlock(), which will restore the PG_mlocked flag and update the zone -page statistics if it finds another VMA holding the page mlocked. If we fail -to isolate the page, we'll have left a potentially mlocked page on the LRU. -This is fine, because we'll catch it later if and if vmscan tries to reclaim -the page. This should be relatively rare. - - -Migrating MLOCKED Pages ------------------------ - -A page that is being migrated has been isolated from the LRU lists and is held -locked across unmapping of the page, updating the page's address space entry -and copying the contents and state, until the page table entry has been -replaced with an entry that refers to the new page. Linux supports migration -of mlocked pages and other unevictable pages. This involves simply moving the -PG_mlocked and PG_unevictable states from the old page to the new page. - -Note that page migration can race with mlocking or munlocking of the same page. -This has been discussed from the mlock/munlock perspective in the respective -sections above. Both processes (migration and m[un]locking) hold the page -locked. This provides the first level of synchronization. Page migration -zeros out the page_mapping of the old page before unlocking it, so m[un]lock -can skip these pages by testing the page mapping under page lock. - -To complete page migration, we place the new and old pages back onto the LRU -after dropping the page lock. The "unneeded" page - old page on success, new -page on failure - will be freed when the reference count held by the migration -process is released. To ensure that we don't strand pages on the unevictable -list because of a race between munlock and migration, page migration uses the -putback_lru_page() function to add migrated pages back to the LRU. - - -Compacting MLOCKED Pages ------------------------- - -The unevictable LRU can be scanned for compactable regions and the default -behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls -this behavior (see Documentation/sysctl/vm.txt). Once scanning of the -unevictable LRU is enabled, the work of compaction is mostly handled by -the page migration code and the same work flow as described in MIGRATING -MLOCKED PAGES will apply. - -MLOCKING Transparent Huge Pages -------------------------------- - -A transparent huge page is represented by a single entry on an LRU list. -Therefore, we can only make unevictable an entire compound page, not -individual subpages. - -If a user tries to mlock() part of a huge page, we want the rest of the -page to be reclaimable. - -We cannot just split the page on partial mlock() as split_huge_page() can -fail and new intermittent failure mode for the syscall is undesirable. - -We handle this by keeping PTE-mapped huge pages on normal LRU lists: the -PMD on border of VM_LOCKED VMA will be split into PTE table. - -This way the huge page is accessible for vmscan. Under memory pressure the -page will be split, subpages which belong to VM_LOCKED VMAs will be moved -to unevictable LRU and the rest can be reclaimed. - -See also comment in follow_trans_huge_pmd(). - -mmap(MAP_LOCKED) System Call Handling -------------------------------------- - -In addition the mlock()/mlockall() system calls, an application can request -that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap() -call. There is one important and subtle difference here, though. mmap() + mlock() -will fail if the range cannot be faulted in (e.g. 
because mm_populate fails) -and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. The mmaped -area will still have properties of the locked area - aka. pages will not get -swapped out - but major page faults to fault memory in might still happen. - -Furthermore, any mmap() call or brk() call that expands the heap by a -task that has previously called mlockall() with the MCL_FUTURE flag will result -in the newly mapped memory being mlocked. Before the unevictable/mlock -changes, the kernel simply called make_pages_present() to allocate pages and -populate the page table. - -To mlock a range of memory under the unevictable/mlock infrastructure, the -mmap() handler and task address space expansion functions call -populate_vma_page_range() specifying the vma and the address range to mlock. - -The callers of populate_vma_page_range() will have already added the memory range -to be mlocked to the task's "locked_vm". To account for filtered VMAs, -populate_vma_page_range() returns the number of pages NOT mlocked. All of the -callers then subtract a non-negative return value from the task's locked_vm. A -negative return value represent an error - for example, from get_user_pages() -attempting to fault in a VMA with PROT_NONE access. In this case, we leave the -memory range accounted as locked_vm, as the protections could be changed later -and pages allocated into that region. - - -munmap()/exit()/exec() System Call Handling -------------------------------------------- - -When unmapping an mlocked region of memory, whether by an explicit call to -munmap() or via an internal unmap from exit() or exec() processing, we must -munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. -Before the unevictable/mlock changes, mlocking did not mark the pages in any -way, so unmapping them required no processing. - -To munlock a range of memory under the unevictable/mlock infrastructure, the -munmap() handler and task address space call tear down function -munlock_vma_pages_all(). The name reflects the observation that one always -specifies the entire VMA range when munlock()ing during unmap of a region. -Because of the VMA filtering when mlocking() regions, only "normal" VMAs that -actually contain mlocked pages will be passed to munlock_vma_pages_all(). - -munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup() -for the munlock case, calls __munlock_vma_pages_range() to walk the page table -for the VMA's memory range and munlock_vma_page() each resident page mapped by -the VMA. This effectively munlocks the page, only if this is the last -VM_LOCKED VMA that maps the page. - - -try_to_unmap() --------------- - -Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may -have VM_LOCKED flag set. It is possible for a page mapped into one or more -VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one -of the active or inactive LRU lists. This could happen if, for example, a task -in the process of munlocking the page could not isolate the page from the LRU. -As a result, vmscan/shrink_page_list() might encounter such a page as described -in section "vmscan's handling of unevictable pages". To handle this situation, -try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse -map. - -try_to_unmap() is always called, by either vmscan for reclaim or for page -migration, with the argument page locked and isolated from the LRU. 
Separate -functions handle anonymous and mapped file and KSM pages, as these types of -pages have different reverse map lookup mechanisms, with different locking. -In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(), -it will call try_to_unmap_one() for every VMA which might contain the page. - -When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED -VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it, -and return SWAP_MLOCK to indicate that the page is unevictable: and the scan -stops there. - -mlock_vma_page() is called while holding the page table's lock (in addition -to the page lock, and the rmap lock): to serialize against concurrent mlock or -munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim, -holepunching, and truncation of file pages and their anonymous COWed pages. - - -try_to_munlock() Reverse Map Scan ---------------------------------- - -.. warning:: - [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the - page_referenced() reverse map walker. - -When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call -Handling ` above] tries to munlock a -page, it needs to determine whether or not the page is mapped by any -VM_LOCKED VMA without actually attempting to unmap all PTEs from the -page. For this purpose, the unevictable/mlock infrastructure -introduced a variant of try_to_unmap() called try_to_munlock(). - -try_to_munlock() calls the same functions as try_to_unmap() for anonymous and -mapped file and KSM pages with a flag argument specifying unlock versus unmap -processing. Again, these functions walk the respective reverse maps looking -for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case, -the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This -undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page. - -Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's -reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. -However, the scan can terminate when it encounters a VM_LOCKED VMA. -Although try_to_munlock() might be called a great many times when munlocking a -large region or tearing down a large address space that has been mlocked via -mlockall(), overall this is a fairly rare event. - - -Page Reclaim in shrink_*_list() -------------------------------- - -shrink_active_list() culls any obviously unevictable pages - i.e. -!page_evictable(page) - diverting these to the unevictable list. -However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive lru lists. Note that these pages do not have PageUnevictable -set - otherwise they would be on the unevictable list and shrink_active_list -would never see them. - -Some examples of these unevictable pages on the LRU lists are: - - (1) ramfs pages that have been placed on the LRU lists when first allocated. - - (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to - allocate or fault in the pages in the shared memory region. This happens - when an application accesses the page the first time after SHM_LOCK'ing - the segment. - - (3) mlocked pages that could not be isolated from the LRU and moved to the - unevictable list in mlock_vma_page(). - -shrink_inactive_list() also diverts any unevictable pages that it finds on the -inactive lists to the appropriate zone's unevictable list. 
-
-shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
-after shrink_active_list() had moved them to the inactive list, or pages mapped
-into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
-recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
-but will pass on to shrink_page_list().
-
-shrink_page_list() again culls obviously unevictable pages that it could
-encounter for similar reason to shrink_inactive_list(). Pages mapped into
-VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
-try_to_unmap(). shrink_page_list() will divert them to the unevictable list
-when try_to_unmap() returns SWAP_MLOCK, as discussed above.
diff --git a/Documentation/vm/userfaultfd.rst b/Documentation/vm/userfaultfd.rst
new file mode 100644
index 000000000000..5048cf661a8a
--- /dev/null
+++ b/Documentation/vm/userfaultfd.rst
@@ -0,0 +1,241 @@
+.. _userfaultfd:
+
+===========
+Userfaultfd
+===========
+
+Objective
+=========
+
+Userfaults allow the implementation of on-demand paging from userland
+and, more generally, they allow userland to take control of various
+memory page faults, something otherwise only the kernel code could do.
+
+For example, userfaults allow a proper and more optimal implementation
+of the PROT_NONE+SIGSEGV trick.
+
+Design
+======
+
+Userfaults are delivered and resolved through the userfaultfd syscall.
+
+The userfaultfd (aside from registering and unregistering virtual
+memory ranges) provides two primary functionalities:
+
+1) a read/POLLIN protocol to notify a userland thread of the faults
+   happening
+
+2) various UFFDIO_* ioctls that can manage the virtual memory regions
+   registered in the userfaultfd, allowing userland to efficiently
+   resolve the userfaults it receives via 1) or to manage the virtual
+   memory in the background
+
+The real advantage of userfaults, when compared to regular virtual memory
+management via mremap/mprotect, is that the userfaults in all their
+operations never involve heavyweight structures like vmas (in fact the
+userfaultfd runtime load never takes the mmap_sem for writing).
+
+Vmas are not suitable for page- (or hugepage-) granular fault tracking
+when dealing with virtual address spaces that could span
+terabytes. Too many vmas would be needed for that.
+
+The userfaultfd, once opened by invoking the syscall, can also be
+passed using unix domain sockets to a manager process, so the same
+manager process could handle the userfaults of a multitude of
+different processes without them being aware of what is going on
+(well, of course, unless they later try to use the userfaultfd
+themselves on the same region the manager is already tracking, which
+is a corner case that would currently return -EBUSY).
+
+API
+===
+
+When first opened, the userfaultfd must be enabled by invoking the
+UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
+a later API version), which will specify the read/POLLIN protocol
+userland intends to speak on the UFFD and the uffdio_api.features
+userland requires. The UFFDIO_API ioctl, if successful (i.e. if the
+requested uffdio_api.api is spoken also by the running kernel and the
+requested features are going to be enabled), will return into
+uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of,
+respectively, all the available features of the read(2) protocol and
+the generic ioctls available.
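+
+A minimal open-and-handshake sketch (error handling trimmed; ``uffd_open``
+is an illustrative helper name, not part of the kernel API)::
+
+    #include <fcntl.h>
+    #include <sys/ioctl.h>
+    #include <sys/syscall.h>
+    #include <unistd.h>
+    #include <linux/userfaultfd.h>
+
+    static int uffd_open(void)
+    {
+        struct uffdio_api api = { .api = UFFD_API, .features = 0 };
+        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+
+        if (uffd < 0)
+            return -1;
+        /* Negotiate the API; on success the kernel fills in the
+         * supported features and generic ioctls. */
+        if (ioctl(uffd, UFFDIO_API, &api)) {
+            close(uffd);
+            return -1;
+        }
+        return uffd;
+    }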
+
+The uffdio_api.features bitmask returned by the UFFDIO_API ioctl
+defines what memory types are supported by the userfaultfd and what
+events, other than page fault notifications, may be generated.
+
+If the kernel supports registering userfaultfd ranges on hugetlbfs
+virtual memory areas, UFFD_FEATURE_MISSING_HUGETLBFS will be set in
+uffdio_api.features. Similarly, UFFD_FEATURE_MISSING_SHMEM will be
+set if the kernel supports registering userfaultfd ranges on shared
+memory (covering all shmem APIs, i.e. tmpfs, IPC SHM, /dev/zero with
+MAP_SHARED, memfd_create, etc).
+
+A userland application that wants to use userfaultfd with hugetlbfs
+or shared memory needs to set the corresponding flag in
+uffdio_api.features to enable those features.
+
+If userland desires to receive notifications for events other than
+page faults, it has to verify that uffdio_api.features has the appropriate
+UFFD_FEATURE_EVENT_* bits set. These events are described in more
+detail below in the "Non-cooperative userfaultfd" section.
+
+Once the userfaultfd has been enabled, the UFFDIO_REGISTER ioctl should
+be invoked (if present in the returned uffdio_api.ioctls bitmask) to
+register a memory range in the userfaultfd by setting the
+uffdio_register structure accordingly. The uffdio_register.mode
+bitmask specifies to the kernel which kind of faults to track for
+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
+pages). The UFFDIO_REGISTER ioctl will return the
+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+userfaults on the range registered. Not all ioctls will necessarily be
+supported for all memory types, depending on the underlying virtual
+memory backend (anonymous memory vs tmpfs vs real file-backed
+mappings).
+
+Userland can use the uffdio_register.ioctls to manage the virtual
+address space in the background (to add or potentially also remove
+memory from the userfaultfd registered range). This means a userfault
+could be triggering just before userland maps in the background the
+user-faulted page.
+
+The primary ioctl to resolve userfaults is UFFDIO_COPY. It
+atomically copies a page into the userfault registered range and wakes
+up the blocked userfaults (unless uffdio_copy.mode &
+UFFDIO_COPY_MODE_DONTWAKE is set). The other ioctls work similarly to
+UFFDIO_COPY. They're atomic in the sense that nothing can see a
+half-copied page, since the fault will keep userfaulting until the copy
+has finished.
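+
+Putting the register/read/copy pieces together (a sketch; ``area``/``len``
+describe an already-mmap()ed region, ``page`` is an illustrative page-sized,
+page-aligned source buffer, and error handling is trimmed)::
+
+    #include <poll.h>
+    #include <sys/ioctl.h>
+    #include <unistd.h>
+    #include <linux/userfaultfd.h>
+
+    static void resolve_one_fault(int uffd, void *area, size_t len,
+                                  void *page, size_t page_size)
+    {
+        struct uffdio_register reg = {
+            .range = { .start = (unsigned long)area, .len = len },
+            .mode  = UFFDIO_REGISTER_MODE_MISSING,
+        };
+        struct pollfd pfd = { .fd = uffd, .events = POLLIN };
+        struct uffdio_copy copy;
+        struct uffd_msg msg;
+
+        ioctl(uffd, UFFDIO_REGISTER, &reg);
+
+        /* Block until a fault is reported, then read its address. */
+        poll(&pfd, 1, -1);
+        read(uffd, &msg, sizeof(msg));
+        if (msg.event != UFFD_EVENT_PAGEFAULT)
+            return;
+
+        /* Resolve it: copy a page into place; this also wakes the
+         * blocked faulting thread. */
+        copy.dst  = msg.arg.pagefault.address &
+                    ~(unsigned long long)(page_size - 1);
+        copy.src  = (unsigned long)page;
+        copy.len  = page_size;
+        copy.mode = 0;
+        copy.copy = 0;
+        ioctl(uffd, UFFDIO_COPY, &copy);
+    }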
+
+QEMU/KVM
+========
+
+QEMU/KVM uses the userfaultfd syscall to implement postcopy live
+migration. Postcopy live migration is one form of memory
+externalization, consisting of a virtual machine running with part or
+all of its memory residing on a different node in the cloud. The
+userfaultfd abstraction is generic enough that not a single line of
+KVM kernel code had to be modified in order to add postcopy live
+migration to QEMU.
+
+Guest async page faults, FOLL_NOWAIT and all other GUP features work
+just fine in combination with userfaults. Userfaults trigger async
+page faults in the guest scheduler, so guest processes that
+aren't waiting for userfaults (i.e. network bound) can keep running in
+the guest vcpus.
+
+It is generally beneficial to run one pass of precopy live migration
+just before starting postcopy live migration, in order to avoid
+generating userfaults for readonly guest regions.
+
+The implementation of postcopy live migration currently uses one
+single bidirectional socket, but in the future two different sockets
+will be used (to reduce the latency of the userfaults to the minimum
+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+
+The QEMU in the source node writes all pages that it knows are missing
+in the destination node into the socket, and the migration thread of
+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
+ioctls on the userfaultfd in order to map the received pages into the
+guest (UFFDIO_ZEROPAGE is used if the source page was a zero page).
+
+A different postcopy thread in the destination node listens with
+poll() to the userfaultfd in parallel. When a POLLIN event is
+generated after a userfault triggers, the postcopy thread read()s from
+the userfaultfd and receives the fault address (or -EAGAIN in case the
+userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run
+by the parallel QEMU migration thread).
+
+After the QEMU postcopy thread (running in the destination node) gets
+the userfault address, it writes the information about the missing page
+into the socket. The QEMU source node receives the information and
+roughly "seeks" to that page address and continues sending all
+remaining missing pages from that new page offset. Soon after that
+(just the time to flush the tcp_wmem queue through the network) the
+migration thread in the QEMU running in the destination node will
+receive the page that triggered the userfault and it'll map it as
+usual with UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+was spontaneously sent by the source or if it was an urgent page
+requested through a userfault).
+
+By the time the userfaults start, the QEMU in the destination node
+doesn't need to keep any per-page state bitmap relative to the live
+migration around, and a single per-page bitmap has to be maintained in
+the QEMU running in the source node to know which pages are still
+missing in the destination node. The bitmap in the source node is
+checked to find which missing pages to send in round robin, and we seek
+over it when receiving incoming userfaults. After sending each page, of
+course, the bitmap is updated accordingly. It's also useful to avoid
+sending the same page twice (in case the userfault is read by the
+postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
+thread).
+
+Non-cooperative userfaultfd
+===========================
+
+When the userfaultfd is monitored by an external manager, the manager
+must be able to track changes in the process virtual memory
+layout. Userfaultfd can notify the manager about such changes using
+the same read(2) protocol as for the page fault notifications. The
+manager has to explicitly enable these events by setting the appropriate
+bits in the uffdio_api.features passed to the UFFDIO_API ioctl:
+
+UFFD_FEATURE_EVENT_FORK
+    enable userfaultfd hooks for fork(). When this feature is
+    enabled, the userfaultfd context of the parent process is
+    duplicated into the newly created process. The manager
+    receives UFFD_EVENT_FORK with the file descriptor of the new
+    userfaultfd context in uffd_msg.fork.
+
+UFFD_FEATURE_EVENT_REMAP
+    enable notifications about mremap() calls. When the
+    non-cooperative process moves a virtual memory area to a
+    different location, the manager will receive
+    UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and
+    new addresses of the area and its original length.
+
+UFFD_FEATURE_EVENT_REMOVE
+    enable notifications about madvise(MADV_REMOVE) and
+    madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will
+    be generated upon these calls to madvise(). The uffd_msg.remove
+    will contain the start and end addresses of the removed area.
+
+UFFD_FEATURE_EVENT_UNMAP
+    enable notifications about memory unmapping. The manager will
+    get UFFD_EVENT_UNMAP with uffd_msg.remove containing the start
+    and end addresses of the unmapped area.
+
+Although UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP
+are pretty similar, they differ in the action expected from the
+userfaultfd manager. In the former case, the virtual memory is
+removed, but the area is not: the area remains monitored by the
+userfaultfd, and if a page fault occurs in that area it will be
+delivered to the manager. The proper resolution for such a page fault is
+to zeromap the faulting address. In the latter case, however, when an
+area is unmapped, either explicitly (with the munmap() system call) or
+implicitly (e.g. during mremap()), the area is removed, and in turn the
+userfaultfd context for that area disappears too; the manager will
+not get further userland page faults from the removed area. Still, the
+notification is required in order to prevent the manager from using
+UFFDIO_COPY on the unmapped area.
+
+Unlike userland page faults, which have to be synchronous and require
+explicit or implicit wakeup, all the events are delivered
+asynchronously and the non-cooperative process resumes execution as
+soon as the manager executes read(). The userfaultfd manager should
+carefully synchronize calls to UFFDIO_COPY with the event
+processing. To aid the synchronization, the UFFDIO_COPY ioctl will
+return -ENOSPC when the monitored process exits at the time of
+UFFDIO_COPY, and -ENOENT when the non-cooperative process has changed
+its virtual memory layout simultaneously with an outstanding UFFDIO_COPY
+operation.
+
+The current asynchronous model of event delivery is optimal for
+single-threaded non-cooperative userfaultfd manager implementations. A
+synchronous event delivery model can be added later as a new
+userfaultfd feature to facilitate multithreading enhancements of the
+non-cooperative manager, for example to allow UFFDIO_COPY ioctls to
+run in parallel to the event reception. Single-threaded
+implementations should continue to use the current async event
+delivery model instead.
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
deleted file mode 100644
index 5048cf661a8a..000000000000
--- a/Documentation/vm/userfaultfd.txt
+++ /dev/null
@@ -1,241 +0,0 @@
-.. _userfaultfd:
-
-===========
-Userfaultfd
-===========
-
-Objective
-=========
-
-Userfaults allow the implementation of on-demand paging from userland
-and more generally they allow userland to take control of various
-memory page faults, something otherwise only the kernel code could do.
-
-For example userfaults allows a proper and more optimal implementation
-of the PROT_NONE+SIGSEGV trick.
-
-Design
-======
-
-Userfaults are delivered and resolved through the userfaultfd syscall.
- -The userfaultfd (aside from registering and unregistering virtual -memory ranges) provides two primary functionalities: - -1) read/POLLIN protocol to notify a userland thread of the faults - happening - -2) various UFFDIO_* ioctls that can manage the virtual memory regions - registered in the userfaultfd that allows userland to efficiently - resolve the userfaults it receives via 1) or to manage the virtual - memory in the background - -The real advantage of userfaults if compared to regular virtual memory -management of mremap/mprotect is that the userfaults in all their -operations never involve heavyweight structures like vmas (in fact the -userfaultfd runtime load never takes the mmap_sem for writing). - -Vmas are not suitable for page- (or hugepage) granular fault tracking -when dealing with virtual address spaces that could span -Terabytes. Too many vmas would be needed for that. - -The userfaultfd once opened by invoking the syscall, can also be -passed using unix domain sockets to a manager process, so the same -manager process could handle the userfaults of a multitude of -different processes without them being aware about what is going on -(well of course unless they later try to use the userfaultfd -themselves on the same region the manager is already tracking, which -is a corner case that would currently return -EBUSY). - -API -=== - -When first opened the userfaultfd must be enabled invoking the -UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or -a later API version) which will specify the read/POLLIN protocol -userland intends to speak on the UFFD and the uffdio_api.features -userland requires. The UFFDIO_API ioctl if successful (i.e. if the -requested uffdio_api.api is spoken also by the running kernel and the -requested features are going to be enabled) will return into -uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of -respectively all the available features of the read(2) protocol and -the generic ioctl available. - -The uffdio_api.features bitmask returned by the UFFDIO_API ioctl -defines what memory types are supported by the userfaultfd and what -events, except page fault notifications, may be generated. - -If the kernel supports registering userfaultfd ranges on hugetlbfs -virtual memory areas, UFFD_FEATURE_MISSING_HUGETLBFS will be set in -uffdio_api.features. Similarly, UFFD_FEATURE_MISSING_SHMEM will be -set if the kernel supports registering userfaultfd ranges on shared -memory (covering all shmem APIs, i.e. tmpfs, IPCSHM, /dev/zero -MAP_SHARED, memfd_create, etc). - -The userland application that wants to use userfaultfd with hugetlbfs -or shared memory need to set the corresponding flag in -uffdio_api.features to enable those features. - -If the userland desires to receive notifications for events other than -page faults, it has to verify that uffdio_api.features has appropriate -UFFD_FEATURE_EVENT_* bits set. These events are described in more -detail below in "Non-cooperative userfaultfd" section. - -Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should -be invoked (if present in the returned uffdio_api.ioctls bitmask) to -register a memory range in the userfaultfd by setting the -uffdio_register structure accordingly. The uffdio_register.mode -bitmask will specify to the kernel which kind of faults to track for -the range (UFFDIO_REGISTER_MODE_MISSING would track missing -pages). 
The UFFDIO_REGISTER ioctl will return the -uffdio_register.ioctls bitmask of ioctls that are suitable to resolve -userfaults on the range registered. Not all ioctls will necessarily be -supported for all memory types depending on the underlying virtual -memory backend (anonymous memory vs tmpfs vs real filebacked -mappings). - -Userland can use the uffdio_register.ioctls to manage the virtual -address space in the background (to add or potentially also remove -memory from the userfaultfd registered range). This means a userfault -could be triggering just before userland maps in the background the -user-faulted page. - -The primary ioctl to resolve userfaults is UFFDIO_COPY. That -atomically copies a page into the userfault registered range and wakes -up the blocked userfaults (unless uffdio_copy.mode & -UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to -UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an -half copied page since it'll keep userfaulting until the copy has -finished. - -QEMU/KVM -======== - -QEMU/KVM is using the userfaultfd syscall to implement postcopy live -migration. Postcopy live migration is one form of memory -externalization consisting of a virtual machine running with part or -all of its memory residing on a different node in the cloud. The -userfaultfd abstraction is generic enough that not a single line of -KVM kernel code had to be modified in order to add postcopy live -migration to QEMU. - -Guest async page faults, FOLL_NOWAIT and all other GUP features work -just fine in combination with userfaults. Userfaults trigger async -page faults in the guest scheduler so those guest processes that -aren't waiting for userfaults (i.e. network bound) can keep running in -the guest vcpus. - -It is generally beneficial to run one pass of precopy live migration -just before starting postcopy live migration, in order to avoid -generating userfaults for readonly guest regions. - -The implementation of postcopy live migration currently uses one -single bidirectional socket but in the future two different sockets -will be used (to reduce the latency of the userfaults to the minimum -possible without having to decrease /proc/sys/net/ipv4/tcp_wmem). - -The QEMU in the source node writes all pages that it knows are missing -in the destination node, into the socket, and the migration thread of -the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE -ioctls on the userfaultfd in order to map the received pages into the -guest (UFFDIO_ZEROCOPY is used if the source page was a zero page). - -A different postcopy thread in the destination node listens with -poll() to the userfaultfd in parallel. When a POLLIN event is -generated after a userfault triggers, the postcopy thread read() from -the userfaultfd and receives the fault address (or -EAGAIN in case the -userfault was already resolved and waken by a UFFDIO_COPY|ZEROPAGE run -by the parallel QEMU migration thread). - -After the QEMU postcopy thread (running in the destination node) gets -the userfault address it writes the information about the missing page -into the socket. The QEMU source node receives the information and -roughly "seeks" to that page address and continues sending all -remaining missing pages from that new page offset. 
Soon after that -(just the time to flush the tcp_wmem queue through the network) the -migration thread in the QEMU running in the destination node will -receive the page that triggered the userfault and it'll map it as -usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it -was spontaneously sent by the source or if it was an urgent page -requested through a userfault). - -By the time the userfaults start, the QEMU in the destination node -doesn't need to keep any per-page state bitmap relative to the live -migration around and a single per-page bitmap has to be maintained in -the QEMU running in the source node to know which pages are still -missing in the destination node. The bitmap in the source node is -checked to find which missing pages to send in round robin and we seek -over it when receiving incoming userfaults. After sending each page of -course the bitmap is updated accordingly. It's also useful to avoid -sending the same page twice (in case the userfault is read by the -postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration -thread). - -Non-cooperative userfaultfd -=========================== - -When the userfaultfd is monitored by an external manager, the manager -must be able to track changes in the process virtual memory -layout. Userfaultfd can notify the manager about such changes using -the same read(2) protocol as for the page fault notifications. The -manager has to explicitly enable these events by setting appropriate -bits in uffdio_api.features passed to UFFDIO_API ioctl: - -UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When this feature is - enabled, the userfaultfd context of the parent process is - duplicated into the newly created process. The manager - receives UFFD_EVENT_FORK with file descriptor of the new - userfaultfd context in the uffd_msg.fork. - -UFFD_FEATURE_EVENT_REMAP - enable notifications about mremap() calls. When the - non-cooperative process moves a virtual memory area to a - different location, the manager will receive - UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and - new addresses of the area and its original length. - -UFFD_FEATURE_EVENT_REMOVE - enable notifications about madvise(MADV_REMOVE) and - madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will - be generated upon these calls to madvise. The uffd_msg.remove - will contain start and end addresses of the removed area. - -UFFD_FEATURE_EVENT_UNMAP - enable notifications about memory unmapping. The manager will - get UFFD_EVENT_UNMAP with uffd_msg.remove containing start and - end addresses of the unmapped area. - -Although the UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP -are pretty similar, they quite differ in the action expected from the -userfaultfd manager. In the former case, the virtual memory is -removed, but the area is not, the area remains monitored by the -userfaultfd, and if a page fault occurs in that area it will be -delivered to the manager. The proper resolution for such page fault is -to zeromap the faulting address. However, in the latter case, when an -area is unmapped, either explicitly (with munmap() system call), or -implicitly (e.g. during mremap()), the area is removed and in turn the -userfaultfd context for such area disappears too and the manager will -not get further userland page faults from the removed area. Still, the -notification is required in order to prevent manager from using -UFFDIO_COPY on the unmapped area. 
-
-Unlike userland page faults, which have to be synchronous and require
-an explicit or implicit wakeup, all the events are delivered
-asynchronously and the non-cooperative process resumes execution as
-soon as the manager executes read(). The userfaultfd manager should
-carefully synchronize calls to UFFDIO_COPY with the event
-processing. To aid the synchronization, the UFFDIO_COPY ioctl will
-return -ENOSPC when the monitored process exits at the time of
-UFFDIO_COPY, and -ENOENT when the non-cooperative process has changed
-its virtual memory layout simultaneously with an outstanding
-UFFDIO_COPY operation.
-
-The current asynchronous model of event delivery is optimal for
-single-threaded non-cooperative userfaultfd manager implementations. A
-synchronous event delivery model can be added later as a new
-userfaultfd feature, to facilitate multithreading enhancements of the
-non-cooperative manager, for example to allow UFFDIO_COPY ioctls to
-run in parallel with the event reception. Single-threaded
-implementations should continue to use the current async event
-delivery model instead.
diff --git a/Documentation/vm/z3fold.rst b/Documentation/vm/z3fold.rst
new file mode 100644
index 000000000000..224e3c61d686
--- /dev/null
+++ b/Documentation/vm/z3fold.rst
@@ -0,0 +1,30 @@
+.. _z3fold:
+
+======
+z3fold
+======
+
+z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page.
+It is a zbud derivative which allows for higher compression
+ratio keeping the simplicity and determinism of its predecessor.
+
+The main differences between z3fold and zbud are:
+
+* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
+* z3fold can hold up to 3 compressed pages in its page
+* z3fold doesn't export any API itself and is thus intended to be used
+  via the zpool API.
+
+To keep the determinism and simplicity, z3fold, just like zbud, always
+stores an integral number of compressed pages per page, but it can store
+up to 3 pages unlike zbud which can store at most 2. Therefore the
+compression ratio goes to around 2.7x while zbud's one is around 1.7x.
+
+Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
+return a dereferenceable pointer. Instead, it returns an unsigned long
+handle which encodes actual location of the allocated object.
+
+Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
+depend on MMU enabled and provides more predictable reclaim behavior
+which makes it a better fit for small and response-critical systems.
diff --git a/Documentation/vm/z3fold.txt b/Documentation/vm/z3fold.txt
deleted file mode 100644
index 224e3c61d686..000000000000
--- a/Documentation/vm/z3fold.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-.. _z3fold:
-
-======
-z3fold
-======
-
-z3fold is a special purpose allocator for storing compressed pages.
-It is designed to store up to three compressed pages per physical page.
-It is a zbud derivative which allows for higher compression
-ratio keeping the simplicity and determinism of its predecessor.
-
-The main differences between z3fold and zbud are:
-
-* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
-* z3fold can hold up to 3 compressed pages in its page
-* z3fold doesn't export any API itself and is thus intended to be used
-  via the zpool API.
-
-To keep the determinism and simplicity, z3fold, just like zbud, always
-stores an integral number of compressed pages per page, but it can store
-up to 3 pages unlike zbud which can store at most 2.
Therefore the -compression ratio goes to around 2.7x while zbud's one is around 1.7x. - -Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not -return a dereferenceable pointer. Instead, it returns an unsigned long -handle which encodes actual location of the allocated object. - -Keeping effective compression ratio close to zsmalloc's, z3fold doesn't -depend on MMU enabled and provides more predictable reclaim behavior -which makes it a better fit for small and response-critical systems. diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/vm/zsmalloc.rst new file mode 100644 index 000000000000..6e79893d6132 --- /dev/null +++ b/Documentation/vm/zsmalloc.rst @@ -0,0 +1,82 @@ +.. _zsmalloc: + +======== +zsmalloc +======== + +This allocator is designed for use with zram. Thus, the allocator is +supposed to work well under low memory conditions. In particular, it +never attempts higher order page allocation which is very likely to +fail under memory pressure. On the other hand, if we just use single +(0-order) pages, it would suffer from very high fragmentation -- +any object of size PAGE_SIZE/2 or larger would occupy an entire page. +This was one of the major issues with its predecessor (xvmalloc). + +To overcome these issues, zsmalloc allocates a bunch of 0-order pages +and links them together using various 'struct page' fields. These linked +pages act as a single higher-order page i.e. an object can span 0-order +page boundaries. The code refers to these linked pages as a single entity +called zspage. + +For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE +since this satisfies the requirements of all its current users (in the +worst case, page is incompressible and is thus stored "as-is" i.e. in +uncompressed form). For allocation requests larger than this size, failure +is returned (see zs_malloc). + +Additionally, zs_malloc() does not return a dereferenceable pointer. +Instead, it returns an opaque handle (unsigned long) which encodes actual +location of the allocated object. The reason for this indirection is that +zsmalloc does not keep zspages permanently mapped since that would cause +issues on 32-bit systems where the VA region for kernel space mappings +is very small. So, before using the allocating memory, the object has to +be mapped using zs_map_object() to get a usable pointer and subsequently +unmapped using zs_unmap_object(). + +stat +==== + +With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via +``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: + + # cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + ... + ... + 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + ... + ... 
+ + +class + index +size + object size zspage stores +almost_empty + the number of ZS_ALMOST_EMPTY zspages(see below) +almost_full + the number of ZS_ALMOST_FULL zspages(see below) +obj_allocated + the number of objects allocated +obj_used + the number of objects allocated to the user +pages_used + the number of pages allocated for the class +pages_per_zspage + the number of 0-order pages to make a zspage + +We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where + +* n = number of allocated objects +* N = total number of objects zspage can store +* f = fullness_threshold_frac(ie, 4 at the moment) + +Similarly, we assign zspage to: + +* ZS_ALMOST_FULL when n > N / f +* ZS_EMPTY when n == 0 +* ZS_FULL when n == N diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.txt deleted file mode 100644 index 6e79893d6132..000000000000 --- a/Documentation/vm/zsmalloc.txt +++ /dev/null @@ -1,82 +0,0 @@ -.. _zsmalloc: - -======== -zsmalloc -======== - -This allocator is designed for use with zram. Thus, the allocator is -supposed to work well under low memory conditions. In particular, it -never attempts higher order page allocation which is very likely to -fail under memory pressure. On the other hand, if we just use single -(0-order) pages, it would suffer from very high fragmentation -- -any object of size PAGE_SIZE/2 or larger would occupy an entire page. -This was one of the major issues with its predecessor (xvmalloc). - -To overcome these issues, zsmalloc allocates a bunch of 0-order pages -and links them together using various 'struct page' fields. These linked -pages act as a single higher-order page i.e. an object can span 0-order -page boundaries. The code refers to these linked pages as a single entity -called zspage. - -For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE -since this satisfies the requirements of all its current users (in the -worst case, page is incompressible and is thus stored "as-is" i.e. in -uncompressed form). For allocation requests larger than this size, failure -is returned (see zs_malloc). - -Additionally, zs_malloc() does not return a dereferenceable pointer. -Instead, it returns an opaque handle (unsigned long) which encodes actual -location of the allocated object. The reason for this indirection is that -zsmalloc does not keep zspages permanently mapped since that would cause -issues on 32-bit systems where the VA region for kernel space mappings -is very small. So, before using the allocating memory, the object has to -be mapped using zs_map_object() to get a usable pointer and subsequently -unmapped using zs_unmap_object(). - -stat -==== - -With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via -``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: - - # cat /sys/kernel/debug/zsmalloc/zram0/classes - - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage - ... - ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 - ... - ... 
- - -class - index -size - object size zspage stores -almost_empty - the number of ZS_ALMOST_EMPTY zspages(see below) -almost_full - the number of ZS_ALMOST_FULL zspages(see below) -obj_allocated - the number of objects allocated -obj_used - the number of objects allocated to the user -pages_used - the number of pages allocated for the class -pages_per_zspage - the number of 0-order pages to make a zspage - -We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where - -* n = number of allocated objects -* N = total number of objects zspage can store -* f = fullness_threshold_frac(ie, 4 at the moment) - -Similarly, we assign zspage to: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N diff --git a/Documentation/vm/zswap.rst b/Documentation/vm/zswap.rst new file mode 100644 index 000000000000..1444ecd40911 --- /dev/null +++ b/Documentation/vm/zswap.rst @@ -0,0 +1,135 @@ +.. _zswap: + +===== +zswap +===== + +Overview +======== + +Zswap is a lightweight compressed cache for swap pages. It takes pages that are +in the process of being swapped out and attempts to compress them into a +dynamically allocated RAM-based memory pool. zswap basically trades CPU cycles +for potentially reduced swap I/O.  This trade-off can also result in a +significant performance improvement if reads from the compressed cache are +faster than reads from a swap device. + +.. note:: + Zswap is a new feature as of v3.11 and interacts heavily with memory + reclaim. This interaction has not been fully explored on the large set of + potential configurations and workloads that exist. For this reason, zswap + is a work in progress and should be considered experimental. + + Some potential benefits: + +* Desktop/laptop users with limited RAM capacities can mitigate the + performance impact of swapping. +* Overcommitted guests that share a common I/O resource can + dramatically reduce their swap I/O pressure, avoiding heavy handed I/O + throttling by the hypervisor. This allows more work to get done with less + impact to the guest workload and guests sharing the I/O subsystem +* Users with SSDs as swap devices can extend the life of the device by + drastically reducing life-shortening writes. + +Zswap evicts pages from compressed cache on an LRU basis to the backing swap +device when the compressed pool reaches its size limit. This requirement had +been identified in prior community discussions. + +Zswap is disabled by default but can be enabled at boot time by setting +the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``. Zswap +can also be enabled and disabled at runtime using the sysfs interface. +An example command to enable zswap at runtime, assuming sysfs is mounted +at ``/sys``, is:: + + echo 1 > /sys/module/zswap/parameters/enabled + +When zswap is disabled at runtime it will stop storing pages that are +being swapped out. However, it will _not_ immediately write out or fault +back into memory all of the pages stored in the compressed pool. The +pages stored in zswap will remain in the compressed pool until they are +either invalidated or faulted back into memory. In order to force all +pages out of the compressed pool, a swapoff on the swap device(s) will +fault back into memory all swapped out pages, including those in the +compressed pool. 
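The Design section below centres on zpool's handle indirection, which
can be sketched in kernel C. This is a hedged sketch under assumptions,
not actual zswap code: store_compressed() is a made-up helper, and the
eviction and error handling a real store path needs are omitted::

	#include <linux/gfp.h>
	#include <linux/string.h>
	#include <linux/zpool.h>

	/* Store 'len' compressed bytes behind an opaque zpool handle. */
	static int store_compressed(struct zpool *pool, const void *buf,
				    size_t len, unsigned long *handle)
	{
		void *dst;
		int ret;

		ret = zpool_malloc(pool, len, GFP_KERNEL, handle);
		if (ret)
			return ret;

		/* Handles are not pointers: map before touching... */
		dst = zpool_map_handle(pool, *handle, ZPOOL_MM_WO);
		memcpy(dst, buf, len);
		/* ...and unmap promptly afterwards. */
		zpool_unmap_handle(pool, *handle);
		return 0;
	}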
+ +Design +====== + +Zswap receives pages for compression through the Frontswap API and is able to +evict pages from its own compressed pool on an LRU basis and write them back to +the backing swap device in the case that the compressed pool is full. + +Zswap makes use of zpool for the managing the compressed memory pool. Each +allocation in zpool is not directly accessible by address. Rather, a handle is +returned by the allocation routine and that handle must be mapped before being +accessed. The compressed memory pool grows on demand and shrinks as compressed +pages are freed. The pool is not preallocated. By default, a zpool +of type zbud is created, but it can be selected at boot time by +setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can +also be changed at runtime using the sysfs ``zpool`` attribute, e.g.:: + + echo zbud > /sys/module/zswap/parameters/zpool + +The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which +means the compression ratio will always be 2:1 or worse (because of half-full +zbud pages). The zsmalloc type zpool has a more complex compressed page +storage method, and it can achieve greater storage densities. However, +zsmalloc does not implement compressed page eviction, so once zswap fills it +cannot evict the oldest page, it can only reject new pages. + +When a swap page is passed from frontswap to zswap, zswap maintains a mapping +of the swap entry, a combination of the swap type and swap offset, to the zpool +handle that references that compressed swap page. This mapping is achieved +with a red-black tree per swap type. The swap offset is the search key for the +tree nodes. + +During a page fault on a PTE that is a swap entry, frontswap calls the zswap +load function to decompress the page into the page allocated by the page fault +handler. + +Once there are no PTEs referencing a swap page stored in zswap (i.e. the count +in the swap_map goes to 0) the swap code calls the zswap invalidate function, +via frontswap, to free the compressed entry. + +Zswap seeks to be simple in its policies. Sysfs attributes allow for one user +controlled policy: + +* max_pool_percent - The maximum percentage of memory that the compressed + pool can occupy. + +The default compressor is lzo, but it can be selected at boot time by +setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``. +It can also be changed at runtime using the sysfs "compressor" +attribute, e.g.:: + + echo lzo > /sys/module/zswap/parameters/compressor + +When the zpool and/or compressor parameter is changed at runtime, any existing +compressed pages are not modified; they are left in their own zpool. When a +request is made for a page in an old zpool, it is uncompressed using its +original compressor. Once all pages are removed from an old zpool, the zpool +and its compressor are freed. + +Some of the pages in zswap are same-value filled pages (i.e. contents of the +page have same value or repetitive pattern). These pages include zero-filled +pages and they are handled differently. During store operation, a page is +checked if it is a same-value filled page before compressing it. If true, the +compressed length of the page is set to zero and the pattern or same-filled +value is stored. + +Same-value filled pages identification feature is enabled by default and can be +disabled at boot time by setting the ``same_filled_pages_enabled`` attribute +to 0, e.g. ``zswap.same_filled_pages_enabled=0``. 
It can also be enabled and +disabled at runtime using the sysfs ``same_filled_pages_enabled`` +attribute, e.g.:: + + echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled + +When zswap same-filled page identification is disabled at runtime, it will stop +checking for the same-value filled pages during store operation. However, the +existing pages which are marked as same-value filled pages remain stored +unchanged in zswap until they are either loaded or invalidated. + +A debugfs interface is provided for various statistic about pool size, number +of pages stored, same-value filled pages and various counters for the reasons +pages are rejected. diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt deleted file mode 100644 index 1444ecd40911..000000000000 --- a/Documentation/vm/zswap.txt +++ /dev/null @@ -1,135 +0,0 @@ -.. _zswap: - -===== -zswap -===== - -Overview -======== - -Zswap is a lightweight compressed cache for swap pages. It takes pages that are -in the process of being swapped out and attempts to compress them into a -dynamically allocated RAM-based memory pool. zswap basically trades CPU cycles -for potentially reduced swap I/O.  This trade-off can also result in a -significant performance improvement if reads from the compressed cache are -faster than reads from a swap device. - -.. note:: - Zswap is a new feature as of v3.11 and interacts heavily with memory - reclaim. This interaction has not been fully explored on the large set of - potential configurations and workloads that exist. For this reason, zswap - is a work in progress and should be considered experimental. - - Some potential benefits: - -* Desktop/laptop users with limited RAM capacities can mitigate the - performance impact of swapping. -* Overcommitted guests that share a common I/O resource can - dramatically reduce their swap I/O pressure, avoiding heavy handed I/O - throttling by the hypervisor. This allows more work to get done with less - impact to the guest workload and guests sharing the I/O subsystem -* Users with SSDs as swap devices can extend the life of the device by - drastically reducing life-shortening writes. - -Zswap evicts pages from compressed cache on an LRU basis to the backing swap -device when the compressed pool reaches its size limit. This requirement had -been identified in prior community discussions. - -Zswap is disabled by default but can be enabled at boot time by setting -the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``. Zswap -can also be enabled and disabled at runtime using the sysfs interface. -An example command to enable zswap at runtime, assuming sysfs is mounted -at ``/sys``, is:: - - echo 1 > /sys/module/zswap/parameters/enabled - -When zswap is disabled at runtime it will stop storing pages that are -being swapped out. However, it will _not_ immediately write out or fault -back into memory all of the pages stored in the compressed pool. The -pages stored in zswap will remain in the compressed pool until they are -either invalidated or faulted back into memory. In order to force all -pages out of the compressed pool, a swapoff on the swap device(s) will -fault back into memory all swapped out pages, including those in the -compressed pool. - -Design -====== - -Zswap receives pages for compression through the Frontswap API and is able to -evict pages from its own compressed pool on an LRU basis and write them back to -the backing swap device in the case that the compressed pool is full. 
- -Zswap makes use of zpool for the managing the compressed memory pool. Each -allocation in zpool is not directly accessible by address. Rather, a handle is -returned by the allocation routine and that handle must be mapped before being -accessed. The compressed memory pool grows on demand and shrinks as compressed -pages are freed. The pool is not preallocated. By default, a zpool -of type zbud is created, but it can be selected at boot time by -setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can -also be changed at runtime using the sysfs ``zpool`` attribute, e.g.:: - - echo zbud > /sys/module/zswap/parameters/zpool - -The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which -means the compression ratio will always be 2:1 or worse (because of half-full -zbud pages). The zsmalloc type zpool has a more complex compressed page -storage method, and it can achieve greater storage densities. However, -zsmalloc does not implement compressed page eviction, so once zswap fills it -cannot evict the oldest page, it can only reject new pages. - -When a swap page is passed from frontswap to zswap, zswap maintains a mapping -of the swap entry, a combination of the swap type and swap offset, to the zpool -handle that references that compressed swap page. This mapping is achieved -with a red-black tree per swap type. The swap offset is the search key for the -tree nodes. - -During a page fault on a PTE that is a swap entry, frontswap calls the zswap -load function to decompress the page into the page allocated by the page fault -handler. - -Once there are no PTEs referencing a swap page stored in zswap (i.e. the count -in the swap_map goes to 0) the swap code calls the zswap invalidate function, -via frontswap, to free the compressed entry. - -Zswap seeks to be simple in its policies. Sysfs attributes allow for one user -controlled policy: - -* max_pool_percent - The maximum percentage of memory that the compressed - pool can occupy. - -The default compressor is lzo, but it can be selected at boot time by -setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``. -It can also be changed at runtime using the sysfs "compressor" -attribute, e.g.:: - - echo lzo > /sys/module/zswap/parameters/compressor - -When the zpool and/or compressor parameter is changed at runtime, any existing -compressed pages are not modified; they are left in their own zpool. When a -request is made for a page in an old zpool, it is uncompressed using its -original compressor. Once all pages are removed from an old zpool, the zpool -and its compressor are freed. - -Some of the pages in zswap are same-value filled pages (i.e. contents of the -page have same value or repetitive pattern). These pages include zero-filled -pages and they are handled differently. During store operation, a page is -checked if it is a same-value filled page before compressing it. If true, the -compressed length of the page is set to zero and the pattern or same-filled -value is stored. - -Same-value filled pages identification feature is enabled by default and can be -disabled at boot time by setting the ``same_filled_pages_enabled`` attribute -to 0, e.g. ``zswap.same_filled_pages_enabled=0``. 
It can also be enabled and -disabled at runtime using the sysfs ``same_filled_pages_enabled`` -attribute, e.g.:: - - echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled - -When zswap same-filled page identification is disabled at runtime, it will stop -checking for the same-value filled pages during store operation. However, the -existing pages which are marked as same-value filled pages remain stored -unchanged in zswap until they are either loaded or invalidated. - -A debugfs interface is provided for various statistic about pool size, number -of pages stored, same-value filled pages and various counters for the reasons -pages are rejected. diff --git a/MAINTAINERS b/MAINTAINERS index 3bdc260e36b7..575849a8343e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15406,7 +15406,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/zsmalloc.c F: include/linux/zsmalloc.h -F: Documentation/vm/zsmalloc.txt +F: Documentation/vm/zsmalloc.rst ZSWAP COMPRESSED SWAP CACHING M: Seth Jennings diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index e96adcbcab41..f53e5060afe7 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -584,7 +584,7 @@ config ARCH_DISCONTIGMEM_ENABLE Say Y to support efficient handling of discontiguous physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See for more. + See for more. source "mm/Kconfig" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bbe12a038d21..3ac9bf4cc2a0 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -397,7 +397,7 @@ config ARCH_DISCONTIGMEM_ENABLE Say Y to support efficient handling of discontiguous physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See for more. + See for more. config ARCH_FLATMEM_ENABLE def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 8128c3b68d6b..4562810857eb 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2551,7 +2551,7 @@ config ARCH_DISCONTIGMEM_ENABLE Say Y to support efficient handling of discontiguous physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See for more. + See for more. config ARCH_SPARSEMEM_ENABLE bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 73ce5dd07642..f8c0f10949ea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -880,7 +880,7 @@ config PPC_MEM_KEYS page-based protections, but without requiring modification of the page tables when an application changes protection domains. - For details, see Documentation/vm/protection-keys.txt + For details, see Documentation/vm/protection-keys.rst If unsure, say y. diff --git a/fs/Kconfig b/fs/Kconfig index bc821a86d965..ba53dc2a9691 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -196,7 +196,7 @@ config HUGETLBFS help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read - for details. + for details. If unsure, say N. diff --git a/fs/dax.c b/fs/dax.c index 0276df90e86c..0eb65c34d5a6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -618,7 +618,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, * downgrading page table protection not changing it to point * to a new page. 
* - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ec6d2983a5cb..91d14c4ac04a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -956,7 +956,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the - * Documentation/vm/soft-dirty.txt for full description + * Documentation/vm/soft-dirty.rst for full description * of how soft-dirty works. */ pte_t ptent = *pte; @@ -1436,7 +1436,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bits 57-60 zero * Bit 61 page is file-page or shared-anon diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 325017ad9311..77be87c095f2 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -16,7 +16,7 @@ /* * Heterogeneous Memory Management (HMM) * - * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it + * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it * is for. Here we focus on the HMM API description, with some explanation of * the underlying implementation. * diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 7b4899c06f49..74ea5e2310a8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -45,7 +45,7 @@ struct vmem_altmap { * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in - * include/linux/hmm.h and Documentation/vm/hmm.txt. + * include/linux/hmm.h and Documentation/vm/hmm.rst. * * MEMORY_DEVICE_PUBLIC: * Device memory that is cache coherent from device and CPU point of view. This @@ -67,7 +67,7 @@ enum memory_type { * page_free() * * Additional notes about MEMORY_DEVICE_PRIVATE may be found in - * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief + * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief * explanation in include/linux/memory_hotplug.h. * * The page_fault() callback must migrate page back, from device memory to diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2d07a1ed5a31..392e6af82701 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -174,7 +174,7 @@ struct mmu_notifier_ops { * invalidate_range_start()/end() notifiers, as * invalidate_range() alread catches the points in time when an * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/vm/mmu_notifier.txt + * discussion on this see Documentation/vm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 1149533aa2fa..df2c7d11f496 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -28,7 +28,7 @@ extern struct mm_struct *mm_alloc(void); * * Use mmdrop() to release the reference acquired by mmgrab(). 
* - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) @@ -51,7 +51,7 @@ extern void mmdrop(struct mm_struct *mm); * * Use mmput() to release the reference acquired by mmget(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7b6a59f722a3..4003973deff4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -53,7 +53,7 @@ static inline int current_is_kswapd(void) /* * Unaddressable device memory support. See include/linux/hmm.h and - * Documentation/vm/hmm.txt. Short description is we need struct pages for + * Documentation/vm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * diff --git a/mm/Kconfig b/mm/Kconfig index c782e8fb7235..b9f04213a353 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -312,7 +312,7 @@ config KSM the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.txt for more information: KSM is inactive + See Documentation/vm/ksm.rst for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). @@ -537,7 +537,7 @@ config MEM_SOFT_DIRTY into a page just as regular dirty bit, but unlike the latter it can be cleared by hands. - See Documentation/vm/soft-dirty.txt for more details. + See Documentation/vm/soft-dirty.rst for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" @@ -664,7 +664,7 @@ config IDLE_PAGE_TRACKING be useful to tune memory cgroup limits and/or for job placement within a compute cluster. - See Documentation/vm/idle_page_tracking.txt for more details. + See Documentation/vm/idle_page_tracking.rst for more details. # arch_add_memory() comprehends device memory config ARCH_HAS_ZONE_DEVICE diff --git a/mm/cleancache.c b/mm/cleancache.c index f7b9fdc79d97..126548b5a292 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -3,7 +3,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.txt for more information. + * Documentation/vm/cleancache.rst for more information. * * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. * Author: Dan Magenheimer diff --git a/mm/frontswap.c b/mm/frontswap.c index fec8b5044040..4f5476a0f955 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -3,7 +3,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of frontswap. See - * Documentation/vm/frontswap.txt for more information. + * Documentation/vm/frontswap.rst for more information. * * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. 
* Author: Dan Magenheimer diff --git a/mm/hmm.c b/mm/hmm.c index 320545b98ff5..af176c6820cf 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -37,7 +37,7 @@ #if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) /* - * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h + * Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h */ DEFINE_STATIC_KEY_FALSE(device_private_key); EXPORT_SYMBOL(device_private_key); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 87ab9b8f56b5..6d5911673450 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1185,7 +1185,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, * mmu_notifier_invalidate_range_end() happens which can lead to a * device seeing memory write in different order than CPU. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); @@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * replacing a zero pmd write protected page with a zero pte write * protected page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ pmdp_huge_clear_flush(vma, haddr, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c204e3d132b..5af974abae46 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3289,7 +3289,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); } @@ -4355,7 +4355,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); diff --git a/mm/ksm.c b/mm/ksm.c index 293721f5da70..0b88698a9014 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1049,7 +1049,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* @@ -1138,7 +1138,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are replacing a read only page with another * read only page with the same content. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); diff --git a/mm/mmap.c b/mm/mmap.c index 9efdc021ad22..39fc51d1639c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2769,7 +2769,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. 
See Documentation/vm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) diff --git a/mm/rmap.c b/mm/rmap.c index 47db27f8049e..854b703fbe2a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -942,7 +942,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * downgrading page table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ if (ret) (*cleaned)++; @@ -1587,7 +1587,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * point at new page while a device still is using this * page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(page)); } @@ -1597,7 +1597,7 @@ discard: * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ page_remove_rmap(subpage, PageHuge(page)); put_page(page); diff --git a/mm/util.c b/mm/util.c index c1250501364f..e857c80c6f4a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -609,7 +609,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. -- cgit v1.2.3 From d1361840f8c519eaee9a78ffe09e4f0a1b586846 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:35 -0700 Subject: tcp: fix SO_RCVLOWAT and RCVBUF autotuning Applications might use SO_RCVLOWAT on TCP socket hoping to receive one [E]POLLIN event only when a given amount of bytes are ready in socket receive queue. Problem is that receive autotuning is not aware of this constraint, meaning sk_rcvbuf might be too small to allow all bytes to be stored. Add a new (struct proto_ops)->set_rcvlowat method so that a protocol can override the default setsockopt(SO_RCVLOWAT) behavior. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/net.h | 1 + include/net/tcp.h | 1 + net/core/sock.c | 5 ++++- net/ipv4/af_inet.c | 1 + net/ipv4/tcp.c | 21 +++++++++++++++++++++ net/ipv6/af_inet6.c | 1 + 6 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 2248a052061d..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -197,6 +197,7 @@ struct proto_ops { int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); + int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ diff --git a/include/net/tcp.h b/include/net/tcp.h index 9c9b3768b350..b2318242cad8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -402,6 +402,7 @@ void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); +int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/core/sock.c b/net/core/sock.c index 6444525f610c..b2c3db169ca1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -905,7 +905,10 @@ set_rcvbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + if (sock->ops->set_rcvlowat) + ret = sock->ops->set_rcvlowat(sk, val); + else + sk->sk_rcvlowat = val ? : 1; break; case SO_RCVTIMEO: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaed0367e669..f5c562aaef35 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1006,6 +1006,7 @@ const struct proto_ops inet_stream_ops = { .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bccc4c270087..0abd8d1d3d1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1701,6 +1701,27 @@ int tcp_peek_len(struct socket *sock) } EXPORT_SYMBOL(tcp_peek_len); +/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ +int tcp_set_rcvlowat(struct sock *sk, int val) +{ + sk->sk_rcvlowat = val ? 
: 1; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + return 0; + + /* val comes from user space and might be close to INT_MAX */ + val <<= 1; + if (val < 0) + val = INT_MAX; + + val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + if (val > sk->sk_rcvbuf) { + sk->sk_rcvbuf = val; + tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + } + return 0; +} +EXPORT_SYMBOL(tcp_set_rcvlowat); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8da0b513f188..e70d59fb26e1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -590,6 +590,7 @@ const struct proto_ops inet6_stream_ops = { .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; const struct proto_ops inet6_dgram_ops = { -- cgit v1.2.3 From 03f45c883c6f391ed4fff8292415b35bd1107519 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:37 -0700 Subject: tcp: avoid extra wakeups for SO_RCVLOWAT users SO_RCVLOWAT is properly handled in tcp_poll(), so that POLLIN is only generated when enough bytes are available in receive queue, after David change (commit c7004482e8dc "tcp: Respect SO_RCVLOWAT in tcp_poll().") But TCP still calls sk->sk_data_ready() for each chunk added in receive queue, meaning thread is awaken, and goes back to sleep shortly after. Tested: tcp_mmap test program, receiving 32768 MB of data with SO_RCVLOWAT set to 512KB -> Should get ~2 wakeups (c-switches) per MB, regardless of how many (tiny or big) packets were received. High speed (mostly full size GRO packets) received 32768 MB (100 % mmap'ed) in 8.03112 s, 34.2266 Gbit, cpu usage user:0.037 sys:1.404, 43.9758 usec per MB, 65497 c-switches received 32768 MB (99.9954 % mmap'ed) in 7.98453 s, 34.4263 Gbit, cpu usage user:0.03 sys:1.422, 44.3115 usec per MB, 65485 c-switches Low speed (sender is ratelimited and sends 1-MSS at a time, so GRO is not helping) received 22474.5 MB (100 % mmap'ed) in 6015.35 s, 0.0313414 Gbit, cpu usage user:0.05 sys:1.586, 72.7952 usec per MB, 44950 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 15 +++++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b2318242cad8..0ee85c47c185 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +void tcp_data_ready(struct sock *sk); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0abd8d1d3d1d..c768d306b657 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1705,6 +1705,10 @@ EXPORT_SYMBOL(tcp_peek_len); int tcp_set_rcvlowat(struct sock *sk, int val) { sk->sk_rcvlowat = val ? 
: 1; + + /* Check if we need to signal EPOLLIN right now */ + tcp_data_ready(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d854363a4387..f93687f97d80 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4576,6 +4576,17 @@ err: } +void tcp_data_ready(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + int avail = tp->rcv_nxt - tp->copied_seq; + + if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) + return; + + sk->sk_data_ready(sk); +} + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4633,7 +4644,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } @@ -5434,7 +5445,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } } -- cgit v1.2.3 From 93ab6cc69162775201587cc9da00d5016dc890e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:38 -0700 Subject: tcp: implement mmap() for zero copy receive Some networks can make sure TCP payload can exactly fit 4KB pages, with well chosen MSS/MTU and architectures. Implement mmap() system call so that applications can avoid copying data without complex splice() games. Note that a successful mmap( X bytes) on TCP socket is consuming bytes, as if recvmsg() has been done. (tp->copied += X) Only PROT_READ mappings are accepted, as skb page frags are fundamentally shared and read only. If tcp_mmap() finds data that is not a full page, or a patch of urgent data, -EINVAL is returned, no bytes are consumed. Application must fallback to recvmsg() to read the problematic sequence. mmap() wont block, regardless of socket being in blocking or non-blocking mode. If not enough bytes are in receive queue, mmap() would return -EAGAIN, or -EIO if socket is in a state where no other bytes can be added into receive queue. An application might use SO_RCVLOWAT, poll() and/or ioctl( FIONREAD) to efficiently use mmap() On the sender side, MSG_EOR might help to clearly separate unaligned headers and 4K-aligned chunks if necessary. Tested: mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch. 
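For orientation, the receive path this enables looks roughly like the
hedged sketch below; it is illustrative, not the tcp_mmap source, and
rx_chunk() plus the application-defined consume() sink are made-up
names:

	#include <errno.h>
	#include <sys/mman.h>
	#include <sys/socket.h>

	extern void consume(const void *buf, size_t len);

	/* Map 'chunk' payload bytes straight from the socket; fall back
	 * to copying when the data is not page-aligned (EINVAL) or has
	 * not fully arrived yet (EAGAIN). */
	static ssize_t rx_chunk(int fd, size_t chunk, char *fallback)
	{
		void *p = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);

		if (p != MAP_FAILED) {
			consume(p, chunk);	/* zero copy */
			munmap(p, chunk);	/* pages return to the kernel */
			return chunk;
		}
		if (errno == EINVAL || errno == EAGAIN)
			return recv(fd, fallback, chunk, 0);
		return -1;
	}

The measurements that follow were taken with the real tcp_mmap tool.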
MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header) Without mmap() (tcp_mmap -s) received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit, cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit, cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit, cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit, cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches With mmap() on receiver (tcp_mmap -s -z) received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit, cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit, cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit, cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit, cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 + net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/af_inet6.c | 2 +- 4 files changed, 117 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ee85c47c185..833154e3df17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f5c562aaef35..3ebf599cebae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = { .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c768d306b657..438fbca96cd3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +/* When user wants to mmap X pages, we first need to perform the mapping + * before freeing any skbs in receive queue, otherwise user would be unable + * to fallback to standard recvmsg(). This happens if some data in the + * requested block is not exactly fitting in a page. + * + * We only support order-0 pages for the moment. + * mmap() on TCP is very strict, there is no point + * trying to accommodate with pathological layouts. 
+ */ +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + unsigned int nr_pages = size >> PAGE_SHIFT; + struct page **pages_array = NULL; + u32 seq, len, offset, nr = 0; + struct sock *sk = sock->sk; + const skb_frag_t *frags; + struct tcp_sock *tp; + struct sk_buff *skb; + int ret; + + if (vma->vm_pgoff || !nr_pages) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* TODO: Maybe the following is not needed if pages are COW */ + vma->vm_flags &= ~VM_MAYWRITE; + + lock_sock(sk); + + ret = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + sock_rps_record_flow(sk); + + if (tcp_inq(sk) < size) { + ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN; + goto out; + } + tp = tcp_sk(sk); + seq = tp->copied_seq; + /* Abort if urgent data is in the area */ + if (unlikely(tp->urg_data)) { + u32 urg_offset = tp->urg_seq - seq; + + ret = -EINVAL; + if (urg_offset < size) + goto out; + } + ret = -ENOMEM; + pages_array = kvmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!pages_array) + goto out; + skb = tcp_recv_skb(sk, seq, &offset); + ret = -EINVAL; +skb_start: + /* We do not support anything not in page frags */ + offset -= skb_headlen(skb); + if ((int)offset < 0) + goto out; + if (skb_has_frag_list(skb)) + goto out; + len = skb->data_len - offset; + frags = skb_shinfo(skb)->frags; + while (offset) { + if (frags->size > offset) + goto out; + offset -= frags->size; + frags++; + } + while (nr < nr_pages) { + if (len) { + if (len < PAGE_SIZE) + goto out; + if (frags->size != PAGE_SIZE || frags->page_offset) + goto out; + pages_array[nr++] = skb_frag_page(frags); + frags++; + len -= PAGE_SIZE; + seq += PAGE_SIZE; + continue; + } + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + goto skb_start; + } + /* OK, we have a full set of pages ready to be inserted into vma */ + for (nr = 0; nr < nr_pages; nr++) { + ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT), + pages_array[nr]); + if (ret) + goto out; + } + /* operation is complete, we can 'consume' all skbs */ + tp->copied_seq = seq; + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, size); + + ret = 0; +out: + release_sock(sk); + kvfree(pages_array); + return ret; +} +EXPORT_SYMBOL(tcp_mmap); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e70d59fb26e1..2c694912df2e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = inet_recvmsg, /* ok */ - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, -- cgit v1.2.3 From a5724fc3834643a975bd0db71f001ca65f4a8382 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Apr 2018 21:37:13 +0200 Subject: PCI: Add two more values for PCIe Max_Read_Request_Size This patch adds missing values for the max read request size. E.g. network driver r8169 uses a value of 4K. Signed-off-by: Heiner Kallweit Acked-by: Bjorn Helgaas Signed-off-by: David S. 
Miller --- include/uapi/linux/pci_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 103ba797a8f3..83ade9b5cf95 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -506,6 +506,8 @@ #define PCI_EXP_DEVCTL_READRQ_256B 0x1000 /* 256 Bytes */ #define PCI_EXP_DEVCTL_READRQ_512B 0x2000 /* 512 Bytes */ #define PCI_EXP_DEVCTL_READRQ_1024B 0x3000 /* 1024 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */ #define PCI_EXP_DEVCTL_BCR_FLR 0x8000 /* Bridge Configuration Retry / FLR */ #define PCI_EXP_DEVSTA 10 /* Device Status */ #define PCI_EXP_DEVSTA_CED 0x0001 /* Correctable Error Detected */ -- cgit v1.2.3 From ef53e9e14714de2ce26eaae0244c07c426064d69 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 16 Apr 2018 15:07:13 -0700 Subject: net: Remove unused tcp_set_state tracepoint This tracepoint was replaced by inet_sock_set_state in 563e0bb and not used anywhere in the kernel anymore. Remove it. Signed-off-by: Andrey Ignatov Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 47 ---------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 878b2be7ce77..3dd68029d77a 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -166,53 +166,6 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_ARGS(sk) ); -TRACE_EVENT(tcp_set_state, - - TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), - - TP_ARGS(sk, oldstate, newstate), - - TP_STRUCT__entry( - __field(const void *, skaddr) - __field(int, oldstate) - __field(int, newstate) - __field(__u16, sport) - __field(__u16, dport) - __array(__u8, saddr, 4) - __array(__u8, daddr, 4) - __array(__u8, saddr_v6, 16) - __array(__u8, daddr_v6, 16) - ), - - TP_fast_assign( - struct inet_sock *inet = inet_sk(sk); - __be32 *p32; - - __entry->skaddr = sk; - __entry->oldstate = oldstate; - __entry->newstate = newstate; - - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - p32 = (__be32 *) __entry->saddr; - *p32 = inet->inet_saddr; - - p32 = (__be32 *) __entry->daddr; - *p32 = inet->inet_daddr; - - TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, - sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); - ), - - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", - __entry->sport, __entry->dport, - __entry->saddr, __entry->daddr, - __entry->saddr_v6, __entry->daddr_v6, - show_tcp_state_name(__entry->oldstate), - show_tcp_state_name(__entry->newstate)) -); - TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), -- cgit v1.2.3 From e59644b720aed4b9ec9d3818b483f97376fb31ed Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Sun, 1 Apr 2018 08:42:08 +0000 Subject: security: remove security_settime security_settime was a wrapper around security_settime64. There are no more users of it. Therefore it can be removed. 
It was removed in: commit 4eb1bca17933 ("time: Use do_settimeofday64() internally") Signed-off-by: Sargun Dhillon Signed-off-by: James Morris --- include/linux/security.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 3f5fd988ee87..5111fe8159ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -222,12 +222,6 @@ int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); int security_settime64(const struct timespec64 *ts, const struct timezone *tz); -static inline int security_settime(const struct timespec *ts, const struct timezone *tz) -{ - struct timespec64 ts64 = timespec_to_timespec64(*ts); - - return security_settime64(&ts64, tz); -} int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); @@ -509,14 +503,6 @@ static inline int security_settime64(const struct timespec64 *ts, return cap_settime(ts, tz); } -static inline int security_settime(const struct timespec *ts, - const struct timezone *tz) -{ - struct timespec64 ts64 = timespec_to_timespec64(*ts); - - return cap_settime(&ts64, tz); -} - static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages)); -- cgit v1.2.3 From b5f5f525c547e05fad3ecb4c8d6ceef9cdb14ac3 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Tue, 27 Mar 2018 08:25:18 -0700 Subject: clk: qcom: Add MSM8998 Global Clock Control (GCC) driver Add support for the global clock controller found on MSM8998 based devices. This should allow most non-multimedia device drivers to probe and control their clocks. Signed-off-by: Joonwoo Park Signed-off-by: Imran Khan Signed-off-by: Rajendra Nayak [bjorn: Specify regs for alpha_plls, fix white spaces and add binding] Signed-off-by: Bjorn Andersson Signed-off-by: Stephen Boyd --- .../devicetree/bindings/clock/qcom,gcc.txt | 1 + drivers/clk/qcom/Kconfig | 8 + drivers/clk/qcom/Makefile | 1 + drivers/clk/qcom/gcc-msm8998.c | 2834 ++++++++++++++++++++ include/dt-bindings/clock/qcom,gcc-msm8998.h | 208 ++ 5 files changed, 3052 insertions(+) create mode 100644 drivers/clk/qcom/gcc-msm8998.c create mode 100644 include/dt-bindings/clock/qcom,gcc-msm8998.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc.txt b/Documentation/devicetree/bindings/clock/qcom,gcc.txt index 551d03be9665..d1fb8b213dde 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc.txt +++ b/Documentation/devicetree/bindings/clock/qcom,gcc.txt @@ -17,6 +17,7 @@ Required properties : "qcom,gcc-msm8974pro-ac" "qcom,gcc-msm8994" "qcom,gcc-msm8996" + "qcom,gcc-msm8998" "qcom,gcc-mdm9615" - reg : shall contain base register location and length diff --git a/drivers/clk/qcom/Kconfig b/drivers/clk/qcom/Kconfig index fbf4532f94b8..e42e1afb0c51 100644 --- a/drivers/clk/qcom/Kconfig +++ b/drivers/clk/qcom/Kconfig @@ -218,6 +218,14 @@ config MSM_MMCC_8996 Say Y if you want to support multimedia devices such as display, graphics, video encode/decode, camera, etc. +config MSM_GCC_8998 + tristate "MSM8998 Global Clock Controller" + depends on COMMON_CLK_QCOM + help + Support for the global clock controller on msm8998 devices. + Say Y if you want to use peripheral devices such as UART, SPI, + i2c, USB, UFS, SD/eMMC, PCIe, etc. 
+ config SPMI_PMIC_CLKDIV tristate "SPMI PMIC clkdiv Support" depends on (COMMON_CLK_QCOM && SPMI) || COMPILE_TEST diff --git a/drivers/clk/qcom/Makefile b/drivers/clk/qcom/Makefile index 230332cf317e..7c09ab1a640c 100644 --- a/drivers/clk/qcom/Makefile +++ b/drivers/clk/qcom/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_MSM_GCC_8974) += gcc-msm8974.o obj-$(CONFIG_MSM_GCC_8994) += gcc-msm8994.o obj-$(CONFIG_MSM_GCC_8996) += gcc-msm8996.o obj-$(CONFIG_MSM_LCC_8960) += lcc-msm8960.o +obj-$(CONFIG_MSM_GCC_8998) += gcc-msm8998.o obj-$(CONFIG_MSM_MMCC_8960) += mmcc-msm8960.o obj-$(CONFIG_MSM_MMCC_8974) += mmcc-msm8974.o obj-$(CONFIG_MSM_MMCC_8996) += mmcc-msm8996.o diff --git a/drivers/clk/qcom/gcc-msm8998.c b/drivers/clk/qcom/gcc-msm8998.c new file mode 100644 index 000000000000..78d87f5c7098 --- /dev/null +++ b/drivers/clk/qcom/gcc-msm8998.c @@ -0,0 +1,2834 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + */ + +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/platform_device.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/clk-provider.h> +#include <linux/regmap.h> +#include <linux/reset-controller.h> + +#include <dt-bindings/clock/qcom,gcc-msm8998.h> + +#include "common.h" +#include "clk-regmap.h" +#include "clk-alpha-pll.h" +#include "clk-pll.h" +#include "clk-rcg.h" +#include "clk-branch.h" +#include "reset.h" +#include "gdsc.h" + +#define F(f, s, h, m, n) { (f), (s), (2 * (h) - 1), (m), (n) } + +enum { + P_AUD_REF_CLK, + P_CORE_BI_PLL_TEST_SE, + P_GPLL0_OUT_MAIN, + P_GPLL4_OUT_MAIN, + P_PLL0_EARLY_DIV_CLK_SRC, + P_SLEEP_CLK, + P_XO, +}; + +static const struct parent_map gcc_parent_map_0[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_PLL0_EARLY_DIV_CLK_SRC, 6 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_0[] = { + "xo", + "gpll0_out_main", + "gpll0_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_1[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_1[] = { + "xo", + "gpll0_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_2[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_SLEEP_CLK, 5 }, + { P_PLL0_EARLY_DIV_CLK_SRC, 6 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_2[] = { + "xo", + "gpll0_out_main", + "core_pi_sleep_clk", + "gpll0_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_3[] = { + { P_XO, 0 }, + { P_SLEEP_CLK, 5 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_3[] = { + "xo", + "core_pi_sleep_clk", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_4[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_GPLL4_OUT_MAIN, 5 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_4[] = { + "xo", + "gpll0_out_main", + "gpll4_out_main", + "core_bi_pll_test_se", +}; + +static const struct parent_map gcc_parent_map_5[] = { + { P_XO, 0 }, + { P_GPLL0_OUT_MAIN, 1 }, + { P_AUD_REF_CLK, 2 }, + { P_CORE_BI_PLL_TEST_SE, 7 }, +}; + +static const char * const gcc_parent_names_5[] = { + "xo", + "gpll0_out_main", + "aud_ref_clk", + "core_bi_pll_test_se", +}; + +static struct pll_vco fabia_vco[] = { + { 250000000, 2000000000, 0 }, + { 125000000, 1000000000, 1 }, +}; + +static struct clk_alpha_pll gpll0 = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(0),
+ .hw.init = &(struct clk_init_data){ + .name = "gpll0", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_even = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_even", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_main = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_main", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_odd = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_odd", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll0_out_test = { + .offset = 0x0, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll0_out_test", + .parent_names = (const char *[]){ "gpll0" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll1 = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(1), + .hw.init = &(struct clk_init_data){ + .name = "gpll1", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_even = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_even", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_main = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_main", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_odd = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_odd", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll1_out_test = { + .offset = 0x1000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll1_out_test", + .parent_names = (const char *[]){ "gpll1" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll2 = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(2), + .hw.init = &(struct clk_init_data){ + .name = "gpll2", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct 
clk_alpha_pll_postdiv gpll2_out_even = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_even", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll2_out_main = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_main", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll2_out_odd = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_odd", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll2_out_test = { + .offset = 0x2000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll2_out_test", + .parent_names = (const char *[]){ "gpll2" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll3 = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(3), + .hw.init = &(struct clk_init_data){ + .name = "gpll3", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_even = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_even", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_main = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_main", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_odd = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_odd", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll3_out_test = { + .offset = 0x3000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll3_out_test", + .parent_names = (const char *[]){ "gpll3" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll gpll4 = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .vco_table = fabia_vco, + .num_vco = ARRAY_SIZE(fabia_vco), + .clkr = { + .enable_reg = 0x52000, + .enable_mask = BIT(4), + .hw.init = &(struct clk_init_data){ + .name = "gpll4", + .parent_names = (const char *[]){ "xo" }, + .num_parents = 1, + .ops = &clk_alpha_pll_ops, + } + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_even = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = 
"gpll4_out_even", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_main = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll4_out_main", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_odd = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll4_out_odd", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static struct clk_alpha_pll_postdiv gpll4_out_test = { + .offset = 0x77000, + .regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_DEFAULT], + .clkr.hw.init = &(struct clk_init_data){ + .name = "gpll4_out_test", + .parent_names = (const char *[]){ "gpll4" }, + .num_parents = 1, + .ops = &clk_alpha_pll_postdiv_ops, + }, +}; + +static const struct freq_tbl ftbl_blsp1_qup1_i2c_apps_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + { } +}; + +static struct clk_rcg2 blsp1_qup1_i2c_apps_clk_src = { + .cmd_rcgr = 0x19020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup1_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_blsp1_qup1_spi_apps_clk_src[] = { + F(960000, P_XO, 10, 1, 2), + F(4800000, P_XO, 4, 0, 0), + F(9600000, P_XO, 2, 0, 0), + F(15000000, P_GPLL0_OUT_MAIN, 10, 1, 4), + F(19200000, P_XO, 1, 0, 0), + F(25000000, P_GPLL0_OUT_MAIN, 12, 1, 2), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + { } +}; + +static struct clk_rcg2 blsp1_qup1_spi_apps_clk_src = { + .cmd_rcgr = 0x1900c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup1_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup2_i2c_apps_clk_src = { + .cmd_rcgr = 0x1b020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup2_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup2_spi_apps_clk_src = { + .cmd_rcgr = 0x1b00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup2_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup3_i2c_apps_clk_src = { + .cmd_rcgr = 0x1d020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup3_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup3_spi_apps_clk_src = { + .cmd_rcgr = 0x1d00c, + .mnd_width = 8, + .hid_width 
= 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup3_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup4_i2c_apps_clk_src = { + .cmd_rcgr = 0x1f020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup4_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup4_spi_apps_clk_src = { + .cmd_rcgr = 0x1f00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup4_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup5_i2c_apps_clk_src = { + .cmd_rcgr = 0x21020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup5_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup5_spi_apps_clk_src = { + .cmd_rcgr = 0x2100c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup5_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup6_i2c_apps_clk_src = { + .cmd_rcgr = 0x23020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup6_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_qup6_spi_apps_clk_src = { + .cmd_rcgr = 0x2300c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_qup6_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_blsp1_uart1_apps_clk_src[] = { + F(3686400, P_GPLL0_OUT_MAIN, 1, 96, 15625), + F(7372800, P_GPLL0_OUT_MAIN, 1, 192, 15625), + F(14745600, P_GPLL0_OUT_MAIN, 1, 384, 15625), + F(16000000, P_GPLL0_OUT_MAIN, 5, 2, 15), + F(19200000, P_XO, 1, 0, 0), + F(24000000, P_GPLL0_OUT_MAIN, 5, 1, 5), + F(32000000, P_GPLL0_OUT_MAIN, 1, 4, 75), + F(40000000, P_GPLL0_OUT_MAIN, 15, 0, 0), + F(46400000, P_GPLL0_OUT_MAIN, 1, 29, 375), + F(48000000, P_GPLL0_OUT_MAIN, 12.5, 0, 0), + F(51200000, P_GPLL0_OUT_MAIN, 1, 32, 375), + F(56000000, P_GPLL0_OUT_MAIN, 1, 7, 75), + F(58982400, P_GPLL0_OUT_MAIN, 1, 1536, 15625), + F(60000000, P_GPLL0_OUT_MAIN, 10, 0, 0), + F(63157895, P_GPLL0_OUT_MAIN, 9.5, 0, 0), + { } +}; + +static struct clk_rcg2 blsp1_uart1_apps_clk_src = { + .cmd_rcgr = 0x1a00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_uart1_apps_clk_src", + .parent_names = gcc_parent_names_0, + 
.num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_uart2_apps_clk_src = { + .cmd_rcgr = 0x1c00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_uart2_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp1_uart3_apps_clk_src = { + .cmd_rcgr = 0x1e00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp1_uart3_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup1_i2c_apps_clk_src = { + .cmd_rcgr = 0x26020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup1_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup1_spi_apps_clk_src = { + .cmd_rcgr = 0x2600c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup1_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup2_i2c_apps_clk_src = { + .cmd_rcgr = 0x28020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup2_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup2_spi_apps_clk_src = { + .cmd_rcgr = 0x2800c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup2_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup3_i2c_apps_clk_src = { + .cmd_rcgr = 0x2a020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup3_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup3_spi_apps_clk_src = { + .cmd_rcgr = 0x2a00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup3_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup4_i2c_apps_clk_src = { + .cmd_rcgr = 0x2c020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup4_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup4_spi_apps_clk_src = { + .cmd_rcgr = 0x2c00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + 
.freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup4_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup5_i2c_apps_clk_src = { + .cmd_rcgr = 0x2e020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup5_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup5_spi_apps_clk_src = { + .cmd_rcgr = 0x2e00c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup5_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup6_i2c_apps_clk_src = { + .cmd_rcgr = 0x30020, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_blsp1_qup1_i2c_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup6_i2c_apps_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_qup6_spi_apps_clk_src = { + .cmd_rcgr = 0x3000c, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_qup1_spi_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_qup6_spi_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_uart1_apps_clk_src = { + .cmd_rcgr = 0x2700c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_uart1_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_uart2_apps_clk_src = { + .cmd_rcgr = 0x2900c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_uart2_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 blsp2_uart3_apps_clk_src = { + .cmd_rcgr = 0x2b00c, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_blsp1_uart1_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "blsp2_uart3_apps_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_gp1_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + F(200000000, P_GPLL0_OUT_MAIN, 3, 0, 0), + { } +}; + +static struct clk_rcg2 gp1_clk_src = { + .cmd_rcgr = 0x64004, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_2, + .freq_tbl = ftbl_gp1_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "gp1_clk_src", + .parent_names = gcc_parent_names_2, + .num_parents = 5, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 gp2_clk_src = { + .cmd_rcgr = 0x65004, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_2, + .freq_tbl = ftbl_gp1_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = 
"gp2_clk_src", + .parent_names = gcc_parent_names_2, + .num_parents = 5, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 gp3_clk_src = { + .cmd_rcgr = 0x66004, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_2, + .freq_tbl = ftbl_gp1_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "gp3_clk_src", + .parent_names = gcc_parent_names_2, + .num_parents = 5, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_hmss_ahb_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(37500000, P_GPLL0_OUT_MAIN, 16, 0, 0), + F(75000000, P_GPLL0_OUT_MAIN, 8, 0, 0), + { } +}; + +static struct clk_rcg2 hmss_ahb_clk_src = { + .cmd_rcgr = 0x48014, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_hmss_ahb_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "hmss_ahb_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_hmss_rbcpr_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + { } +}; + +static struct clk_rcg2 hmss_rbcpr_clk_src = { + .cmd_rcgr = 0x48044, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_hmss_rbcpr_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "hmss_rbcpr_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_pcie_aux_clk_src[] = { + F(1010526, P_XO, 1, 1, 19), + { } +}; + +static struct clk_rcg2 pcie_aux_clk_src = { + .cmd_rcgr = 0x6c000, + .mnd_width = 16, + .hid_width = 5, + .parent_map = gcc_parent_map_3, + .freq_tbl = ftbl_pcie_aux_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "pcie_aux_clk_src", + .parent_names = gcc_parent_names_3, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_pdm2_clk_src[] = { + F(60000000, P_GPLL0_OUT_MAIN, 10, 0, 0), + { } +}; + +static struct clk_rcg2 pdm2_clk_src = { + .cmd_rcgr = 0x33010, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_pdm2_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "pdm2_clk_src", + .parent_names = gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_sdcc2_apps_clk_src[] = { + F(144000, P_XO, 16, 3, 25), + F(400000, P_XO, 12, 1, 4), + F(20000000, P_GPLL0_OUT_MAIN, 15, 1, 2), + F(25000000, P_GPLL0_OUT_MAIN, 12, 1, 2), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + F(200000000, P_GPLL0_OUT_MAIN, 3, 0, 0), + { } +}; + +static struct clk_rcg2 sdcc2_apps_clk_src = { + .cmd_rcgr = 0x14010, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_4, + .freq_tbl = ftbl_sdcc2_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "sdcc2_apps_clk_src", + .parent_names = gcc_parent_names_4, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_sdcc4_apps_clk_src[] = { + F(144000, P_XO, 16, 3, 25), + F(400000, P_XO, 12, 1, 4), + F(20000000, P_GPLL0_OUT_MAIN, 15, 1, 2), + F(25000000, P_GPLL0_OUT_MAIN, 12, 1, 2), + F(50000000, P_GPLL0_OUT_MAIN, 12, 0, 0), + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + { } +}; + +static struct clk_rcg2 sdcc4_apps_clk_src = { + .cmd_rcgr = 0x16010, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_1, + .freq_tbl = ftbl_sdcc4_apps_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "sdcc4_apps_clk_src", + .parent_names 
= gcc_parent_names_1, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_tsif_ref_clk_src[] = { + F(105495, P_XO, 1, 1, 182), + { } +}; + +static struct clk_rcg2 tsif_ref_clk_src = { + .cmd_rcgr = 0x36010, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_5, + .freq_tbl = ftbl_tsif_ref_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "tsif_ref_clk_src", + .parent_names = gcc_parent_names_5, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_ufs_axi_clk_src[] = { + F(100000000, P_GPLL0_OUT_MAIN, 6, 0, 0), + F(200000000, P_GPLL0_OUT_MAIN, 3, 0, 0), + F(240000000, P_GPLL0_OUT_MAIN, 2.5, 0, 0), + { } +}; + +static struct clk_rcg2 ufs_axi_clk_src = { + .cmd_rcgr = 0x75018, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_ufs_axi_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "ufs_axi_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_usb30_master_clk_src[] = { + F(19200000, P_XO, 1, 0, 0), + F(120000000, P_GPLL0_OUT_MAIN, 5, 0, 0), + F(150000000, P_GPLL0_OUT_MAIN, 4, 0, 0), + { } +}; + +static struct clk_rcg2 usb30_master_clk_src = { + .cmd_rcgr = 0xf014, + .mnd_width = 8, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_usb30_master_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "usb30_master_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_rcg2 usb30_mock_utmi_clk_src = { + .cmd_rcgr = 0xf028, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_0, + .freq_tbl = ftbl_hmss_rbcpr_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "usb30_mock_utmi_clk_src", + .parent_names = gcc_parent_names_0, + .num_parents = 4, + .ops = &clk_rcg2_ops, + }, +}; + +static const struct freq_tbl ftbl_usb3_phy_aux_clk_src[] = { + F(1200000, P_XO, 16, 0, 0), + { } +}; + +static struct clk_rcg2 usb3_phy_aux_clk_src = { + .cmd_rcgr = 0x5000c, + .mnd_width = 0, + .hid_width = 5, + .parent_map = gcc_parent_map_3, + .freq_tbl = ftbl_usb3_phy_aux_clk_src, + .clkr.hw.init = &(struct clk_init_data){ + .name = "usb3_phy_aux_clk_src", + .parent_names = gcc_parent_names_3, + .num_parents = 3, + .ops = &clk_rcg2_ops, + }, +}; + +static struct clk_branch gcc_aggre1_noc_xo_clk = { + .halt_reg = 0x8202c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x8202c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_aggre1_noc_xo_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_aggre1_ufs_axi_clk = { + .halt_reg = 0x82028, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x82028, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_aggre1_ufs_axi_clk", + .parent_names = (const char *[]){ + "ufs_axi_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_aggre1_usb3_axi_clk = { + .halt_reg = 0x82024, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x82024, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_aggre1_usb3_axi_clk", + .parent_names = (const char *[]){ + "usb30_master_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_apss_qdss_tsctr_div2_clk = { + .halt_reg = 0x48090, + .halt_check = BRANCH_HALT, + .clkr = { 
+ .enable_reg = 0x48090, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_apss_qdss_tsctr_div2_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_apss_qdss_tsctr_div8_clk = { + .halt_reg = 0x48094, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x48094, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_apss_qdss_tsctr_div8_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_bimc_hmss_axi_clk = { + .halt_reg = 0x48004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(22), + .hw.init = &(struct clk_init_data){ + .name = "gcc_bimc_hmss_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_bimc_mss_q6_axi_clk = { + .halt_reg = 0x4401c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4401c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_bimc_mss_q6_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_ahb_clk = { + .halt_reg = 0x17004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(17), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup1_i2c_apps_clk = { + .halt_reg = 0x19008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x19008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup1_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup1_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup1_spi_apps_clk = { + .halt_reg = 0x19004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x19004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup1_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup1_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup2_i2c_apps_clk = { + .halt_reg = 0x1b008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1b008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup2_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup2_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup2_spi_apps_clk = { + .halt_reg = 0x1b004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1b004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup2_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup2_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup3_i2c_apps_clk = { + .halt_reg = 0x1d008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1d008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup3_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup3_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup3_spi_apps_clk = { + .halt_reg = 0x1d004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1d004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup3_spi_apps_clk", + .parent_names = (const char *[]){ + 
"blsp1_qup3_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup4_i2c_apps_clk = { + .halt_reg = 0x1f008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1f008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup4_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup4_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup4_spi_apps_clk = { + .halt_reg = 0x1f004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1f004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup4_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup4_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup5_i2c_apps_clk = { + .halt_reg = 0x21008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x21008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup5_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup5_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup5_spi_apps_clk = { + .halt_reg = 0x21004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x21004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup5_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup5_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup6_i2c_apps_clk = { + .halt_reg = 0x23008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x23008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup6_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup6_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_qup6_spi_apps_clk = { + .halt_reg = 0x23004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x23004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_qup6_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp1_qup6_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_sleep_clk = { + .halt_reg = 0x17008, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(16), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_sleep_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_uart1_apps_clk = { + .halt_reg = 0x1a004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1a004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_uart1_apps_clk", + .parent_names = (const char *[]){ + "blsp1_uart1_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_uart2_apps_clk = { + .halt_reg = 0x1c004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1c004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_uart2_apps_clk", + .parent_names = (const char *[]){ + "blsp1_uart2_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp1_uart3_apps_clk = { + .halt_reg = 0x1e004, + 
.halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x1e004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp1_uart3_apps_clk", + .parent_names = (const char *[]){ + "blsp1_uart3_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_ahb_clk = { + .halt_reg = 0x25004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(15), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup1_i2c_apps_clk = { + .halt_reg = 0x26008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x26008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup1_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup1_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup1_spi_apps_clk = { + .halt_reg = 0x26004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x26004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup1_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup1_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup2_i2c_apps_clk = { + .halt_reg = 0x28008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x28008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup2_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup2_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup2_spi_apps_clk = { + .halt_reg = 0x28004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x28004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup2_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup2_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup3_i2c_apps_clk = { + .halt_reg = 0x2a008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2a008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup3_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup3_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup3_spi_apps_clk = { + .halt_reg = 0x2a004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2a004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup3_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup3_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup4_i2c_apps_clk = { + .halt_reg = 0x2c008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2c008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup4_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup4_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup4_spi_apps_clk = { + .halt_reg = 0x2c004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2c004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup4_spi_apps_clk", + 
.parent_names = (const char *[]){ + "blsp2_qup4_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup5_i2c_apps_clk = { + .halt_reg = 0x2e008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2e008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup5_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup5_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup5_spi_apps_clk = { + .halt_reg = 0x2e004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2e004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup5_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup5_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup6_i2c_apps_clk = { + .halt_reg = 0x30008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x30008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup6_i2c_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup6_i2c_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_qup6_spi_apps_clk = { + .halt_reg = 0x30004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x30004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_qup6_spi_apps_clk", + .parent_names = (const char *[]){ + "blsp2_qup6_spi_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_sleep_clk = { + .halt_reg = 0x25008, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(14), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_sleep_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_uart1_apps_clk = { + .halt_reg = 0x27004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x27004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_uart1_apps_clk", + .parent_names = (const char *[]){ + "blsp2_uart1_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_uart2_apps_clk = { + .halt_reg = 0x29004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x29004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_uart2_apps_clk", + .parent_names = (const char *[]){ + "blsp2_uart2_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_blsp2_uart3_apps_clk = { + .halt_reg = 0x2b004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x2b004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_blsp2_uart3_apps_clk", + .parent_names = (const char *[]){ + "blsp2_uart3_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_cfg_noc_usb3_axi_clk = { + .halt_reg = 0x5018, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x5018, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_cfg_noc_usb3_axi_clk", + .parent_names = (const char *[]){ + "usb30_master_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gp1_clk = { + .halt_reg = 0x64000, + 
.halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x64000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gp1_clk", + .parent_names = (const char *[]){ + "gp1_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gp2_clk = { + .halt_reg = 0x65000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x65000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gp2_clk", + .parent_names = (const char *[]){ + "gp2_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gp3_clk = { + .halt_reg = 0x66000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x66000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gp3_clk", + .parent_names = (const char *[]){ + "gp3_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_bimc_gfx_clk = { + .halt_reg = 0x71010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x71010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_bimc_gfx_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_bimc_gfx_src_clk = { + .halt_reg = 0x7100c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7100c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_bimc_gfx_src_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_cfg_ahb_clk = { + .halt_reg = 0x71004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x71004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_cfg_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_gpu_snoc_dvm_gfx_clk = { + .halt_reg = 0x71018, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x71018, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_gpu_snoc_dvm_gfx_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_ahb_clk = { + .halt_reg = 0x48000, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(21), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_ahb_clk", + .parent_names = (const char *[]){ + "hmss_ahb_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_at_clk = { + .halt_reg = 0x48010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x48010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_at_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_dvm_bus_clk = { + .halt_reg = 0x4808c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4808c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_dvm_bus_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_rbcpr_clk = { + .halt_reg = 0x48008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x48008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_hmss_rbcpr_clk", + .parent_names = (const char *[]){ + "hmss_rbcpr_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hmss_trig_clk = { + .halt_reg = 0x4800c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4800c, + .enable_mask = BIT(0), + .hw.init = &(struct 
clk_init_data){ + .name = "gcc_hmss_trig_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_lpass_at_clk = { + .halt_reg = 0x47020, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x47020, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_lpass_at_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_lpass_trig_clk = { + .halt_reg = 0x4701c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x4701c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_lpass_trig_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_noc_cfg_ahb_clk = { + .halt_reg = 0x9004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x9004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_noc_cfg_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_qm_ahb_clk = { + .halt_reg = 0x9030, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x9030, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_qm_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_qm_core_clk = { + .halt_reg = 0x900c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x900c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_qm_core_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mmss_sys_noc_axi_clk = { + .halt_reg = 0x9000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x9000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mmss_sys_noc_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_mss_at_clk = { + .halt_reg = 0x8a00c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x8a00c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_mss_at_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_aux_clk = { + .halt_reg = 0x6b014, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b014, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_aux_clk", + .parent_names = (const char *[]){ + "pcie_aux_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_cfg_ahb_clk = { + .halt_reg = 0x6b010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_cfg_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_mstr_axi_clk = { + .halt_reg = 0x6b00c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b00c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_mstr_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_pipe_clk = { + .halt_reg = 0x6b018, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b018, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_pipe_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pcie_0_slv_axi_clk = { + .halt_reg = 0x6b008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6b008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_0_slv_axi_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch 
gcc_pcie_phy_aux_clk = { + .halt_reg = 0x6f004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6f004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pcie_phy_aux_clk", + .parent_names = (const char *[]){ + "pcie_aux_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pdm2_clk = { + .halt_reg = 0x3300c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x3300c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pdm2_clk", + .parent_names = (const char *[]){ + "pdm2_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pdm_ahb_clk = { + .halt_reg = 0x33004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x33004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pdm_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_pdm_xo4_clk = { + .halt_reg = 0x33008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x33008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_pdm_xo4_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_prng_ahb_clk = { + .halt_reg = 0x34004, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x52004, + .enable_mask = BIT(13), + .hw.init = &(struct clk_init_data){ + .name = "gcc_prng_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc2_ahb_clk = { + .halt_reg = 0x14008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x14008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc2_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc2_apps_clk = { + .halt_reg = 0x14004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x14004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc2_apps_clk", + .parent_names = (const char *[]){ + "sdcc2_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc4_ahb_clk = { + .halt_reg = 0x16008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x16008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc4_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_sdcc4_apps_clk = { + .halt_reg = 0x16004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x16004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_sdcc4_apps_clk", + .parent_names = (const char *[]){ + "sdcc4_apps_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_tsif_ahb_clk = { + .halt_reg = 0x36004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x36004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_tsif_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_tsif_inactivity_timers_clk = { + .halt_reg = 0x3600c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x3600c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_tsif_inactivity_timers_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_tsif_ref_clk = { + .halt_reg = 0x36008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x36008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = 
"gcc_tsif_ref_clk", + .parent_names = (const char *[]){ + "tsif_ref_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_ahb_clk = { + .halt_reg = 0x7500c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7500c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_ahb_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_axi_clk = { + .halt_reg = 0x75008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x75008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_axi_clk", + .parent_names = (const char *[]){ + "ufs_axi_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_ice_core_clk = { + .halt_reg = 0x7600c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7600c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_ice_core_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_phy_aux_clk = { + .halt_reg = 0x76040, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x76040, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_phy_aux_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_rx_symbol_0_clk = { + .halt_reg = 0x75014, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x75014, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_rx_symbol_0_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_rx_symbol_1_clk = { + .halt_reg = 0x7605c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x7605c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_rx_symbol_1_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_tx_symbol_0_clk = { + .halt_reg = 0x75010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x75010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_tx_symbol_0_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_ufs_unipro_core_clk = { + .halt_reg = 0x76008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x76008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_ufs_unipro_core_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb30_master_clk = { + .halt_reg = 0xf008, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0xf008, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb30_master_clk", + .parent_names = (const char *[]){ + "usb30_master_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb30_mock_utmi_clk = { + .halt_reg = 0xf010, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0xf010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb30_mock_utmi_clk", + .parent_names = (const char *[]){ + "usb30_mock_utmi_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb30_sleep_clk = { + .halt_reg = 0xf00c, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0xf00c, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb30_sleep_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb3_phy_aux_clk = { + .halt_reg = 
0x50000, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x50000, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb3_phy_aux_clk", + .parent_names = (const char *[]){ + "usb3_phy_aux_clk_src", + }, + .num_parents = 1, + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb3_phy_pipe_clk = { + .halt_reg = 0x50004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x50004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb3_phy_pipe_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_usb_phy_cfg_ahb2phy_clk = { + .halt_reg = 0x6a004, + .halt_check = BRANCH_HALT, + .clkr = { + .enable_reg = 0x6a004, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "gcc_usb_phy_cfg_ahb2phy_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct gdsc pcie_0_gdsc = { + .gdscr = 0x6b004, + .gds_hw_ctrl = 0x0, + .pd = { + .name = "pcie_0_gdsc", + }, + .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, +}; + +static struct gdsc ufs_gdsc = { + .gdscr = 0x75004, + .gds_hw_ctrl = 0x0, + .pd = { + .name = "ufs_gdsc", + }, + .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, +}; + +static struct gdsc usb_30_gdsc = { + .gdscr = 0xf004, + .gds_hw_ctrl = 0x0, + .pd = { + .name = "usb_30_gdsc", + }, + .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, +}; + +static struct clk_regmap *gcc_msm8998_clocks[] = { + [BLSP1_QUP1_I2C_APPS_CLK_SRC] = &blsp1_qup1_i2c_apps_clk_src.clkr, + [BLSP1_QUP1_SPI_APPS_CLK_SRC] = &blsp1_qup1_spi_apps_clk_src.clkr, + [BLSP1_QUP2_I2C_APPS_CLK_SRC] = &blsp1_qup2_i2c_apps_clk_src.clkr, + [BLSP1_QUP2_SPI_APPS_CLK_SRC] = &blsp1_qup2_spi_apps_clk_src.clkr, + [BLSP1_QUP3_I2C_APPS_CLK_SRC] = &blsp1_qup3_i2c_apps_clk_src.clkr, + [BLSP1_QUP3_SPI_APPS_CLK_SRC] = &blsp1_qup3_spi_apps_clk_src.clkr, + [BLSP1_QUP4_I2C_APPS_CLK_SRC] = &blsp1_qup4_i2c_apps_clk_src.clkr, + [BLSP1_QUP4_SPI_APPS_CLK_SRC] = &blsp1_qup4_spi_apps_clk_src.clkr, + [BLSP1_QUP5_I2C_APPS_CLK_SRC] = &blsp1_qup5_i2c_apps_clk_src.clkr, + [BLSP1_QUP5_SPI_APPS_CLK_SRC] = &blsp1_qup5_spi_apps_clk_src.clkr, + [BLSP1_QUP6_I2C_APPS_CLK_SRC] = &blsp1_qup6_i2c_apps_clk_src.clkr, + [BLSP1_QUP6_SPI_APPS_CLK_SRC] = &blsp1_qup6_spi_apps_clk_src.clkr, + [BLSP1_UART1_APPS_CLK_SRC] = &blsp1_uart1_apps_clk_src.clkr, + [BLSP1_UART2_APPS_CLK_SRC] = &blsp1_uart2_apps_clk_src.clkr, + [BLSP1_UART3_APPS_CLK_SRC] = &blsp1_uart3_apps_clk_src.clkr, + [BLSP2_QUP1_I2C_APPS_CLK_SRC] = &blsp2_qup1_i2c_apps_clk_src.clkr, + [BLSP2_QUP1_SPI_APPS_CLK_SRC] = &blsp2_qup1_spi_apps_clk_src.clkr, + [BLSP2_QUP2_I2C_APPS_CLK_SRC] = &blsp2_qup2_i2c_apps_clk_src.clkr, + [BLSP2_QUP2_SPI_APPS_CLK_SRC] = &blsp2_qup2_spi_apps_clk_src.clkr, + [BLSP2_QUP3_I2C_APPS_CLK_SRC] = &blsp2_qup3_i2c_apps_clk_src.clkr, + [BLSP2_QUP3_SPI_APPS_CLK_SRC] = &blsp2_qup3_spi_apps_clk_src.clkr, + [BLSP2_QUP4_I2C_APPS_CLK_SRC] = &blsp2_qup4_i2c_apps_clk_src.clkr, + [BLSP2_QUP4_SPI_APPS_CLK_SRC] = &blsp2_qup4_spi_apps_clk_src.clkr, + [BLSP2_QUP5_I2C_APPS_CLK_SRC] = &blsp2_qup5_i2c_apps_clk_src.clkr, + [BLSP2_QUP5_SPI_APPS_CLK_SRC] = &blsp2_qup5_spi_apps_clk_src.clkr, + [BLSP2_QUP6_I2C_APPS_CLK_SRC] = &blsp2_qup6_i2c_apps_clk_src.clkr, + [BLSP2_QUP6_SPI_APPS_CLK_SRC] = &blsp2_qup6_spi_apps_clk_src.clkr, + [BLSP2_UART1_APPS_CLK_SRC] = &blsp2_uart1_apps_clk_src.clkr, + [BLSP2_UART2_APPS_CLK_SRC] = &blsp2_uart2_apps_clk_src.clkr, + [BLSP2_UART3_APPS_CLK_SRC] = &blsp2_uart3_apps_clk_src.clkr, + [GCC_AGGRE1_NOC_XO_CLK] = &gcc_aggre1_noc_xo_clk.clkr, + [GCC_AGGRE1_UFS_AXI_CLK] 
= &gcc_aggre1_ufs_axi_clk.clkr, + [GCC_AGGRE1_USB3_AXI_CLK] = &gcc_aggre1_usb3_axi_clk.clkr, + [GCC_APSS_QDSS_TSCTR_DIV2_CLK] = &gcc_apss_qdss_tsctr_div2_clk.clkr, + [GCC_APSS_QDSS_TSCTR_DIV8_CLK] = &gcc_apss_qdss_tsctr_div8_clk.clkr, + [GCC_BIMC_HMSS_AXI_CLK] = &gcc_bimc_hmss_axi_clk.clkr, + [GCC_BIMC_MSS_Q6_AXI_CLK] = &gcc_bimc_mss_q6_axi_clk.clkr, + [GCC_BLSP1_AHB_CLK] = &gcc_blsp1_ahb_clk.clkr, + [GCC_BLSP1_QUP1_I2C_APPS_CLK] = &gcc_blsp1_qup1_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP1_SPI_APPS_CLK] = &gcc_blsp1_qup1_spi_apps_clk.clkr, + [GCC_BLSP1_QUP2_I2C_APPS_CLK] = &gcc_blsp1_qup2_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP2_SPI_APPS_CLK] = &gcc_blsp1_qup2_spi_apps_clk.clkr, + [GCC_BLSP1_QUP3_I2C_APPS_CLK] = &gcc_blsp1_qup3_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP3_SPI_APPS_CLK] = &gcc_blsp1_qup3_spi_apps_clk.clkr, + [GCC_BLSP1_QUP4_I2C_APPS_CLK] = &gcc_blsp1_qup4_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP4_SPI_APPS_CLK] = &gcc_blsp1_qup4_spi_apps_clk.clkr, + [GCC_BLSP1_QUP5_I2C_APPS_CLK] = &gcc_blsp1_qup5_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP5_SPI_APPS_CLK] = &gcc_blsp1_qup5_spi_apps_clk.clkr, + [GCC_BLSP1_QUP6_I2C_APPS_CLK] = &gcc_blsp1_qup6_i2c_apps_clk.clkr, + [GCC_BLSP1_QUP6_SPI_APPS_CLK] = &gcc_blsp1_qup6_spi_apps_clk.clkr, + [GCC_BLSP1_SLEEP_CLK] = &gcc_blsp1_sleep_clk.clkr, + [GCC_BLSP1_UART1_APPS_CLK] = &gcc_blsp1_uart1_apps_clk.clkr, + [GCC_BLSP1_UART2_APPS_CLK] = &gcc_blsp1_uart2_apps_clk.clkr, + [GCC_BLSP1_UART3_APPS_CLK] = &gcc_blsp1_uart3_apps_clk.clkr, + [GCC_BLSP2_AHB_CLK] = &gcc_blsp2_ahb_clk.clkr, + [GCC_BLSP2_QUP1_I2C_APPS_CLK] = &gcc_blsp2_qup1_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP1_SPI_APPS_CLK] = &gcc_blsp2_qup1_spi_apps_clk.clkr, + [GCC_BLSP2_QUP2_I2C_APPS_CLK] = &gcc_blsp2_qup2_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP2_SPI_APPS_CLK] = &gcc_blsp2_qup2_spi_apps_clk.clkr, + [GCC_BLSP2_QUP3_I2C_APPS_CLK] = &gcc_blsp2_qup3_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP3_SPI_APPS_CLK] = &gcc_blsp2_qup3_spi_apps_clk.clkr, + [GCC_BLSP2_QUP4_I2C_APPS_CLK] = &gcc_blsp2_qup4_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP4_SPI_APPS_CLK] = &gcc_blsp2_qup4_spi_apps_clk.clkr, + [GCC_BLSP2_QUP5_I2C_APPS_CLK] = &gcc_blsp2_qup5_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP5_SPI_APPS_CLK] = &gcc_blsp2_qup5_spi_apps_clk.clkr, + [GCC_BLSP2_QUP6_I2C_APPS_CLK] = &gcc_blsp2_qup6_i2c_apps_clk.clkr, + [GCC_BLSP2_QUP6_SPI_APPS_CLK] = &gcc_blsp2_qup6_spi_apps_clk.clkr, + [GCC_BLSP2_SLEEP_CLK] = &gcc_blsp2_sleep_clk.clkr, + [GCC_BLSP2_UART1_APPS_CLK] = &gcc_blsp2_uart1_apps_clk.clkr, + [GCC_BLSP2_UART2_APPS_CLK] = &gcc_blsp2_uart2_apps_clk.clkr, + [GCC_BLSP2_UART3_APPS_CLK] = &gcc_blsp2_uart3_apps_clk.clkr, + [GCC_CFG_NOC_USB3_AXI_CLK] = &gcc_cfg_noc_usb3_axi_clk.clkr, + [GCC_GP1_CLK] = &gcc_gp1_clk.clkr, + [GCC_GP2_CLK] = &gcc_gp2_clk.clkr, + [GCC_GP3_CLK] = &gcc_gp3_clk.clkr, + [GCC_GPU_BIMC_GFX_CLK] = &gcc_gpu_bimc_gfx_clk.clkr, + [GCC_GPU_BIMC_GFX_SRC_CLK] = &gcc_gpu_bimc_gfx_src_clk.clkr, + [GCC_GPU_CFG_AHB_CLK] = &gcc_gpu_cfg_ahb_clk.clkr, + [GCC_GPU_SNOC_DVM_GFX_CLK] = &gcc_gpu_snoc_dvm_gfx_clk.clkr, + [GCC_HMSS_AHB_CLK] = &gcc_hmss_ahb_clk.clkr, + [GCC_HMSS_AT_CLK] = &gcc_hmss_at_clk.clkr, + [GCC_HMSS_DVM_BUS_CLK] = &gcc_hmss_dvm_bus_clk.clkr, + [GCC_HMSS_RBCPR_CLK] = &gcc_hmss_rbcpr_clk.clkr, + [GCC_HMSS_TRIG_CLK] = &gcc_hmss_trig_clk.clkr, + [GCC_LPASS_AT_CLK] = &gcc_lpass_at_clk.clkr, + [GCC_LPASS_TRIG_CLK] = &gcc_lpass_trig_clk.clkr, + [GCC_MMSS_NOC_CFG_AHB_CLK] = &gcc_mmss_noc_cfg_ahb_clk.clkr, + [GCC_MMSS_QM_AHB_CLK] = &gcc_mmss_qm_ahb_clk.clkr, + [GCC_MMSS_QM_CORE_CLK] = &gcc_mmss_qm_core_clk.clkr, + [GCC_MMSS_SYS_NOC_AXI_CLK] = 
&gcc_mmss_sys_noc_axi_clk.clkr, + [GCC_MSS_AT_CLK] = &gcc_mss_at_clk.clkr, + [GCC_PCIE_0_AUX_CLK] = &gcc_pcie_0_aux_clk.clkr, + [GCC_PCIE_0_CFG_AHB_CLK] = &gcc_pcie_0_cfg_ahb_clk.clkr, + [GCC_PCIE_0_MSTR_AXI_CLK] = &gcc_pcie_0_mstr_axi_clk.clkr, + [GCC_PCIE_0_PIPE_CLK] = &gcc_pcie_0_pipe_clk.clkr, + [GCC_PCIE_0_SLV_AXI_CLK] = &gcc_pcie_0_slv_axi_clk.clkr, + [GCC_PCIE_PHY_AUX_CLK] = &gcc_pcie_phy_aux_clk.clkr, + [GCC_PDM2_CLK] = &gcc_pdm2_clk.clkr, + [GCC_PDM_AHB_CLK] = &gcc_pdm_ahb_clk.clkr, + [GCC_PDM_XO4_CLK] = &gcc_pdm_xo4_clk.clkr, + [GCC_PRNG_AHB_CLK] = &gcc_prng_ahb_clk.clkr, + [GCC_SDCC2_AHB_CLK] = &gcc_sdcc2_ahb_clk.clkr, + [GCC_SDCC2_APPS_CLK] = &gcc_sdcc2_apps_clk.clkr, + [GCC_SDCC4_AHB_CLK] = &gcc_sdcc4_ahb_clk.clkr, + [GCC_SDCC4_APPS_CLK] = &gcc_sdcc4_apps_clk.clkr, + [GCC_TSIF_AHB_CLK] = &gcc_tsif_ahb_clk.clkr, + [GCC_TSIF_INACTIVITY_TIMERS_CLK] = &gcc_tsif_inactivity_timers_clk.clkr, + [GCC_TSIF_REF_CLK] = &gcc_tsif_ref_clk.clkr, + [GCC_UFS_AHB_CLK] = &gcc_ufs_ahb_clk.clkr, + [GCC_UFS_AXI_CLK] = &gcc_ufs_axi_clk.clkr, + [GCC_UFS_ICE_CORE_CLK] = &gcc_ufs_ice_core_clk.clkr, + [GCC_UFS_PHY_AUX_CLK] = &gcc_ufs_phy_aux_clk.clkr, + [GCC_UFS_RX_SYMBOL_0_CLK] = &gcc_ufs_rx_symbol_0_clk.clkr, + [GCC_UFS_RX_SYMBOL_1_CLK] = &gcc_ufs_rx_symbol_1_clk.clkr, + [GCC_UFS_TX_SYMBOL_0_CLK] = &gcc_ufs_tx_symbol_0_clk.clkr, + [GCC_UFS_UNIPRO_CORE_CLK] = &gcc_ufs_unipro_core_clk.clkr, + [GCC_USB30_MASTER_CLK] = &gcc_usb30_master_clk.clkr, + [GCC_USB30_MOCK_UTMI_CLK] = &gcc_usb30_mock_utmi_clk.clkr, + [GCC_USB30_SLEEP_CLK] = &gcc_usb30_sleep_clk.clkr, + [GCC_USB3_PHY_AUX_CLK] = &gcc_usb3_phy_aux_clk.clkr, + [GCC_USB3_PHY_PIPE_CLK] = &gcc_usb3_phy_pipe_clk.clkr, + [GCC_USB_PHY_CFG_AHB2PHY_CLK] = &gcc_usb_phy_cfg_ahb2phy_clk.clkr, + [GP1_CLK_SRC] = &gp1_clk_src.clkr, + [GP2_CLK_SRC] = &gp2_clk_src.clkr, + [GP3_CLK_SRC] = &gp3_clk_src.clkr, + [GPLL0] = &gpll0.clkr, + [GPLL0_OUT_EVEN] = &gpll0_out_even.clkr, + [GPLL0_OUT_MAIN] = &gpll0_out_main.clkr, + [GPLL0_OUT_ODD] = &gpll0_out_odd.clkr, + [GPLL0_OUT_TEST] = &gpll0_out_test.clkr, + [GPLL1] = &gpll1.clkr, + [GPLL1_OUT_EVEN] = &gpll1_out_even.clkr, + [GPLL1_OUT_MAIN] = &gpll1_out_main.clkr, + [GPLL1_OUT_ODD] = &gpll1_out_odd.clkr, + [GPLL1_OUT_TEST] = &gpll1_out_test.clkr, + [GPLL2] = &gpll2.clkr, + [GPLL2_OUT_EVEN] = &gpll2_out_even.clkr, + [GPLL2_OUT_MAIN] = &gpll2_out_main.clkr, + [GPLL2_OUT_ODD] = &gpll2_out_odd.clkr, + [GPLL2_OUT_TEST] = &gpll2_out_test.clkr, + [GPLL3] = &gpll3.clkr, + [GPLL3_OUT_EVEN] = &gpll3_out_even.clkr, + [GPLL3_OUT_MAIN] = &gpll3_out_main.clkr, + [GPLL3_OUT_ODD] = &gpll3_out_odd.clkr, + [GPLL3_OUT_TEST] = &gpll3_out_test.clkr, + [GPLL4] = &gpll4.clkr, + [GPLL4_OUT_EVEN] = &gpll4_out_even.clkr, + [GPLL4_OUT_MAIN] = &gpll4_out_main.clkr, + [GPLL4_OUT_ODD] = &gpll4_out_odd.clkr, + [GPLL4_OUT_TEST] = &gpll4_out_test.clkr, + [HMSS_AHB_CLK_SRC] = &hmss_ahb_clk_src.clkr, + [HMSS_RBCPR_CLK_SRC] = &hmss_rbcpr_clk_src.clkr, + [PCIE_AUX_CLK_SRC] = &pcie_aux_clk_src.clkr, + [PDM2_CLK_SRC] = &pdm2_clk_src.clkr, + [SDCC2_APPS_CLK_SRC] = &sdcc2_apps_clk_src.clkr, + [SDCC4_APPS_CLK_SRC] = &sdcc4_apps_clk_src.clkr, + [TSIF_REF_CLK_SRC] = &tsif_ref_clk_src.clkr, + [UFS_AXI_CLK_SRC] = &ufs_axi_clk_src.clkr, + [USB30_MASTER_CLK_SRC] = &usb30_master_clk_src.clkr, + [USB30_MOCK_UTMI_CLK_SRC] = &usb30_mock_utmi_clk_src.clkr, + [USB3_PHY_AUX_CLK_SRC] = &usb3_phy_aux_clk_src.clkr, +}; + +static struct gdsc *gcc_msm8998_gdscs[] = { + [PCIE_0_GDSC] = &pcie_0_gdsc, + [UFS_GDSC] = &ufs_gdsc, + [USB_30_GDSC] = &usb_30_gdsc, +}; + +static const struct 
qcom_reset_map gcc_msm8998_resets[] = { + [GCC_BLSP1_QUP1_BCR] = { 0x19000 }, + [GCC_BLSP1_QUP2_BCR] = { 0x1b000 }, + [GCC_BLSP1_QUP3_BCR] = { 0x1d000 }, + [GCC_BLSP1_QUP4_BCR] = { 0x1f000 }, + [GCC_BLSP1_QUP5_BCR] = { 0x21000 }, + [GCC_BLSP1_QUP6_BCR] = { 0x23000 }, + [GCC_BLSP2_QUP1_BCR] = { 0x26000 }, + [GCC_BLSP2_QUP2_BCR] = { 0x28000 }, + [GCC_BLSP2_QUP3_BCR] = { 0x2a000 }, + [GCC_BLSP2_QUP4_BCR] = { 0x2c000 }, + [GCC_BLSP2_QUP5_BCR] = { 0x2e000 }, + [GCC_BLSP2_QUP6_BCR] = { 0x30000 }, + [GCC_PCIE_0_BCR] = { 0x6b000 }, + [GCC_PDM_BCR] = { 0x33000 }, + [GCC_SDCC2_BCR] = { 0x14000 }, + [GCC_SDCC4_BCR] = { 0x16000 }, + [GCC_TSIF_BCR] = { 0x36000 }, + [GCC_UFS_BCR] = { 0x75000 }, + [GCC_USB_30_BCR] = { 0xf000 }, +}; + +static const struct regmap_config gcc_msm8998_regmap_config = { + .reg_bits = 32, + .reg_stride = 4, + .val_bits = 32, + .max_register = 0x8f000, + .fast_io = true, +}; + +static const struct qcom_cc_desc gcc_msm8998_desc = { + .config = &gcc_msm8998_regmap_config, + .clks = gcc_msm8998_clocks, + .num_clks = ARRAY_SIZE(gcc_msm8998_clocks), + .resets = gcc_msm8998_resets, + .num_resets = ARRAY_SIZE(gcc_msm8998_resets), + .gdscs = gcc_msm8998_gdscs, + .num_gdscs = ARRAY_SIZE(gcc_msm8998_gdscs), +}; + +static int gcc_msm8998_probe(struct platform_device *pdev) +{ + struct regmap *regmap; + int ret; + + regmap = qcom_cc_map(pdev, &gcc_msm8998_desc); + if (IS_ERR(regmap)) + return PTR_ERR(regmap); + + /* + * Set the HMSS_AHB_CLK_SLEEP_ENA bit to allow the hmss_ahb_clk to be + * turned off by hardware during certain apps low power modes. + */ + ret = regmap_update_bits(regmap, 0x52008, BIT(21), BIT(21)); + if (ret) + return ret; + + return qcom_cc_really_probe(pdev, &gcc_msm8998_desc, regmap); +} + +static const struct of_device_id gcc_msm8998_match_table[] = { + { .compatible = "qcom,gcc-msm8998" }, + { } +}; +MODULE_DEVICE_TABLE(of, gcc_msm8998_match_table); + +static struct platform_driver gcc_msm8998_driver = { + .probe = gcc_msm8998_probe, + .driver = { + .name = "gcc-msm8998", + .of_match_table = gcc_msm8998_match_table, + }, +}; + +static int __init gcc_msm8998_init(void) +{ + return platform_driver_register(&gcc_msm8998_driver); +} +core_initcall(gcc_msm8998_init); + +static void __exit gcc_msm8998_exit(void) +{ + platform_driver_unregister(&gcc_msm8998_driver); +} +module_exit(gcc_msm8998_exit); + +MODULE_DESCRIPTION("QCOM GCC msm8998 Driver"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:gcc-msm8998"); diff --git a/include/dt-bindings/clock/qcom,gcc-msm8998.h b/include/dt-bindings/clock/qcom,gcc-msm8998.h new file mode 100644 index 000000000000..58a242e656b1 --- /dev/null +++ b/include/dt-bindings/clock/qcom,gcc-msm8998.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ */ + +#ifndef _DT_BINDINGS_CLK_MSM_GCC_COBALT_H +#define _DT_BINDINGS_CLK_MSM_GCC_COBALT_H + +#define BLSP1_QUP1_I2C_APPS_CLK_SRC 0 +#define BLSP1_QUP1_SPI_APPS_CLK_SRC 1 +#define BLSP1_QUP2_I2C_APPS_CLK_SRC 2 +#define BLSP1_QUP2_SPI_APPS_CLK_SRC 3 +#define BLSP1_QUP3_I2C_APPS_CLK_SRC 4 +#define BLSP1_QUP3_SPI_APPS_CLK_SRC 5 +#define BLSP1_QUP4_I2C_APPS_CLK_SRC 6 +#define BLSP1_QUP4_SPI_APPS_CLK_SRC 7 +#define BLSP1_QUP5_I2C_APPS_CLK_SRC 8 +#define BLSP1_QUP5_SPI_APPS_CLK_SRC 9 +#define BLSP1_QUP6_I2C_APPS_CLK_SRC 10 +#define BLSP1_QUP6_SPI_APPS_CLK_SRC 11 +#define BLSP1_UART1_APPS_CLK_SRC 12 +#define BLSP1_UART2_APPS_CLK_SRC 13 +#define BLSP1_UART3_APPS_CLK_SRC 14 +#define BLSP2_QUP1_I2C_APPS_CLK_SRC 15 +#define BLSP2_QUP1_SPI_APPS_CLK_SRC 16 +#define BLSP2_QUP2_I2C_APPS_CLK_SRC 17 +#define BLSP2_QUP2_SPI_APPS_CLK_SRC 18 +#define BLSP2_QUP3_I2C_APPS_CLK_SRC 19 +#define BLSP2_QUP3_SPI_APPS_CLK_SRC 20 +#define BLSP2_QUP4_I2C_APPS_CLK_SRC 21 +#define BLSP2_QUP4_SPI_APPS_CLK_SRC 22 +#define BLSP2_QUP5_I2C_APPS_CLK_SRC 23 +#define BLSP2_QUP5_SPI_APPS_CLK_SRC 24 +#define BLSP2_QUP6_I2C_APPS_CLK_SRC 25 +#define BLSP2_QUP6_SPI_APPS_CLK_SRC 26 +#define BLSP2_UART1_APPS_CLK_SRC 27 +#define BLSP2_UART2_APPS_CLK_SRC 28 +#define BLSP2_UART3_APPS_CLK_SRC 29 +#define GCC_AGGRE1_NOC_XO_CLK 30 +#define GCC_AGGRE1_UFS_AXI_CLK 31 +#define GCC_AGGRE1_USB3_AXI_CLK 32 +#define GCC_APSS_QDSS_TSCTR_DIV2_CLK 33 +#define GCC_APSS_QDSS_TSCTR_DIV8_CLK 34 +#define GCC_BIMC_HMSS_AXI_CLK 35 +#define GCC_BIMC_MSS_Q6_AXI_CLK 36 +#define GCC_BLSP1_AHB_CLK 37 +#define GCC_BLSP1_QUP1_I2C_APPS_CLK 38 +#define GCC_BLSP1_QUP1_SPI_APPS_CLK 39 +#define GCC_BLSP1_QUP2_I2C_APPS_CLK 40 +#define GCC_BLSP1_QUP2_SPI_APPS_CLK 41 +#define GCC_BLSP1_QUP3_I2C_APPS_CLK 42 +#define GCC_BLSP1_QUP3_SPI_APPS_CLK 43 +#define GCC_BLSP1_QUP4_I2C_APPS_CLK 44 +#define GCC_BLSP1_QUP4_SPI_APPS_CLK 45 +#define GCC_BLSP1_QUP5_I2C_APPS_CLK 46 +#define GCC_BLSP1_QUP5_SPI_APPS_CLK 47 +#define GCC_BLSP1_QUP6_I2C_APPS_CLK 48 +#define GCC_BLSP1_QUP6_SPI_APPS_CLK 49 +#define GCC_BLSP1_SLEEP_CLK 50 +#define GCC_BLSP1_UART1_APPS_CLK 51 +#define GCC_BLSP1_UART2_APPS_CLK 52 +#define GCC_BLSP1_UART3_APPS_CLK 53 +#define GCC_BLSP2_AHB_CLK 54 +#define GCC_BLSP2_QUP1_I2C_APPS_CLK 55 +#define GCC_BLSP2_QUP1_SPI_APPS_CLK 56 +#define GCC_BLSP2_QUP2_I2C_APPS_CLK 57 +#define GCC_BLSP2_QUP2_SPI_APPS_CLK 58 +#define GCC_BLSP2_QUP3_I2C_APPS_CLK 59 +#define GCC_BLSP2_QUP3_SPI_APPS_CLK 60 +#define GCC_BLSP2_QUP4_I2C_APPS_CLK 61 +#define GCC_BLSP2_QUP4_SPI_APPS_CLK 62 +#define GCC_BLSP2_QUP5_I2C_APPS_CLK 63 +#define GCC_BLSP2_QUP5_SPI_APPS_CLK 64 +#define GCC_BLSP2_QUP6_I2C_APPS_CLK 65 +#define GCC_BLSP2_QUP6_SPI_APPS_CLK 66 +#define GCC_BLSP2_SLEEP_CLK 67 +#define GCC_BLSP2_UART1_APPS_CLK 68 +#define GCC_BLSP2_UART2_APPS_CLK 69 +#define GCC_BLSP2_UART3_APPS_CLK 70 +#define GCC_CFG_NOC_USB3_AXI_CLK 71 +#define GCC_GP1_CLK 72 +#define GCC_GP2_CLK 73 +#define GCC_GP3_CLK 74 +#define GCC_GPU_BIMC_GFX_CLK 75 +#define GCC_GPU_BIMC_GFX_SRC_CLK 76 +#define GCC_GPU_CFG_AHB_CLK 77 +#define GCC_GPU_SNOC_DVM_GFX_CLK 78 +#define GCC_HMSS_AHB_CLK 79 +#define GCC_HMSS_AT_CLK 80 +#define GCC_HMSS_DVM_BUS_CLK 81 +#define GCC_HMSS_RBCPR_CLK 82 +#define GCC_HMSS_TRIG_CLK 83 +#define GCC_LPASS_AT_CLK 84 +#define GCC_LPASS_TRIG_CLK 85 +#define GCC_MMSS_NOC_CFG_AHB_CLK 86 +#define GCC_MMSS_QM_AHB_CLK 87 +#define GCC_MMSS_QM_CORE_CLK 88 +#define GCC_MMSS_SYS_NOC_AXI_CLK 89 +#define GCC_MSS_AT_CLK 90 +#define GCC_PCIE_0_AUX_CLK 91 +#define GCC_PCIE_0_CFG_AHB_CLK 92 +#define GCC_PCIE_0_MSTR_AXI_CLK 
93 +#define GCC_PCIE_0_PIPE_CLK 94 +#define GCC_PCIE_0_SLV_AXI_CLK 95 +#define GCC_PCIE_PHY_AUX_CLK 96 +#define GCC_PDM2_CLK 97 +#define GCC_PDM_AHB_CLK 98 +#define GCC_PDM_XO4_CLK 99 +#define GCC_PRNG_AHB_CLK 100 +#define GCC_SDCC2_AHB_CLK 101 +#define GCC_SDCC2_APPS_CLK 102 +#define GCC_SDCC4_AHB_CLK 103 +#define GCC_SDCC4_APPS_CLK 104 +#define GCC_TSIF_AHB_CLK 105 +#define GCC_TSIF_INACTIVITY_TIMERS_CLK 106 +#define GCC_TSIF_REF_CLK 107 +#define GCC_UFS_AHB_CLK 108 +#define GCC_UFS_AXI_CLK 109 +#define GCC_UFS_ICE_CORE_CLK 110 +#define GCC_UFS_PHY_AUX_CLK 111 +#define GCC_UFS_RX_SYMBOL_0_CLK 112 +#define GCC_UFS_RX_SYMBOL_1_CLK 113 +#define GCC_UFS_TX_SYMBOL_0_CLK 114 +#define GCC_UFS_UNIPRO_CORE_CLK 115 +#define GCC_USB30_MASTER_CLK 116 +#define GCC_USB30_MOCK_UTMI_CLK 117 +#define GCC_USB30_SLEEP_CLK 118 +#define GCC_USB3_PHY_AUX_CLK 119 +#define GCC_USB3_PHY_PIPE_CLK 120 +#define GCC_USB_PHY_CFG_AHB2PHY_CLK 121 +#define GP1_CLK_SRC 122 +#define GP2_CLK_SRC 123 +#define GP3_CLK_SRC 124 +#define GPLL0 125 +#define GPLL0_OUT_EVEN 126 +#define GPLL0_OUT_MAIN 127 +#define GPLL0_OUT_ODD 128 +#define GPLL0_OUT_TEST 129 +#define GPLL1 130 +#define GPLL1_OUT_EVEN 131 +#define GPLL1_OUT_MAIN 132 +#define GPLL1_OUT_ODD 133 +#define GPLL1_OUT_TEST 134 +#define GPLL2 135 +#define GPLL2_OUT_EVEN 136 +#define GPLL2_OUT_MAIN 137 +#define GPLL2_OUT_ODD 138 +#define GPLL2_OUT_TEST 139 +#define GPLL3 140 +#define GPLL3_OUT_EVEN 141 +#define GPLL3_OUT_MAIN 142 +#define GPLL3_OUT_ODD 143 +#define GPLL3_OUT_TEST 144 +#define GPLL4 145 +#define GPLL4_OUT_EVEN 146 +#define GPLL4_OUT_MAIN 147 +#define GPLL4_OUT_ODD 148 +#define GPLL4_OUT_TEST 149 +#define GPLL6 150 +#define GPLL6_OUT_EVEN 151 +#define GPLL6_OUT_MAIN 152 +#define GPLL6_OUT_ODD 153 +#define GPLL6_OUT_TEST 154 +#define HMSS_AHB_CLK_SRC 155 +#define HMSS_RBCPR_CLK_SRC 156 +#define PCIE_AUX_CLK_SRC 157 +#define PDM2_CLK_SRC 158 +#define SDCC2_APPS_CLK_SRC 159 +#define SDCC4_APPS_CLK_SRC 160 +#define TSIF_REF_CLK_SRC 161 +#define UFS_AXI_CLK_SRC 162 +#define USB30_MASTER_CLK_SRC 163 +#define USB30_MOCK_UTMI_CLK_SRC 164 +#define USB3_PHY_AUX_CLK_SRC 165 + +#define PCIE_0_GDSC 0 +#define UFS_GDSC 1 +#define USB_30_GDSC 2 + +#define GCC_BLSP1_QUP1_BCR 0 +#define GCC_BLSP1_QUP2_BCR 1 +#define GCC_BLSP1_QUP3_BCR 2 +#define GCC_BLSP1_QUP4_BCR 3 +#define GCC_BLSP1_QUP5_BCR 4 +#define GCC_BLSP1_QUP6_BCR 5 +#define GCC_BLSP2_QUP1_BCR 6 +#define GCC_BLSP2_QUP2_BCR 7 +#define GCC_BLSP2_QUP3_BCR 8 +#define GCC_BLSP2_QUP4_BCR 9 +#define GCC_BLSP2_QUP5_BCR 10 +#define GCC_BLSP2_QUP6_BCR 11 +#define GCC_PCIE_0_BCR 12 +#define GCC_PDM_BCR 13 +#define GCC_SDCC2_BCR 14 +#define GCC_SDCC4_BCR 15 +#define GCC_TSIF_BCR 16 +#define GCC_UFS_BCR 17 +#define GCC_USB_30_BCR 18 + +#endif -- cgit v1.2.3 From 8b4aa4d8dc283a408eb14ca5e27e0a9a689c2e5a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 5 Apr 2018 12:03:05 -0400 Subject: media: mmp-camera.h: add missing platform data Those definitions used to be part of the original patch: https://patchwork.kernel.org/patch/2815221/ But, somehow, nobody ever noticed until today. Years later, Arnd discovered that the mmp-camera driver doesn't build and made it depend on BROKEN. Add the missing bits here, in order to remove the BROKEN dependency.
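For orientation, a minimal sketch of how a board file might fill in this platform data once the fields added below exist; every value here (the GPIO numbers, clock settings and the V4L2_MBUS_PARALLEL choice) is a made-up placeholder, not taken from any real board:

/* Hypothetical board code, illustrating the fields added by this patch */
static struct mmp_camera_platform_data board_cam_pdata = {
	.sensor_power_gpio = 10,		/* placeholder GPIO */
	.sensor_reset_gpio = 11,		/* placeholder GPIO */
	.bus_type = V4L2_MBUS_PARALLEL,	/* DVP rather than MIPI */
	.mclk_min = 24000000,		/* placeholder minimal MCLK */
	.mclk_src = 3,			/* placeholder clock source */
	.mclk_div = 13,			/* placeholder divider */
	.lane = 0,			/* 0 means DVP mode, per the comment below */
};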
Signed-off-by: Mauro Carvalho Chehab --- include/linux/platform_data/media/mmp-camera.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/media/mmp-camera.h b/include/linux/platform_data/media/mmp-camera.h index 83804028115c..d2d3a443eedf 100644 --- a/include/linux/platform_data/media/mmp-camera.h +++ b/include/linux/platform_data/media/mmp-camera.h @@ -3,8 +3,27 @@ * Information for the Marvell Armada MMP camera */ +#include <media/v4l2-mediabus.h> + +enum dphy3_algo { + DPHY3_ALGO_DEFAULT = 0, + DPHY3_ALGO_PXA910, + DPHY3_ALGO_PXA2128 +}; + struct mmp_camera_platform_data { struct platform_device *i2c_device; int sensor_power_gpio; int sensor_reset_gpio; + enum v4l2_mbus_type bus_type; + int mclk_min; /* The minimal value of MCLK */ + int mclk_src; /* which clock source the MCLK derives from */ + int mclk_div; /* Clock Divider Value for MCLK */ + /* + * MIPI support + */ + int dphy[3]; /* DPHY: CSI2_DPHY3, CSI2_DPHY5, CSI2_DPHY6 */ + enum dphy3_algo dphy3_algo; /* algorithm used to calculate CSI2_DPHY3 */ + int lane; /* ccic used lane number; 0 means DVP mode */ + int lane_clk; }; -- cgit v1.2.3 From 190d7f02ce8ef6774a69d3ec18c288c8a9601a4e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 8 Dec 2017 15:28:18 +0100 Subject: HID: input: do not increment usages when a duplicate is found This is something that bothered us for a long time. When hid-input doesn't know how to map a usage, it uses *_MISC. But there is something else which increments the usage if the evdev code is already used. This leads to a few issues: - some devices may have their ABS_X mapped to ABS_Y if they export a bad set of usages (see the DragonRise joysticks IIRC -> fixed in a specific HID driver) - *_MISC + N might (will) conflict with other defined axes (my Logitech H800 exports some multitouch axes because of that) - this prevents us from freely adding new evdev usages, because "hey, my headset will now report ABS_COFFEE, and it's not coffee capable". So let's try to kill this nonsense, and hope we won't break too many devices. In my headset case, the ABS_MISC axes are created because of some proprietary usages, so we might not break that many devices. For backward compatibility, a quirk HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE is created and can be applied to any device that needs this behavior.
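To make that backward-compatibility path concrete, here is a minimal sketch of a device-specific driver opting back into the old incrementing behavior via the new quirk; the driver name and VID/PID are hypothetical placeholders, only HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE comes from this patch:

/* Hypothetical driver: restore *_MISC + N incrementing for one device */
#include <linux/hid.h>
#include <linux/module.h>

static int example_probe(struct hid_device *hdev, const struct hid_device_id *id)
{
	int ret;

	/* Opt in: duplicated usages get incremented instead of ignored */
	hdev->quirks |= HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE;

	ret = hid_parse(hdev);
	if (ret)
		return ret;

	return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
}

static const struct hid_device_id example_devices[] = {
	{ HID_USB_DEVICE(0x1234, 0x5678) },	/* placeholder VID/PID */
	{ }
};
MODULE_DEVICE_TABLE(hid, example_devices);

static struct hid_driver example_driver = {
	.name = "example-increment-usage",
	.id_table = example_devices,
	.probe = example_probe,
};
module_hid_driver(example_driver);
MODULE_LICENSE("GPL");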
Signed-off-by: Benjamin Tissoires Acked-by: Peter Hutterer Acked-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 33 +++++++++++++++++++++++++++++++-- include/linux/hid.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 6836a856c243..04056773102e 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1100,8 +1100,31 @@ mapped: set_bit(usage->type, input->evbit); - while (usage->code <= max && test_and_set_bit(usage->code, bit)) - usage->code = find_next_zero_bit(bit, max + 1, usage->code); + /* + * This part is *really* controversial: + * - HID aims at being generic so we should do our best to export + * all incoming events + * - HID describes what events are, so there is no reason for ABS_X + * to be mapped to ABS_Y + * - HID is using *_MISC+N as a default value, but nothing prevents + * *_MISC+N from overwriting a legitimate event, which confuses userspace + * (for instance ABS_MISC + 7 is ABS_MT_SLOT, which has a different + * processing) + * + * If devices still want to use this (at their own risk), they will + * have to use the quirk HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE, but + * the default should be a reliable mapping. + */ + while (usage->code <= max && test_and_set_bit(usage->code, bit)) { + if (device->quirks & HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE) { + usage->code = find_next_zero_bit(bit, + max + 1, + usage->code); + } else { + device->status |= HID_STAT_DUP_DETECTED; + goto ignore; + } + } if (usage->code > max) goto ignore; @@ -1611,6 +1634,8 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) INIT_LIST_HEAD(&hid->inputs); INIT_WORK(&hid->led_work, hidinput_led_worker); + hid->status &= ~HID_STAT_DUP_DETECTED; + if (!force) { for (i = 0; i < hid->maxcollection; i++) { struct hid_collection *col = &hid->collection[i]; @@ -1677,6 +1702,10 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) goto out_unwind; } + if (hid->status & HID_STAT_DUP_DETECTED) + hid_dbg(hid, + "Some usages could not be mapped, please use HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE if this is legitimate.\n"); + return 0; out_unwind: diff --git a/include/linux/hid.h b/include/linux/hid.h index 8da3e1f48195..0267aa5c1ea3 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -345,6 +345,7 @@ struct hid_item { #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID BIT(17) #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP BIT(18) #define HID_QUIRK_HAVE_SPECIAL_DRIVER BIT(19) +#define HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE BIT(20) #define HID_QUIRK_FULLSPEED_INTERVAL BIT(28) #define HID_QUIRK_NO_INIT_REPORTS BIT(29) #define HID_QUIRK_NO_IGNORE BIT(30) @@ -502,6 +503,7 @@ struct hid_output_fifo { #define HID_STAT_ADDED BIT(0) #define HID_STAT_PARSED BIT(1) +#define HID_STAT_DUP_DETECTED BIT(2) struct hid_input { struct list_head list; -- cgit v1.2.3 From 5ab073ffd326480a6185d096e9703f62ef92b86c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:26 +0200 Subject: xdp: introduce xdp_return_frame API and use in cpumap Introduce an xdp_return_frame API, and convert over cpumap as the first user, given it has a queued XDP frame structure to leverage. V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck. V6: Remove comment that id will be added later (Req by Alex Duyck) V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- include/net/xdp.h | 27 ++++++++++++++++++++++++ kernel/bpf/cpumap.c | 60 ++++++++++++++++++++++++++++++++--------------------- net/core/xdp.c | 18 ++++++++++++++++ 3 files changed, 81 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b2362ddfa694..e4207699c410 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -33,16 +33,43 @@ * also mandatory during RX-ring setup. */ +enum xdp_mem_type { + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_MAX, +}; + +struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ + +static inline +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) + page_frag_free(data); + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator); #endif /* __LINUX_NET_XDP_H__ */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a4bb0b34375a..3e4bbcbe3e86 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -19,6 +19,7 @@ #include #include #include +#include <net/xdp.h> #include #include @@ -137,27 +138,6 @@ free_cmap: return ERR_PTR(err); } -static void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. - */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { atomic_inc(&rcpu->refcnt); @@ -188,6 +168,10 @@ struct xdp_pkt { u16 len; u16 headroom; u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; struct net_device *dev_rx; }; @@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); xdp_pkt->metasize = metasize; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_pkt->mem = xdp->rxq->mem; + return xdp_pkt; } @@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, return skb; } +static void __cpu_map_ring_cleanup(struct ptr_ring *ring) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once.
+ */ + struct xdp_pkt *xdp_pkt; + + while ((xdp_pkt = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdp_pkt)) + xdp_return_frame(xdp_pkt, &xdp_pkt->mem); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); + kfree(rcpu); + } +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data) skb = cpu_map_build_skb(rcpu, xdp_pkt); if (!skb) { - page_frag_free(xdp_pkt); + xdp_return_frame(xdp_pkt, &xdp_pkt->mem); continue; } @@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - void *xdp_pkt = bq->q[i]; + struct xdp_pkt *xdp_pkt = bq->q[i]; int err; err = __ptr_ring_produce(q, xdp_pkt); if (err) { drops++; - page_frag_free(xdp_pkt); /* Free xdp_pkt */ + xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem); } processed++; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 097a0f74e004..7e6b3545277d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -71,3 +71,21 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) return (xdp_rxq->reg_state == REG_STATE_REGISTERED); } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + if (type >= MEM_TYPE_MAX) + return -EINVAL; + + xdp_rxq->mem.type = type; + + if (allocator) + return -EOPNOTSUPP; + + /* TODO: Allocate an ID that maps to allocator pointer + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + */ + return 0; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -- cgit v1.2.3 From 106ca27f2922e8de820d1bd3d79b1cbdf2d78eea Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:37 +0200 Subject: xdp: move struct xdp_buff from filter.h to xdp.h This is done to prepare for the next patch, and it is also nice to move this XDP related struct out of filter.h. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 24 +----------------------- include/net/xdp.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index fc4e8f91b03d..4da8b2308174 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -30,6 +30,7 @@ struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; +struct xdp_buff; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -500,14 +501,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct xdp_buff { - void *data; - void *data_end; - void *data_meta; - void *data_hard_start; - struct xdp_rxq_info *rxq; -}; - struct sk_msg_buff { void *data; void *data_end; @@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); -/* Drivers not supporting XDP metadata can use this helper, which - * rejects any room expansion for metadata as a result. 
- */ -static __always_inline void -xdp_set_data_meta_invalid(struct xdp_buff *xdp) -{ - xdp->data_meta = xdp->data + 1; -} - -static __always_inline bool -xdp_data_meta_unsupported(const struct xdp_buff *xdp) -{ - return unlikely(xdp->data_meta > xdp->data); -} - void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); diff --git a/include/net/xdp.h b/include/net/xdp.h index e4207699c410..15f8ade008b5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -50,6 +50,13 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + struct xdp_rxq_info *rxq; +}; static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) @@ -72,4 +79,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + #endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From c0048cff8abb69c956ce1277d17a3f7a14e41522 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:42 +0200 Subject: xdp: introduce a new xdp_frame type This is needed to convert drivers tuntap and virtio_net. This is a generalization of what is done inside cpumap, which will be converted later. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 15f8ade008b5..756c42811e78 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -58,6 +58,46 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct xdp_frame { + void *data; + u16 len; + u16 headroom; + u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; +}; + +/* Convert xdp_buff to xdp_frame */ +static inline +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +{ + struct xdp_frame *xdp_frame; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? 
metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) + return NULL; + + /* Store info in top of packet */ + xdp_frame = xdp->data_hard_start; + + xdp_frame->data = xdp->data; + xdp_frame->len = xdp->data_end - xdp->data; + xdp_frame->headroom = headroom - sizeof(*xdp_frame); + xdp_frame->metasize = metasize; + + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_frame->mem = xdp->rxq->mem; + + return xdp_frame; +} + static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) { -- cgit v1.2.3 From 1ffcbc8537d0bc32aaca7000cb9c904ec4b6300f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:47 +0200 Subject: tun: convert to use generic xdp_frame and xdp_return_frame API The tuntap driver invented its own driver-specific way of queuing XDP packets, by storing the xdp_buff information in the top of the XDP frame data. Convert it over to use the more generic xdp_frame structure. The main problem with the in-driver method is that the xdp_rxq_info pointer cannot be trusted/used when dequeueing the frame. V3: Remove check based on feedback from Jason Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/tun.c | 43 ++++++++++++++++++++----------------------- drivers/vhost/net.c | 7 ++++--- include/linux/if_tun.h | 4 ++-- 3 files changed, 26 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 28583aa0c17d..2c85e5cac2a9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -248,11 +248,11 @@ struct veth { __be16 h_vlan_TCI; }; -bool tun_is_xdp_buff(void *ptr) +bool tun_is_xdp_frame(void *ptr) { return (unsigned long)ptr & TUN_XDP_FLAG; } -EXPORT_SYMBOL(tun_is_xdp_buff); +EXPORT_SYMBOL(tun_is_xdp_frame); void *tun_xdp_to_ptr(void *ptr) { @@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr) { if (!ptr) return; - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); - - put_page(virt_to_head_page(xdp->data)); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); + + xdp_return_frame(xdpf->data, &xdpf->mem); } else { __skb_array_destroy_skb(ptr); } @@ -1298,17 +1298,14 @@ static const struct net_device_ops tun_netdev_ops = { static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) { struct tun_struct *tun = netdev_priv(dev); - struct xdp_buff *buff = xdp->data_hard_start; - int headroom = xdp->data - xdp->data_hard_start; + struct xdp_frame *frame; struct tun_file *tfile; u32 numqueues; int ret = 0; - /* Assure headroom is available and buff is properly aligned */ - if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp))) - return -ENOSPC; - - *buff = *xdp; + frame = convert_to_xdp_frame(xdp); + if (unlikely(!frame)) + return -EOVERFLOW; rcu_read_lock(); @@ -1323,7 +1320,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff.
*/ - if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) { + if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) { this_cpu_inc(tun->pcpu_stats->tx_dropped); ret = -ENOSPC; } @@ -2001,11 +1998,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, - struct xdp_buff *xdp, + struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; - size_t size = xdp->data_end - xdp->data; + size_t size = xdp_frame->len; struct tun_pcpu_stats *stats; size_t ret; @@ -2021,7 +2018,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } - ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz; + ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); @@ -2189,11 +2186,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, return err; } - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - ret = tun_put_user_xdp(tun, tfile, xdp, to); - put_page(virt_to_head_page(xdp->data)); + ret = tun_put_user_xdp(tun, tfile, xdpf, to); + xdp_return_frame(xdpf->data, &xdpf->mem); } else { struct sk_buff *skb = ptr; @@ -2432,10 +2429,10 @@ out_free: static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - return xdp->data_end - xdp->data; + return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 986058a57917..bbf38befefb2 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -32,6 +32,7 @@ #include #include +#include <net/xdp.h> #include "vhost.h" @@ -181,10 +182,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) static int vhost_net_buf_peek_len(void *ptr) { - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - return xdp->data_end - xdp->data; + return xdpf->len; } return __skb_array_len_with_tag(ptr); diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index fd00170b494f..3d2996dc7d85 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -22,7 +22,7 @@ #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_buff(void *ptr); +bool tun_is_xdp_frame(void *ptr); void *tun_xdp_to_ptr(void *ptr); void *tun_ptr_to_xdp(void *ptr); void tun_ptr_free(void *ptr); @@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } -static inline bool tun_is_xdp_buff(void *ptr) +static inline bool tun_is_xdp_frame(void *ptr) { return false; } -- cgit v1.2.3 From 70280ed91cb8acb43e8fd7a8094840846c172ac5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:57 +0200 Subject: bpf: cpumap convert to use generic xdp_frame The generic xdp_frame format was inspired by cpumap's own internal xdp_pkt format. It is now time to convert it over to the generic xdp_frame format. The cpumap needs one extra field, dev_rx. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- include/net/xdp.h | 1 + kernel/bpf/cpumap.c | 100 +++++++++++++++------------------------------------- 2 files changed, 29 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 756c42811e78..ea3773f94f65 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -67,6 +67,7 @@ struct xdp_frame { * while mem info is valid on remote CPU. */ struct xdp_mem_info mem; + struct net_device *dev_rx; /* used by cpumap */ }; /* Convert xdp_buff to xdp_frame */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3e4bbcbe3e86..bcdc4dea5ce7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -159,52 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -/* For now, xdp_pkt is a cpumap internal data structure, with info - * carried between enqueue to dequeue. It is mapped into the top - * headroom of the packet, to avoid allocating separate mem. - */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; - u16 metasize; - /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. - */ - struct xdp_mem_info mem; - struct net_device *dev_rx; -}; - -/* Convert xdp_buff to xdp_pkt */ -static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) -{ - struct xdp_pkt *xdp_pkt; - int metasize; - int headroom; - - /* Assure headroom is available for storing info */ - headroom = xdp->data - xdp->data_hard_start; - metasize = xdp->data - xdp->data_meta; - metasize = metasize > 0 ? metasize : 0; - if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) - return NULL; - - /* Store info in top of packet */ - xdp_pkt = xdp->data_hard_start; - - xdp_pkt->data = xdp->data; - xdp_pkt->len = xdp->data_end - xdp->data; - xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); - xdp_pkt->metasize = metasize; - - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_pkt->mem = xdp->rxq->mem; - - return xdp_pkt; -} - static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) + struct xdp_frame *xdpf) { unsigned int frame_size; void *pkt_data_start; @@ -219,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * would be preferred to set frame_size to 2048 or 4096 * depending on the driver. * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_pkt); + * frame_len = frame_size - sizeof(*xdp_frame); * * Instead, with info avail, skb_shared_info in placed after * packet len. This, unfortunately fakes the truesize. @@ -227,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. 
*/ - frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + pkt_data_start = xdpf->data - xdpf->headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdp_pkt->headroom); - __skb_put(skb, xdp_pkt->len); - if (xdp_pkt->metasize) - skb_metadata_set(skb, xdp_pkt->metasize); + skb_reserve(skb, xdpf->headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + skb->protocol = eth_type_trans(skb, xdpf->dev_rx); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) @@ -259,11 +215,11 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - while ((xdp_pkt = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdp_pkt)) - xdp_return_frame(xdp_pkt, &xdp_pkt->mem); + while ((xdpf = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdpf)) + xdp_return_frame(xdpf->data, &xdpf->mem); } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -290,7 +246,7 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -313,13 +269,13 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + while ((xdpf = __ptr_ring_consume(rcpu->queue))) { struct sk_buff *skb; int ret; - skb = cpu_map_build_skb(rcpu, xdp_pkt); + skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - xdp_return_frame(xdp_pkt, &xdp_pkt->mem); + xdp_return_frame(xdpf->data, &xdpf->mem); continue; } @@ -616,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - struct xdp_pkt *xdp_pkt = bq->q[i]; + struct xdp_frame *xdpf = bq->q[i]; int err; - err = __ptr_ring_produce(q, xdp_pkt); + err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem); + xdp_return_frame(xdpf->data, &xdpf->mem); } processed++; } @@ -637,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -648,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * - * Thus, incoming xdp_pkt is always queued here (else we race + * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. 
- bq->q[bq->count++] = xdp_pkt; + bq->q[bq->count++] = xdpf; return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - xdp_pkt = convert_to_xdp_pkt(xdp); - if (unlikely(!xdp_pkt)) + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ - xdp_pkt->dev_rx = dev_rx; + xdpf->dev_rx = dev_rx; - bq_enqueue(rcpu, xdp_pkt); + bq_enqueue(rcpu, xdpf); return 0; } -- cgit v1.2.3 From 8d5d88527587516bd58ff0f3810f07c38e65e2be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:12 +0200 Subject: xdp: rhashtable with allocator ID to pointer mapping Use the IDA infrastructure for getting a cyclic increasing ID number, that is used for keeping track of each registered allocator per RX-queue xdp_rxq_info. Instead of using the IDR infrastructure, which uses a radix tree, use a dynamic rhashtable, for creating ID to pointer lookup table, because this is faster. The problem that is being solved here is that the xdp_rxq_info pointer (stored in xdp_buff) cannot be used directly, as the guaranteed lifetime is too short. The info is needed on a (potentially) remote CPU during DMA-TX completion time. In an xdp_frame the xdp_mem_info is stored, when it got converted from an xdp_buff, which is sufficient for the simple page refcnt based recycle schemes. For more advanced allocators there is a need to store a pointer to the registered allocator. Thus, there is a need to guard the lifetime or validity of the allocator pointer, which is done through this rhashtable ID map to pointer. The removal and validity of the allocator and helper struct xdp_mem_allocator is guarded by RCU. The allocator will be created by the driver, and registered with xdp_rxq_info_reg_mem_model(). It is up for debate who is responsible for freeing the allocator pointer or invoking the allocator destructor function. In any case, this must happen via RCU freeing. V4: Per req of Jason Wang - Use xdp_rxq_info_reg_mem_model() in all drivers implementing XDP_REDIRECT, even though it's not strictly necessary when allocator==NULL for type MEM_TYPE_PAGE_SHARED (given it's zero). V6: Per req of Alex Duyck - Introduce rhashtable_lookup() call in later patch V8: Address sparse "should be static" warnings (from kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 +- drivers/net/tun.c | 6 + drivers/net/virtio_net.c | 7 + include/net/xdp.h | 14 +- net/core/xdp.c | 223 +++++++++++++++++++++++++- 5 files changed, 241 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 0bfe6cf2bf8b..f10904ec2172 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6370,7 +6370,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, struct device *dev = rx_ring->dev; int orig_node = dev_to_node(dev); int ring_node = -1; - int size; + int size, err; size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; @@ -6407,6 +6407,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, rx_ring->queue_index) < 0) goto err; + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err) { + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + goto err; + } + rx_ring->xdp_prog = adapter->xdp_prog; return 0; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c85e5cac2a9..283bde85c455 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -854,6 +854,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, tun->dev, tfile->queue_index); if (err < 0) goto out; + err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err < 0) { + xdp_rxq_info_unreg(&tfile->xdp_rxq); + goto out; + } err = 0; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f50e1ad81ad4..42d338fe9a8d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1305,6 +1305,13 @@ static int virtnet_open(struct net_device *dev) if (err < 0) return err; + err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err < 0) { + xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); + return err; + } + virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi); } diff --git a/include/net/xdp.h b/include/net/xdp.h index ea3773f94f65..5f67c62540aa 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,6 +41,7 @@ enum xdp_mem_type { struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; }; struct xdp_rxq_info { @@ -99,18 +100,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -static inline -void xdp_return_frame(void *data, struct xdp_mem_info *mem) -{ - if (mem->type == MEM_TYPE_PAGE_SHARED) - page_frag_free(data); - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - - put_page(page); - } -} +void xdp_return_frame(void *data, struct xdp_mem_info *mem); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/net/core/xdp.c b/net/core/xdp.c index 7e6b3545277d..8b2cb79b5de0 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -5,6 +5,9 @@ */ #include #include +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/rhashtable.h> #include @@ -13,6 +16,99 @@ #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +struct xdp_mem_allocator { + struct xdp_mem_info mem; + void *allocator; + struct rhash_head node; + struct rcu_head
rcu; +}; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ + return key << RHT_HASH_RESERVED_SPACE; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + + /* TODO: Depending on allocator type/pointer free resources */ + + /* Poison memory */ + xa->mem.id = 0xFFFF; + xa->mem.type = 0xF0F0; + xa->allocator = (void *)0xDEAD9001; + + kfree(xa); +} + +static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + struct xdp_mem_allocator *xa; + int id = xdp_rxq->mem.id; + int err; + + if (id == 0) + return; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + + err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); + WARN_ON(err); + + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -21,8 +117,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; + + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -72,20 +174,131 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. 
+ */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + if (type >= MEM_TYPE_MAX) return -EINVAL; xdp_rxq->mem.type = type; - if (allocator) - return -EOPNOTSUPP; + if (!allocator) + return 0; + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) { + WARN_ON(1); + return ret; + } + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return -ENOMEM; + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + xdp_rxq->mem.id = id; + xdp_alloc->mem = xdp_rxq->mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + errno = PTR_ERR(ptr); + goto err; + } + + mutex_unlock(&mem_id_lock); - /* TODO: Allocate an ID that maps to allocator pointer - * See: https://www.kernel.org/doc/html/latest/core-api/idr.html - */ return 0; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return errno; } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) { + page_frag_free(data); + return; + } + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} +EXPORT_SYMBOL_GPL(xdp_return_frame); -- cgit v1.2.3 From ff7d6b27f894f1469dc51ccb828b7363ccd9799f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:17 +0200 Subject: page_pool: refurbish version of page_pool code Need a fast page recycle mechanism for ndo_xdp_xmit API for returning pages on DMA-TX completion time, which have good cross CPU performance, given DMA-TX completion time can happen on a remote CPU. Refurbish my page_pool code, that was presented[1] at MM-summit 2016. Adapted page_pool code to not depend the page allocator and integration into struct page. The DMA mapping feature is kept, even-though it will not be activated/used in this patchset. [1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf V2: Adjustments requested by Tariq - Changed page_pool_create return codes, don't return NULL, only ERR_PTR, as this simplifies err handling in drivers. V4: many small improvements and cleanups - Add DOC comment section, that can be used by kernel-doc - Improve fallback mode, to work better with refcnt based recycling e.g. remove a WARN as pointed out by Tariq e.g. quicker fallback if ptr_ring is empty. 
V5: Fixed SPDX license as pointed out by Alexei V6: Adjustments requested by Eric Dumazet - Adjust ____cacheline_aligned_in_smp usage/placement - Move rcu_head in struct page_pool - Free pages quicker on destroy, minimize resources delayed an RCU period - Remove code for forward/backward compat ABI interface V8: Issues found by kbuild test robot - Address sparse should be static warnings - Only compile+link when a driver use/select page_pool, mlx5 selects CONFIG_PAGE_POOL, although its first used in two patches Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 + include/net/page_pool.h | 129 ++++++++++ net/Kconfig | 3 + net/core/Makefile | 1 + net/core/page_pool.c | 317 ++++++++++++++++++++++++ 5 files changed, 451 insertions(+) create mode 100644 include/net/page_pool.h create mode 100644 net/core/page_pool.c (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index c032319f1cb9..12257034131e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -30,6 +30,7 @@ config MLX5_CORE_EN bool "Mellanox Technologies ConnectX-4 Ethernet support" depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE depends on IPV6=y || IPV6=n || MLX5_CORE=m + select PAGE_POOL default n ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. diff --git a/include/net/page_pool.h b/include/net/page_pool.h new file mode 100644 index 000000000000..1fe77db59518 --- /dev/null +++ b/include/net/page_pool.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.h + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * If page_pool handles DMA mapping (use page->private), then API user + * is responsible for invoking page_pool_put_page() once. In-case of + * elevated refcnt, the DMA state is released, assuming other users of + * the page will eventually call put_page(). + * + * If no DMA mapping is done, then it can act as shim-layer that + * fall-through to alloc_page. As no state is kept on the page, the + * regular put_page() call is sufficient. + */ +#ifndef _NET_PAGE_POOL_H +#define _NET_PAGE_POOL_H + +#include /* Needed by ptr_ring */ +#include +#include + +#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ +#define PP_FLAG_ALL PP_FLAG_DMA_MAP + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. 
+ */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + void *cache[PP_ALLOC_CACHE_SIZE]; +}; + +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; /* Numa node id to allocate from pages from */ + struct device *dev; /* device, for DMA pre-mapping purposes */ + enum dma_data_direction dma_dir; /* DMA mapping direction */ +}; + +struct page_pool { + struct rcu_head rcu; + struct page_pool_params p; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * effeciently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +void page_pool_destroy(struct page_pool *pool); + +/* Never call this directly, use helpers below */ +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct); + +static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +{ + __page_pool_put_page(pool, page, false); +} +/* Very limited use-cases allow recycle direct */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + __page_pool_put_page(pool, page, true); +} + +#endif /* _NET_PAGE_POOL_H */ diff --git a/net/Kconfig b/net/Kconfig index 0428f12c25c2..6fa1a4493b8c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -423,6 +423,9 @@ config MAY_USE_DEVLINK on MAY_USE_DEVLINK to ensure they do not cause link errors when devlink is a loadable module and the driver using it is built-in. +config PAGE_POOL + bool + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/core/Makefile b/net/core/Makefile index 6dbbba8c57ae..7080417f8bc8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ fib_notifier.o xdp.o obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..68bf07206744 --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. 
+ */ +#include +#include +#include + +#include +#include +#include +#include +#include /* for __put_page() */ + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) +{ + unsigned int ring_qsize = 1024; /* Default */ + + memcpy(&pool->p, params, sizeof(pool->p)); + + /* Validate only known flags were used */ + if (pool->p.flags & ~(PP_FLAG_ALL)) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = pool->p.pool_size; + + /* Sanity limit mem that can be pinned down */ + if (ring_qsize > 32768) + return -E2BIG; + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. + */ + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + return -ENOMEM; + + return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + struct page_pool *pool; + int err = 0; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params); + if (err < 0) { + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); + } + return pool; +} +EXPORT_SYMBOL(page_pool_create); + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Test for safe-context, caller should provide this guarantee */ + if (likely(in_serving_softirq())) { + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + return page; + } + /* Slower-path: Alloc array empty, time to refill + * + * Open-coded bulk ptr_ring consumer. + * + * Discussion: the ring consumer lock is not really + * needed due to the softirq/NAPI protection, but + * later need the ability to reclaim pages on the + * ring. Thus, keeping the locks. + */ + spin_lock(&r->consumer_lock); + while ((page = __ptr_ring_consume(r))) { + if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) + break; + pool->alloc.cache[pool->alloc.count++] = page; + } + spin_unlock(&r->consumer_lock); + return page; + } + + /* Slow-path: Get page from locked ring queue */ + page = ptr_ring_consume(&pool->ring); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t _gfp) +{ + struct page *page; + gfp_t gfp = _gfp; + dma_addr_t dma; + + /* We could always set __GFP_COMP, and avoid this branch, as + * prep_new_page() can handle order-0 with __GFP_COMP. + */ + if (pool->p.order) + gfp |= __GFP_COMP; + + /* FUTURE development: + * + * Current slow-path essentially falls back to single page + * allocations, which doesn't improve performance. This code + * need bulk allocation support from the page allocator code. + */ + + /* Cache was empty, do real allocation */ + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (!page) + return NULL; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + goto skip_dma_map; + + /* Setup DMA mapping: use page->private for DMA-addr + * This mapping is kept for lifetime of page, until leaving pool. 
+ */ + dma = dma_map_page(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir); + if (dma_mapping_error(pool->p.dev, dma)) { + put_page(page); + return NULL; + } + set_page_private(page, dma); /* page->private = dma; */ + +skip_dma_map: + /* When page just alloc'ed is should/must have refcnt 1. */ + return page; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. + */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + struct page *page; + + /* Fast-path: Get a page from cache */ + page = __page_pool_get_cached(pool); + if (page) + return page; + + /* Slow-path: cache empty, do real allocation */ + page = __page_pool_alloc_pages_slow(pool, gfp); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Cleanup page_pool state from page */ +static void __page_pool_clean_page(struct page_pool *pool, + struct page *page) +{ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return; + + /* DMA unmap */ + dma_unmap_page(pool->p.dev, page_private(page), + PAGE_SIZE << pool->p.order, pool->p.dma_dir); + set_page_private(page, 0); +} + +/* Return a page to the page allocator, cleaning up our state */ +static void __page_pool_return_page(struct page_pool *pool, struct page *page) +{ + __page_pool_clean_page(pool, page); + put_page(page); + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool __page_pool_recycle_into_ring(struct page_pool *pool, + struct page *page) +{ + int ret; + /* BH protection not needed if current is serving softirq */ + if (in_serving_softirq()) + ret = ptr_ring_produce(&pool->ring, page); + else + ret = ptr_ring_produce_bh(&pool->ring, page); + + return (ret == 0) ? true : false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool __page_pool_recycle_direct(struct page *page, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + return false; + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = page; + return true; +} + +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + */ + if (likely(page_ref_count(page) == 1)) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + if (allow_direct && in_serving_softirq()) + if (__page_pool_recycle_direct(page, pool)) + return; + + if (!__page_pool_recycle_into_ring(pool, page)) { + /* Cache full, fallback to free pages */ + __page_pool_return_page(pool, page); + } + return; + } + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. 
In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. + */ + __page_pool_clean_page(pool, page); + put_page(page); +} +EXPORT_SYMBOL(__page_pool_put_page); + +static void __page_pool_empty_ring(struct page_pool *pool) +{ + struct page *page; + + /* Empty recycle ring */ + while ((page = ptr_ring_consume(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(page_ref_count(page) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, page_ref_count(page)); + + __page_pool_return_page(pool, page); + } +} + +static void __page_pool_destroy_rcu(struct rcu_head *rcu) +{ + struct page_pool *pool; + + pool = container_of(rcu, struct page_pool, rcu); + + WARN(pool->alloc.count, "API usage violation"); + + __page_pool_empty_ring(pool); + ptr_ring_cleanup(&pool->ring, NULL); + kfree(pool); +} + +/* Cleanup and release resources */ +void page_pool_destroy(struct page_pool *pool) +{ + struct page *page; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + __page_pool_empty_ring(pool); + + /* An xdp_mem_allocator can still ref page_pool pointer */ + call_rcu(&pool->rcu, __page_pool_destroy_rcu); +} +EXPORT_SYMBOL(page_pool_destroy); -- cgit v1.2.3 From 57d0a1c1ac9e6a836bbab4698ba2a2e03f64bf1b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:22 +0200 Subject: xdp: allow page_pool as an allocator type in xdp_return_frame New allocator type MEM_TYPE_PAGE_POOL for page_pool usage. The registered allocator page_pool pointer is not available directly from xdp_rxq_info, but it could be (if needed). For now, the driver should keep separate track of the page_pool pointer, which it should use for RX-ring page allocation. As suggested by Saeed, to maintain a symmetric API it is the drivers responsibility to allocate/create and free/destroy the page_pool. Thus, after the driver have called xdp_rxq_info_unreg(), it is drivers responsibility to free the page_pool, but with a RCU free call. This is done easily via the page_pool helper page_pool_destroy() (which avoids touching any driver code during the RCU callback, which could happen after the driver have been unloaded). V8: address issues found by kbuild test robot - Address sparse should be static warnings - Allow xdp.o to be compiled without page_pool.o V9: Remove inline from .c file, compiler knows best Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 14 ++++++++++++ include/net/xdp.h | 3 +++ net/core/xdp.c | 60 +++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 65 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1fe77db59518..c79087153148 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -117,7 +117,12 @@ void __page_pool_put_page(struct page_pool *pool, static inline void page_pool_put_page(struct page_pool *pool, struct page *page) { + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 
+ */ +#ifdef CONFIG_PAGE_POOL __page_pool_put_page(pool, page, false); +#endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, @@ -126,4 +131,13 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 5f67c62540aa..d0ee437753dc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -36,6 +36,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_PAGE_POOL, MEM_TYPE_MAX, }; @@ -44,6 +45,8 @@ struct xdp_mem_info { u32 id; }; +struct page_pool; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; diff --git a/net/core/xdp.c b/net/core/xdp.c index 8b2cb79b5de0..33e382afbd95 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -27,7 +28,10 @@ static struct rhashtable *mem_id_ht; struct xdp_mem_allocator { struct xdp_mem_info mem; - void *allocator; + union { + void *allocator; + struct page_pool *page_pool; + }; struct rhash_head node; struct rcu_head rcu; }; @@ -74,7 +78,9 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* TODO: Depending on allocator type/pointer free resources */ + /* Notice, driver is expected to free the *allocator, + * e.g. page_pool, and MUST also use RCU free. + */ /* Poison memory */ xa->mem.id = 0xFFFF; @@ -225,6 +231,17 @@ again: return id; } +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { @@ -238,13 +255,16 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, return -EFAULT; } - if (type >= MEM_TYPE_MAX) - return -EINVAL; + if (!__is_supported_mem_type(type)) + return -EOPNOTSUPP; xdp_rxq->mem.type = type; - if (!allocator) + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL) + return -EINVAL; /* Setup time check page_pool req */ return 0; + } /* Delay init of rhashtable to save memory if feature isn't used */ if (!mem_id_init) { @@ -290,15 +310,31 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); void xdp_return_frame(void *data, struct xdp_mem_info *mem) { - if (mem->type == MEM_TYPE_PAGE_SHARED) { + struct xdp_mem_allocator *xa; + struct page *page; + + switch (mem->type) { + case MEM_TYPE_PAGE_POOL: + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_put_page(xa->page_pool, page); + else + put_page(page); + rcu_read_unlock(); + break; + case MEM_TYPE_PAGE_SHARED: page_frag_free(data); - return; - } - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - + break; + case MEM_TYPE_PAGE_ORDER0: + page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); + break; + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + break; } } 
EXPORT_SYMBOL_GPL(xdp_return_frame); -- cgit v1.2.3 From 039930945a72d9af5ff04ae9b9e60658a52e0770 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:32 +0200 Subject: xdp: transition into using xdp_frame for return API Changing the xdp_return_frame() API to take a struct xdp_frame as argument seems like a natural choice, but there are some subtle performance details here that need extra care. De-referencing the xdp_frame on a remote CPU during DMA-TX completion moves its cache-line into the "Shared" state. Later, when the page is reused for RX, this xdp_frame cache-line is written, which changes the state to "Modified". This situation already happens (naturally) for virtio_net, tun and cpumap, as the xdp_frame pointer is the queued object. In tun and cpumap, the ptr_ring is used for efficiently transferring cache-lines (with pointers) between CPUs. Thus, the only option is to de-reference the xdp_frame. Only the ixgbe driver had an optimization that could avoid de-referencing the xdp_frame: the driver already has a TX-ring queue, which (in case of remote DMA-TX completion) has to be transferred between CPUs anyhow, and in this data area we stored a struct xdp_mem_info and a data pointer, which allowed us to avoid touching the xdp_frame. To compensate for losing this, a prefetchw is used to tell the cache coherency protocol about our access pattern. My benchmarks show that this prefetchw is enough to compensate in the ixgbe driver. V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address and offset in dma_sync call") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 5 ++--- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 4 +--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 17 +++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + drivers/net/tun.c | 4 ++-- drivers/net/virtio_net.c | 2 +- include/net/xdp.h | 2 +- kernel/bpf/cpumap.c | 6 +++--- net/core/xdp.c | 4 +++- 9 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 96c54cbfb1f9..c8bf4d35fdea 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -638,8 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) kfree(tx_buffer->raw_buf); else if (ring_is_xdp(ring)) - xdp_return_frame(tx_buffer->xdpf->data, - &tx_buffer->xdpf->mem); + xdp_return_frame(tx_buffer->xdpf); else dev_kfree_skb_any(tx_buffer->skb); if (dma_unmap_len(tx_buffer, len)) @@ -842,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, /* free the skb/XDP data */ if (ring_is_xdp(tx_ring)) - xdp_return_frame(tx_buf->xdpf->data, &tx_buf->xdpf->mem); + xdp_return_frame(tx_buf->xdpf); else napi_consume_skb(tx_buf->skb, napi_budget); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index abb5248e917e..7dd5038cfcc4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -241,8 +241,7 @@ struct ixgbe_tx_buffer { unsigned long time_stamp; union { struct sk_buff *skb; - /* XDP uses address ptr on irq_clean */ - void *data; + struct xdp_frame *xdpf; }; unsigned int bytecount; unsigned short gso_segs; @@ -250,7 +249,6 @@ struct ixgbe_tx_buffer { DEFINE_DMA_UNMAP_ADDR(dma); DEFINE_DMA_UNMAP_LEN(len); u32 tx_flags; - struct xdp_mem_info xdp_mem; }; struct ixgbe_rx_buffer { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f10904ec2172..4f2864165723 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, /* free the skb */ if (ring_is_xdp(tx_ring)) - xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem); + xdp_return_frame(tx_buffer->xdpf); else napi_consume_skb(tx_buffer->skb, napi_budget); @@ -2386,6 +2386,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; + prefetchw(xdp.data_hard_start); /* xdp_frame write */ skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); } @@ -5797,7 +5798,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring) /* Free all the Tx ring sk_buffs */ if (ring_is_xdp(tx_ring)) - xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem); + xdp_return_frame(tx_buffer->xdpf); else dev_kfree_skb_any(tx_buffer->skb); @@ -8348,16 +8349,21 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()]; struct ixgbe_tx_buffer *tx_buffer; union ixgbe_adv_tx_desc *tx_desc; + struct xdp_frame *xdpf; u32 len, cmd_type; dma_addr_t dma; u16 i; - len = xdp->data_end - xdp->data; + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + len = xdpf->len; if (unlikely(!ixgbe_desc_unused(ring))) return IXGBE_XDP_CONSUMED; - dma 
= dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE); + dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE); if (dma_mapping_error(ring->dev, dma)) return IXGBE_XDP_CONSUMED; @@ -8372,8 +8378,7 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, dma_unmap_len_set(tx_buffer, len, len); dma_unmap_addr_set(tx_buffer, dma, dma); - tx_buffer->data = xdp->data; - tx_buffer->xdp_mem = xdp->rxq->mem; + tx_buffer->xdpf = xdpf; tx_desc->read.buffer_addr = cpu_to_le64(dma); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f42436d7f2d9..7bbf0db27a01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -890,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset, frag_size, DMA_FROM_DEVICE); + prefetchw(va); /* xdp_frame data area */ prefetch(data); wi->offset += frag_size; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 283bde85c455..bec130cdbd9d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -663,7 +663,7 @@ void tun_ptr_free(void *ptr) if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } @@ -2196,7 +2196,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 42d338fe9a8d..ab3d7cbc4c49 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -430,7 +430,7 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, /* Free up any pending old buffers before queueing new ones. 
*/ while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) - xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem); + xdp_return_frame(xdpf_sent); xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) diff --git a/include/net/xdp.h b/include/net/xdp.h index d0ee437753dc..137ad5f9f40f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -void xdp_return_frame(void *data, struct xdp_mem_info *mem); +void xdp_return_frame(struct xdp_frame *xdpf); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index bcdc4dea5ce7..c95b04ec103e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -219,7 +219,7 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) while ((xdpf = ptr_ring_consume(ring))) if (WARN_ON_ONCE(xdpf)) - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -275,7 +275,7 @@ static int cpu_map_kthread_run(void *data) skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); continue; } @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } processed++; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 33e382afbd95..0c86b53a3a63 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,9 +308,11 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -void xdp_return_frame(void *data, struct xdp_mem_info *mem) +void xdp_return_frame(struct xdp_frame *xdpf) { + struct xdp_mem_info *mem = &xdpf->mem; struct xdp_mem_allocator *xa; + void *data = xdpf->data; struct page *page; switch (mem->type) { -- cgit v1.2.3 From 44fa2dbd475996ddc8f3a0e6113dee983e0ee3aa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:37 +0200 Subject: xdp: transition into using xdp_frame for ndo_xdp_xmit Change the ndo_xdp_xmit API to take a struct xdp_frame instead of a struct xdp_buff. This brings xdp_return_frame and ndo_xdp_xmit in sync. This builds towards changing the API further to become a bulk API, because xdp_buff is not a queue-able object while xdp_frame is. V4: Adjust for commit 59655a5b6c83 ("tuntap: XDP_TX can use native XDP") V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 30 +++++++++++++++------------ drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 24 +++++++++++---------- drivers/net/tun.c | 19 ++++++++++------- drivers/net/virtio_net.c | 24 ++++++++++++--------- include/linux/netdevice.h | 4 ++-- net/core/filter.c | 17 +++++++++++++-- 7 files changed, 74 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index c8bf4d35fdea..87fb27ab9c24 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring, #define I40E_XDP_CONSUMED 1 #define I40E_XDP_TX 2 -static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, +static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, struct i40e_ring *xdp_ring); +static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, + struct i40e_ring *xdp_ring) +{ + struct xdp_frame *xdpf = convert_to_xdp_frame(xdp); + + if (unlikely(!xdpf)) + return I40E_XDP_CONSUMED; + + return i40e_xmit_xdp_ring(xdpf, xdp_ring); +} + /** * i40e_run_xdp - run an XDP program * @rx_ring: Rx ring being processed @@ -2233,7 +2244,7 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring, break; case XDP_TX: xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index]; - result = i40e_xmit_xdp_ring(xdp, xdp_ring); + result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring); break; case XDP_REDIRECT: err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); @@ -3480,21 +3491,14 @@ dma_error: * @xdp: data to transmit * @xdp_ring: XDP Tx ring **/ -static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, +static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, struct i40e_ring *xdp_ring) { u16 i = xdp_ring->next_to_use; struct i40e_tx_buffer *tx_bi; struct i40e_tx_desc *tx_desc; - struct xdp_frame *xdpf; + u32 size = xdpf->len; dma_addr_t dma; - u32 size; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return I40E_XDP_CONSUMED; - - size = xdpf->len; if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) { xdp_ring->tx_stats.tx_busy++; @@ -3684,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) * * Returns Zero if sent, else an error code **/ -int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct i40e_netdev_priv *np = netdev_priv(dev); unsigned int queue_index = smp_processor_id(); @@ -3697,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) return -ENXIO; - err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]); + err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); if (err != I40E_XDP_TX) return -ENOSPC; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 857b1d743c8d..4bf318b8be85 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -511,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); void i40e_detect_recover_hung(struct i40e_vsi *vsi); int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); -int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); +int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf); void 
i40e_xdp_flush(struct net_device *dev); /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4f2864165723..51e7d82a5860 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2262,7 +2262,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, #define IXGBE_XDP_TX 2 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, - struct xdp_buff *xdp); + struct xdp_frame *xdpf); static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, struct ixgbe_ring *rx_ring, @@ -2270,6 +2270,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, { int err, result = IXGBE_XDP_PASS; struct bpf_prog *xdp_prog; + struct xdp_frame *xdpf; u32 act; rcu_read_lock(); @@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, if (!xdp_prog) goto xdp_out; + prefetchw(xdp->data_hard_start); /* xdp_frame write */ + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: - result = ixgbe_xmit_xdp_ring(adapter, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) { + result = IXGBE_XDP_CONSUMED; + break; + } + result = ixgbe_xmit_xdp_ring(adapter, xdpf); break; case XDP_REDIRECT: err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog); @@ -2386,7 +2394,6 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - prefetchw(xdp.data_hard_start); /* xdp_frame write */ skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); } @@ -8344,20 +8351,15 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, } static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, - struct xdp_buff *xdp) + struct xdp_frame *xdpf) { struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()]; struct ixgbe_tx_buffer *tx_buffer; union ixgbe_adv_tx_desc *tx_desc; - struct xdp_frame *xdpf; u32 len, cmd_type; dma_addr_t dma; u16 i; - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - len = xdpf->len; if (unlikely(!ixgbe_desc_unused(ring))) @@ -10010,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ring *ring; @@ -10026,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (unlikely(!ring)) return -ENXIO; - err = ixgbe_xmit_xdp_ring(adapter, xdp); + err = ixgbe_xmit_xdp_ring(adapter, xdpf); if (err != IXGBE_XDP_TX) return -ENOSPC; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index bec130cdbd9d..1e58be152d5c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1301,18 +1301,13 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; -static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame) { struct tun_struct *tun = netdev_priv(dev); - struct xdp_frame *frame; struct tun_file *tfile; u32 numqueues; int ret = 0; - frame = convert_to_xdp_frame(xdp); - if (unlikely(!frame)) - return -EOVERFLOW; - rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); @@ -1336,6 +1331,16 @@ out: return ret; } +static int tun_xdp_tx(struct net_device *dev, struct 
xdp_buff *xdp) +{ + struct xdp_frame *frame = convert_to_xdp_frame(xdp); + + if (unlikely(!frame)) + return -EOVERFLOW; + + return tun_xdp_xmit(dev, frame); +} + static void tun_xdp_flush(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -1683,7 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, case XDP_TX: get_page(alloc_frag->page); alloc_frag->offset += buflen; - if (tun_xdp_xmit(tun->dev, &xdp)) + if (tun_xdp_tx(tun->dev, &xdp)) goto err_redirect; tun_xdp_flush(tun->dev); rcu_read_unlock(); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ab3d7cbc4c49..01694e26f03e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -416,10 +416,10 @@ static void virtnet_xdp_flush(struct net_device *dev) } static int __virtnet_xdp_xmit(struct virtnet_info *vi, - struct xdp_buff *xdp) + struct xdp_frame *xdpf) { struct virtio_net_hdr_mrg_rxbuf *hdr; - struct xdp_frame *xdpf, *xdpf_sent; + struct xdp_frame *xdpf_sent; struct send_queue *sq; unsigned int len; unsigned int qp; @@ -432,10 +432,6 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) xdp_return_frame(xdpf_sent); - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - /* virtqueue want to use data area in-front of packet */ if (unlikely(xdpf->metasize > 0)) return -EOPNOTSUPP; @@ -459,7 +455,7 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, return 0; } -static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = vi->rq; @@ -472,7 +468,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (!xdp_prog) return -ENXIO; - return __virtnet_xdp_xmit(vi, xdp); + return __virtnet_xdp_xmit(vi, xdpf); } static unsigned int virtnet_get_headroom(struct virtnet_info *vi) @@ -569,6 +565,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset; + struct xdp_frame *xdpf; struct xdp_buff xdp; void *orig_data; u32 act; @@ -611,7 +608,10 @@ static struct sk_buff *receive_small(struct net_device *dev, delta = orig_data - xdp.data; break; case XDP_TX: - err = __virtnet_xdp_xmit(vi, &xdp); + xdpf = convert_to_xdp_frame(&xdp); + if (unlikely(!xdpf)) + goto err_xdp; + err = __virtnet_xdp_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); goto err_xdp; @@ -702,6 +702,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { + struct xdp_frame *xdpf; struct page *xdp_page; struct xdp_buff xdp; void *data; @@ -766,7 +767,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } break; case XDP_TX: - err = __virtnet_xdp_xmit(vi, &xdp); + xdpf = convert_to_xdp_frame(&xdp); + if (unlikely(!xdpf)) + goto err_xdp; + err = __virtnet_xdp_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); if (unlikely(xdp_page != page)) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf44503ea81a..14e0777ffcfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1165,7 +1165,7 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF 
offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. * void (*ndo_xdp_flush)(struct net_device *dev); @@ -1356,7 +1356,7 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_buff *xdp); + struct xdp_frame *xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/net/core/filter.c b/net/core/filter.c index d31aff93270d..3bb0cb98a9be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2749,13 +2749,18 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp, u32 index) { + struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; dev->netdev_ops->ndo_xdp_flush(dev); @@ -2771,11 +2776,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct net_device *dev = fwd; + struct xdp_frame *xdpf; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + /* TODO: move to inside map code instead, for bulk support + * err = dev_map_enqueue(dev, xdp); + */ + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; __dev_map_insert_ctx(map, index); -- cgit v1.2.3 From 45f8cb57da0d7a9ead4b39d7f5def333a5b0c08b Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:40 +0100 Subject: ASoC: core: Allow topology to override machine driver FE DAI link config. Machine drivers statically define a number of DAI links that currently cannot be changed or removed by topology. This means PCMs and platform components cannot be changed by topology at runtime, and machine drivers are tightly coupled to topology. This patch allows topology to override the machine driver DAI link config in order to reuse machine drivers with different topologies and platform components. The patch supports: 1) creating new FE PCMs with a topology-defined PCM ID, 2) destroying existing static FE PCMs, 3) changing the platform component driver, and 4) assigning any new HW params fixups. The patch requires no changes to the machine drivers, but does add some platform component flags that the platform component driver can assign before loading topologies.
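A minimal sketch of how a platform component driver might set these flags before loading its topology; the driver name, the empty fixup body and the numbering base are hypothetical, only the struct fields come from this patch:

static int example_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd,
				      struct snd_pcm_hw_params *params)
{
	/* Hypothetical fixup; copied into each overridden BE DAI link and
	 * called from soc_dai_hw_params() (see the soc-pcm.c hunk below).
	 */
	return 0;
}

static struct snd_soc_platform_driver example_platform_drv = {
	/* Take over FE DAI links from this machine driver; matched
	 * against card->dev->driver->name in soc_check_tplg_fes().
	 */
	.ignore_machine		= "example-machine",
	.be_hw_params_fixup	= example_be_hw_params_fixup,
	/* FE PCM device numbers come from the topology DAI link id */
	.use_dai_pcm_id		= true,
	/* BE PCMs are numbered from be_pcm_base + rtd->num */
	.be_pcm_base		= 16,
};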
Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc.h | 10 +++++++ sound/soc/soc-core.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++-- sound/soc/soc-pcm.c | 12 ++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index ad266d7e9553..fac4ff04fb7d 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1012,6 +1012,13 @@ struct snd_soc_platform_driver { /* platform stream compress ops */ const struct snd_compr_ops *compr_ops; + + /* this platform uses topology and ignore machine driver FEs */ + const char *ignore_machine; + int (*be_hw_params_fixup)(struct snd_soc_pcm_runtime *rtd, + struct snd_pcm_hw_params *params); + bool use_dai_pcm_id; /* use the DAI link PCM ID as PCM device number */ + int be_pcm_base; /* base device ID for all BE PCMs */ }; struct snd_soc_dai_link_component { @@ -1118,6 +1125,9 @@ struct snd_soc_dai_link { /* pmdown_time is ignored at stop */ unsigned int ignore_pmdown_time:1; + /* Do not create a PCM for this DAI link (Backend link) */ + unsigned int ignore:1; + struct list_head list; /* DAI link list of the soc card */ struct snd_soc_dobj dobj; /* For topology */ }; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index bf7ca32ab31f..8f01fe8296ff 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1050,6 +1050,9 @@ static int soc_bind_dai_link(struct snd_soc_card *card, const char *platform_name; int i; + if (dai_link->ignore) + return 0; + dev_dbg(card->dev, "ASoC: binding %s\n", dai_link->name); if (soc_is_dai_link_bound(card, dai_link)) { @@ -1672,7 +1675,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, { struct snd_soc_dai_link *dai_link = rtd->dai_link; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - int i, ret; + int i, ret, num; dev_dbg(card->dev, "ASoC: probe %s dai link %d late %d\n", card->name, rtd->num, order); @@ -1718,9 +1721,23 @@ static int soc_probe_link_dais(struct snd_soc_card *card, soc_dpcm_debugfs_add(rtd); #endif + /* + * most drivers will register their PCMs using DAI link ordering but + * topology based drivers can use the DAI link id field to set PCM + * device number and then use rtd + a base offset of the BEs. + */ + if (rtd->platform->driver->use_dai_pcm_id) { + if (rtd->dai_link->no_pcm) + num = rtd->platform->driver->be_pcm_base + rtd->num; + else + num = rtd->dai_link->id; + } else { + num = rtd->num; + } + if (cpu_dai->driver->compress_new) { /*create compress_device"*/ - ret = cpu_dai->driver->compress_new(rtd, rtd->num); + ret = cpu_dai->driver->compress_new(rtd, num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create compress %s\n", dai_link->stream_name); @@ -1730,7 +1747,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, if (!dai_link->params) { /* create the pcm */ - ret = soc_new_pcm(rtd, rtd->num); + ret = soc_new_pcm(rtd, num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create pcm %s :%d\n", dai_link->stream_name, ret); @@ -2076,6 +2093,59 @@ int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour) EXPORT_SYMBOL_GPL(snd_soc_set_dmi_name); #endif /* CONFIG_DMI */ +static void soc_check_tplg_fes(struct snd_soc_card *card) +{ + struct snd_soc_platform *platform; + struct snd_soc_dai_link *dai_link; + int i; + + list_for_each_entry(platform, &platform_list, list) { + + /* does this platform override FEs ? */ + if (!platform->driver->ignore_machine) + continue; + + /* for this machine ? 
*/ + if (strcmp(platform->driver->ignore_machine, + card->dev->driver->name)) + continue; + + /* machine matches, so override the rtd data */ + for (i = 0; i < card->num_links; i++) { + + dai_link = &card->dai_link[i]; + + /* ignore this FE */ + if (dai_link->dynamic) { + dai_link->ignore = true; + continue; + } + + dev_info(card->dev, "info: override FE DAI link %s\n", + card->dai_link[i].name); + + /* override platform */ + dai_link->platform_name = platform->component.name; + dai_link->cpu_dai_name = platform->component.name; + + /* convert non BE into BE */ + dai_link->no_pcm = 1; + dai_link->dpcm_playback = 1; + dai_link->dpcm_capture = 1; + + /* override any BE fixups */ + dai_link->be_hw_params_fixup = + platform->driver->be_hw_params_fixup; + + /* most BE links dont set stream name, so set it to + * dai link name if it's NULL to help bind widgets. + */ + if (!dai_link->stream_name) + dai_link->stream_name = dai_link->name; + } + } +} + static int snd_soc_instantiate_card(struct snd_soc_card *card) { struct snd_soc_codec *codec; @@ -2086,6 +2156,9 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) mutex_lock(&client_mutex); mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_INIT); + /* check whether any platform is ignore machine FE and using topology */ + soc_check_tplg_fes(card); + /* bind DAIs */ for (i = 0; i < card->num_links; i++) { ret = soc_bind_dai_link(card, &card->dai_link[i]); diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 68d9dc930096..4ce489165a6d 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -909,8 +909,20 @@ int soc_dai_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) { + struct snd_soc_pcm_runtime *rtd = substream->private_data; int ret; + /* perform any topology hw_params fixups before DAI */ + if (rtd->dai_link->be_hw_params_fixup) { + ret = rtd->dai_link->be_hw_params_fixup(rtd, params); + if (ret < 0) { + dev_err(rtd->dev, + "ASoC: hw_params topology fixup failed %d\n", + ret); + return ret; + } + } + if (dai->driver->ops->hw_params) { ret = dai->driver->ops->hw_params(substream, params, dai); if (ret < 0) { -- cgit v1.2.3 From f11a5c27f9287cacad74af31cd92d4413eccc05a Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:41 +0100 Subject: ASoC: core: Add name prefix for machines with topology rewrites Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc.h | 2 ++ sound/soc/soc-core.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index fac4ff04fb7d..3676d0a8f532 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1015,6 +1015,7 @@ struct snd_soc_platform_driver { /* this platform uses topology and ignore machine driver FEs */ const char *ignore_machine; + const char *topology_name_prefix; int (*be_hw_params_fixup)(struct snd_soc_pcm_runtime *rtd, struct snd_pcm_hw_params *params); bool use_dai_pcm_id; /* use the DAI link PCM ID as PCM device number */ @@ -1167,6 +1168,7 @@ struct snd_soc_card { const char *long_name; const char *driver_name; char dmi_longname[80]; + char topology_shortname[32]; struct device *dev; struct snd_card *snd_card; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 8f01fe8296ff..aa1c33b3cce0 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2143,6 +2143,14 @@ static void soc_check_tplg_fes(struct snd_soc_card *card) if (!dai_link->stream_name) dai_link->stream_name = 
dai_link->name; } + + /* Inform userspace we are using alternate topology */ + if (platform->driver->topology_name_prefix) { + snprintf(card->topology_shortname, 32, "%s-%s", + platform->driver->topology_name_prefix, + card->name); + card->name = card->topology_shortname; + } } } -- cgit v1.2.3 From 81e9b0a078894841a50a8dd666fd64ca452a2a50 Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:42 +0100 Subject: ASoC: topology: Give more data to clients via callbacks Give topology clients more access to the topology data by passing index, pcm, link_config and dai_driver to clients. This allows clients to fully instantiate and track topology objects. The SOF driver is the first user of these new APIs and needs them to build component topology driver and FW objects. Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc-topology.h | 23 ++++++++++++++--------- sound/soc/intel/skylake/skl-pcm.c | 7 ++++--- sound/soc/intel/skylake/skl-topology.c | 5 +++-- sound/soc/intel/skylake/skl-topology.h | 20 +++++--------------- sound/soc/soc-topology.c | 31 ++++++++++++++++++------------- 5 files changed, 44 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/sound/soc-topology.h b/include/sound/soc-topology.h index f552c3f56368..e1f265e21ee1 100644 --- a/include/sound/soc-topology.h +++ b/include/sound/soc-topology.h @@ -30,6 +30,8 @@ struct snd_soc_dapm_context; struct snd_soc_card; struct snd_kcontrol_new; struct snd_soc_dai_link; +struct snd_soc_dai_driver; +struct snd_soc_dai; /* object scan be loaded and unloaded in groups with identfying indexes */ #define SND_SOC_TPLG_INDEX_ALL 0 /* ID that matches all FW objects */ @@ -109,35 +111,38 @@ struct snd_soc_tplg_widget_events { struct snd_soc_tplg_ops { /* external kcontrol init - used for any driver specific init */ - int (*control_load)(struct snd_soc_component *, + int (*control_load)(struct snd_soc_component *, int index, struct snd_kcontrol_new *, struct snd_soc_tplg_ctl_hdr *); int (*control_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* external widget init - used for any driver specific init */ - int (*widget_load)(struct snd_soc_component *, + int (*widget_load)(struct snd_soc_component *, int index, struct snd_soc_dapm_widget *, struct snd_soc_tplg_dapm_widget *); - int (*widget_ready)(struct snd_soc_component *, + int (*widget_ready)(struct snd_soc_component *, int index, struct snd_soc_dapm_widget *, struct snd_soc_tplg_dapm_widget *); int (*widget_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* FE DAI - used for any driver specific init */ - int (*dai_load)(struct snd_soc_component *, - struct snd_soc_dai_driver *dai_drv); + int (*dai_load)(struct snd_soc_component *, int index, + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai); + int (*dai_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* DAI link - used for any driver specific init */ - int (*link_load)(struct snd_soc_component *, - struct snd_soc_dai_link *link); + int (*link_load)(struct snd_soc_component *, int index, + struct snd_soc_dai_link *link, + struct snd_soc_tplg_link_config *cfg); int (*link_unload)(struct snd_soc_component *, struct snd_soc_dobj *); /* callback to handle vendor bespoke data */ - int (*vendor_load)(struct snd_soc_component *, + int (*vendor_load)(struct snd_soc_component *, int index, struct snd_soc_tplg_hdr *); int (*vendor_unload)(struct snd_soc_component *, struct snd_soc_tplg_hdr *); @@ -146,7 +151,7 @@ 
struct snd_soc_tplg_ops { void (*complete)(struct snd_soc_component *); /* manifest - optional to inform component of manifest */ - int (*manifest)(struct snd_soc_component *, + int (*manifest)(struct snd_soc_component *, int index, struct snd_soc_tplg_manifest *); /* vendor specific kcontrol handlers available for binding */ diff --git a/sound/soc/intel/skylake/skl-pcm.c b/sound/soc/intel/skylake/skl-pcm.c index afa86b9e4dcf..1f4dd08d36c5 100644 --- a/sound/soc/intel/skylake/skl-pcm.c +++ b/sound/soc/intel/skylake/skl-pcm.c @@ -1017,10 +1017,11 @@ static struct snd_soc_dai_driver skl_platform_dai[] = { }, }; -int skl_dai_load(struct snd_soc_component *cmp, - struct snd_soc_dai_driver *pcm_dai) +int skl_dai_load(struct snd_soc_component *cmp, int index, + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai) { - pcm_dai->ops = &skl_pcm_dai_ops; + dai_drv->ops = &skl_pcm_dai_ops; return 0; } diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 3b1dca419883..6ac081f1f215 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -2851,7 +2851,7 @@ void skl_cleanup_resources(struct skl *skl) * information to the driver about module and pipeline parameters which DSP * FW expects like ids, resource values, formats etc */ -static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, +static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, int index, struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w) { @@ -2958,6 +2958,7 @@ static int skl_init_enum_data(struct device *dev, struct soc_enum *se, } static int skl_tplg_control_load(struct snd_soc_component *cmpnt, + int index, struct snd_kcontrol_new *kctl, struct snd_soc_tplg_ctl_hdr *hdr) { @@ -3446,7 +3447,7 @@ static int skl_tplg_get_manifest_data(struct snd_soc_tplg_manifest *manifest, return 0; } -static int skl_manifest_load(struct snd_soc_component *cmpnt, +static int skl_manifest_load(struct snd_soc_component *cmpnt, int index, struct snd_soc_tplg_manifest *manifest) { struct hdac_ext_bus *ebus = snd_soc_component_get_drvdata(cmpnt); diff --git a/sound/soc/intel/skylake/skl-topology.h b/sound/soc/intel/skylake/skl-topology.h index b1e0667c0ae0..77857c598eed 100644 --- a/sound/soc/intel/skylake/skl-topology.h +++ b/sound/soc/intel/skylake/skl-topology.h @@ -221,18 +221,9 @@ struct skl_mod_inst_map { u16 inst_id; }; -struct skl_uuid_inst_map { - u16 inst_id; - u16 reserved; - uuid_le mod_uuid; -} __packed; - struct skl_kpb_params { u32 num_modules; - union { - struct skl_mod_inst_map map[0]; - struct skl_uuid_inst_map map_uuid[0]; - } u; + struct skl_mod_inst_map map[0]; }; struct skl_module_inst_id { @@ -469,7 +460,7 @@ int skl_dsp_set_dma_control(struct skl_sst *ctx, u32 *caps, u32 caps_size, u32 node_id); void skl_tplg_set_be_dmic_config(struct snd_soc_dai *dai, struct skl_pipe_params *params, int stream); -int skl_tplg_init(struct snd_soc_component *component, +int skl_tplg_init(struct snd_soc_platform *platform, struct hdac_ext_bus *ebus); struct skl_module_cfg *skl_tplg_fe_get_cpr_module( struct snd_soc_dai *dai, int stream); @@ -512,8 +503,7 @@ int skl_pcm_host_dma_prepare(struct device *dev, int skl_pcm_link_dma_prepare(struct device *dev, struct skl_pipe_params *params); -int skl_dai_load(struct snd_soc_component *cmp, - struct snd_soc_dai_driver *pcm_dai); -void skl_tplg_add_moduleid_in_bind_params(struct skl *skl, - struct snd_soc_dapm_widget *w); +int skl_dai_load(struct 
snd_soc_component *, int index, + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai); #endif diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index ec2ef7629dbb..028bcaa0eda5 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -315,7 +315,7 @@ static int soc_tplg_vendor_load_(struct soc_tplg *tplg, int ret = 0; if (tplg->comp && tplg->ops && tplg->ops->vendor_load) - ret = tplg->ops->vendor_load(tplg->comp, hdr); + ret = tplg->ops->vendor_load(tplg->comp, tplg->index, hdr); else { dev_err(tplg->dev, "ASoC: no vendor load callback for ID %d\n", hdr->vendor_type); @@ -347,7 +347,8 @@ static int soc_tplg_widget_load(struct soc_tplg *tplg, struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w) { if (tplg->comp && tplg->ops && tplg->ops->widget_load) - return tplg->ops->widget_load(tplg->comp, w, tplg_w); + return tplg->ops->widget_load(tplg->comp, tplg->index, w, + tplg_w); return 0; } @@ -358,27 +359,30 @@ static int soc_tplg_widget_ready(struct soc_tplg *tplg, struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w) { if (tplg->comp && tplg->ops && tplg->ops->widget_ready) - return tplg->ops->widget_ready(tplg->comp, w, tplg_w); + return tplg->ops->widget_ready(tplg->comp, tplg->index, w, + tplg_w); return 0; } /* pass DAI configurations to component driver for extra initialization */ static int soc_tplg_dai_load(struct soc_tplg *tplg, - struct snd_soc_dai_driver *dai_drv) + struct snd_soc_dai_driver *dai_drv, + struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai) { if (tplg->comp && tplg->ops && tplg->ops->dai_load) - return tplg->ops->dai_load(tplg->comp, dai_drv); + return tplg->ops->dai_load(tplg->comp, tplg->index, dai_drv, + pcm, dai); return 0; } /* pass link configurations to component driver for extra initialization */ static int soc_tplg_dai_link_load(struct soc_tplg *tplg, - struct snd_soc_dai_link *link) + struct snd_soc_dai_link *link, struct snd_soc_tplg_link_config *cfg) { if (tplg->comp && tplg->ops && tplg->ops->link_load) - return tplg->ops->link_load(tplg->comp, link); + return tplg->ops->link_load(tplg->comp, tplg->index, link, cfg); return 0; } @@ -699,7 +703,8 @@ static int soc_tplg_init_kcontrol(struct soc_tplg *tplg, struct snd_kcontrol_new *k, struct snd_soc_tplg_ctl_hdr *hdr) { if (tplg->comp && tplg->ops && tplg->ops->control_load) - return tplg->ops->control_load(tplg->comp, k, hdr); + return tplg->ops->control_load(tplg->comp, tplg->index, k, + hdr); return 0; } @@ -1755,7 +1760,7 @@ static int soc_tplg_dai_create(struct soc_tplg *tplg, } /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv); + ret = soc_tplg_dai_load(tplg, dai_drv, pcm, NULL); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); kfree(dai_drv); @@ -1825,7 +1830,7 @@ static int soc_tplg_fe_link_create(struct soc_tplg *tplg, set_link_flags(link, pcm->flag_mask, pcm->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link); + ret = soc_tplg_dai_link_load(tplg, link, NULL); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: FE link loading failed\n"); kfree(link); @@ -2133,7 +2138,7 @@ static int soc_tplg_link_config(struct soc_tplg *tplg, set_link_flags(link, cfg->flag_mask, cfg->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link); + ret = soc_tplg_dai_link_load(tplg, link, cfg); if (ret < 0) { 
dev_err(tplg->dev, "ASoC: physical link loading failed\n"); return ret; @@ -2255,7 +2260,7 @@ static int soc_tplg_dai_config(struct soc_tplg *tplg, set_dai_flags(dai_drv, d->flag_mask, d->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv); + ret = soc_tplg_dai_load(tplg, dai_drv, NULL, dai); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); return ret; @@ -2361,7 +2366,7 @@ static int soc_tplg_manifest_load(struct soc_tplg *tplg, /* pass control to component driver for optional further init */ if (tplg->comp && tplg->ops && tplg->ops->manifest) - return tplg->ops->manifest(tplg->comp, _manifest); + return tplg->ops->manifest(tplg->comp, tplg->index, _manifest); if (!abi_match) /* free the duplicated one */ kfree(_manifest); -- cgit v1.2.3 From 28aa6f7779f77a863a08c1b9db4b654a94c86dd0 Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Tue, 27 Mar 2018 14:30:43 +0100 Subject: ASoC: topology: Add callback for DAPM route load/unload Add a callback fro clients for notification about DAPM route loading and unloading. Signed-off-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc-topology.h | 7 +++++++ sound/soc/soc-topology.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/sound/soc-topology.h b/include/sound/soc-topology.h index e1f265e21ee1..401ef2c45d6c 100644 --- a/include/sound/soc-topology.h +++ b/include/sound/soc-topology.h @@ -32,6 +32,7 @@ struct snd_kcontrol_new; struct snd_soc_dai_link; struct snd_soc_dai_driver; struct snd_soc_dai; +struct snd_soc_dapm_route; /* object scan be loaded and unloaded in groups with identfying indexes */ #define SND_SOC_TPLG_INDEX_ALL 0 /* ID that matches all FW objects */ @@ -116,6 +117,12 @@ struct snd_soc_tplg_ops { int (*control_unload)(struct snd_soc_component *, struct snd_soc_dobj *); + /* DAPM graph route element loading and unloading */ + int (*dapm_route_load)(struct snd_soc_component *, int index, + struct snd_soc_dapm_route *route); + int (*dapm_route_unload)(struct snd_soc_component *, + struct snd_soc_dobj *); + /* external widget init - used for any driver specific init */ int (*widget_load)(struct snd_soc_component *, int index, struct snd_soc_dapm_widget *, diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 028bcaa0eda5..b9370ae31907 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1161,6 +1161,17 @@ static int soc_tplg_kcontrol_elems_load(struct soc_tplg *tplg, return 0; } +/* optionally pass new dynamic kcontrol to component driver. */ +static int soc_tplg_add_route(struct soc_tplg *tplg, + struct snd_soc_dapm_route *route) +{ + if (tplg->comp && tplg->ops && tplg->ops->dapm_route_load) + return tplg->ops->dapm_route_load(tplg->comp, tplg->index, + route); + + return 0; +} + static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, struct snd_soc_tplg_hdr *hdr) { @@ -1209,6 +1220,8 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, else route.control = elem->control; + soc_tplg_add_route(tplg, &route); + /* add route, but keep going if some fail */ snd_soc_dapm_add_routes(dapm, &route, 1); } -- cgit v1.2.3 From 032234d8231909ac049f22ea3b408487e1c103eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 10:00:39 -0700 Subject: net/ipv6: Make __inet6_bind static BPF core gets access to __inet6_bind via ipv6_bpf_stub_impl, so it is not invoked directly outside of af_inet6.c. 
Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 -- net/ipv6/af_inet6.c | 53 ++++++++++++++++++++++++++--------------------------- 2 files changed, 26 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 836f31af1369..68b167d98879 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1044,8 +1044,6 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); -int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2c694912df2e..36d622c477b1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -273,33 +273,8 @@ out_rcu_unlock: goto out; } - -/* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - int err = 0; - - /* If the socket has its own bind function then use it. */ - if (sk->sk_prot->bind) - return sk->sk_prot->bind(sk, uaddr, addr_len); - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - /* BPF prog is run before any checks are done so that if the prog - * changes context in a wrong way it will be caught. - */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); - if (err) - return err; - - return __inet6_bind(sk, uaddr, addr_len, false, true); -} -EXPORT_SYMBOL(inet6_bind); - -int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) +static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -444,6 +419,30 @@ out_unlock: goto out; } +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + int err = 0; + + /* If the socket has its own bind function then use it. */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + + return __inet6_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet6_bind); + int inet6_release(struct socket *sock) { struct sock *sk = sock->sk; -- cgit v1.2.3 From bdb7cc643fc9db8d6ed9a2b9e524e27ac5882029 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Mon, 16 Apr 2018 13:42:16 -0400 Subject: ipv6: Count interface receive statistics on the ingress netdev The statistics such as InHdrErrors should be counted on the ingress netdev rather than on the dev from the dst, which is the egress. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 14 +++++++++++ net/ipv6/exthdrs.c | 55 ++++++++++++++++------------------------- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6_output.c | 18 ++++++-------- net/ipv6/reassembly.c | 6 ++--- net/ipv6/route.c | 3 ++- net/netfilter/ipvs/ip_vs_xmit.c | 5 ++-- 7 files changed, 51 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 378d601258be..8312cc25a3af 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -307,6 +307,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +/** + * __in6_dev_get_safely - get inet6_dev pointer from netdevice + * @dev: network device + * + * This is a safer version of __in6_dev_get + */ +static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) +{ + if (likely(dev)) + return rcu_dereference_rtnl(dev->ip6_ptr); + else + return NULL; +} + /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bc68eb661970..5bc2bf3733ab 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -280,6 +280,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = { static int ipv6_destopt_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; @@ -291,7 +292,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + __IP6_INC_STATS(dev_net(dst->dev), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); @@ -319,8 +320,7 @@ fail_and_free: return 1; } - __IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } @@ -416,8 +416,7 @@ looped_back: } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -456,8 +455,7 @@ looped_back: if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -481,10 +479,10 @@ looped_back: /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; struct in6_addr daddr; - struct inet6_dev *idev; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; @@ -498,8 +496,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -508,8 +505,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || 
skb->pkt_type != PACKET_HOST) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -527,7 +523,7 @@ looped_back: * processed by own */ if (!addr) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -553,8 +549,7 @@ looped_back: goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -572,8 +567,7 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -609,14 +603,12 @@ looped_back: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -627,8 +619,7 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -647,8 +638,7 @@ looped_back: if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -663,7 +653,7 @@ looped_back: return -1; unknown_rh: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; @@ -755,34 +745,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct net *net = ipv6_skb_net(skb); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return false; } if (ipv6_hdr(skb)->payload_len) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { - __IP6_INC_STATS(net, 
ipv6_skb_idev(skb), - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 9ee208a348f5..f08d34491ece 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -336,7 +336,7 @@ int ip6_mc_input(struct sk_buff *skb) bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), - ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); hdr = ipv6_hdr(skb); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2e891d2c30ef..a39b04f9fa6e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -425,6 +425,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) int ip6_forward(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); @@ -444,8 +445,7 @@ int ip6_forward(struct sk_buff *skb) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -476,8 +476,7 @@ int ip6_forward(struct sk_buff *skb) /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; @@ -490,15 +489,13 @@ int ip6_forward(struct sk_buff *skb) if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); @@ -554,8 +551,7 @@ int ip6_forward(struct sk_buff *skb) /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INTOOBIGERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); @@ -579,7 +575,7 @@ int ip6_forward(struct sk_buff *skb) ip6_forward_finish); error: - __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 4979610287e2..2cdf3dcf8c2c 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -179,7 +179,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((u8 *)&fhdr->frag_off - @@ -214,7 +214,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. 
-DaveM */ - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, payload_len)); @@ -536,7 +536,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return -1; fail_hdr: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49b954d6d0fa..1d738bfe893b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3541,7 +3541,8 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { - IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IP6_INC_STATS(dev_net(dst->dev), + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INADDRERRORS); break; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 4527921b1c3a..ba0a0fd045c8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, /* check and decrement ttl */ if (ipv6_hdr(skb)->hop_limit <= 1) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); + /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } -- cgit v1.2.3 From 72f6d71e491e6ce269b564865b21fab0a4402dd3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 17 Apr 2018 14:11:28 +0800 Subject: vxlan: add ttl inherit support Like tos inherit, ttl inherit should also mean inheriting the inner protocol's TTL value, which is actually not implemented in vxlan yet. But we can not treat ttl == 0 as "use the inner TTL", because ttl == 0 is also used when the "ttl" option is not specified; changing its meaning would be a behavior change and would break real use cases. So add a different attribute, IFLA_VXLAN_TTL_INHERIT, which is set when "ttl inherit" is specified with the ip command.
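With an iproute2 new enough to understand the new attribute, the usage would look something like this hypothetical invocation:

    ip link add vxlan0 type vxlan id 42 remote 192.0.2.1 dstport 4789 dev eth0 ttl inherit

Here "ttl inherit" sets IFLA_VXLAN_TTL_INHERIT, while "ttl 0" (or omitting "ttl" entirely) keeps the old automatic behavior.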
Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 17 ++++++++++++++--- include/net/ip_tunnels.h | 11 +++++++++++ include/net/vxlan.h | 1 + include/uapi/linux/if_link.h | 1 + 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index fab7a4db249e..aee0e60471f1 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2085,9 +2085,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, local_ip = vxlan->cfg.saddr; dst_cache = &rdst->dst_cache; md->gbp = skb->mark; - ttl = vxlan->cfg.ttl; - if (!ttl && vxlan_addr_multicast(dst)) - ttl = 1; + if (flags & VXLAN_F_TTL_INHERIT) { + ttl = ip_tunnel_get_ttl(old_iph, skb); + } else { + ttl = vxlan->cfg.ttl; + if (!ttl && vxlan_addr_multicast(dst)) + ttl = 1; + } tos = vxlan->cfg.tos; if (tos == 1) @@ -2709,6 +2713,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, + [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -3254,6 +3259,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_VXLAN_TTL]) conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); + if (data[IFLA_VXLAN_TTL_INHERIT]) { + if (changelink) + return -EOPNOTSUPP; + conf->flags |= VXLAN_F_TTL_INHERIT; + } + if (data[IFLA_VXLAN_LABEL]) conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) & IPV6_FLOWLABEL_MASK; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 540a4b4417bf..751646adc769 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -379,6 +379,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, + const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ((const struct ipv6hdr *)iph)->hop_limit; + else + return 0; +} + /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index ad73d8b3fcc2..b99a02ae3934 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -262,6 +262,7 @@ struct vxlan_dev { #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 +#define VXLAN_F_TTL_INHERIT 0x10000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 68699f654118..b85266420bfb 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -516,6 +516,7 @@ enum { IFLA_VXLAN_COLLECT_METADATA, IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From 1e6c06a7e88c251d8a30271ad5206fbd967a4576 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sun, 8 Apr 2018 11:06:57 +0800 Subject: hwspinlock: Introduce one new mode for hwspinlock In some scenarios, users need to perform time-consuming or sleepable operations under hardware spinlock protection, for synchronization between multiple subsystems. For example, there is one PMIC efuse on the Spreadtrum platform which needs to be accessed under one hardware lock. But while that hardware lock is held, the efuse operation takes almost 5 ms, so we can not disable interrupts or preemption for that long. Thus, introduce one new mode indicating that we just acquire the hardware lock and do not disable interrupts or preemption; in exchange, the user must protect the hardware lock with a mutex or spinlock to avoid deadlock.
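To make the intended pairing concrete, here is a minimal sketch of a user of the new raw mode (hypothetical driver code; the hwlock is assumed to have been obtained earlier, e.g. via hwspin_lock_request_specific()):

    #include <linux/hwspinlock.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(efuse_mutex);      /* sleepable lock serializing local users */

    static int efuse_read_locked(struct hwspinlock *hwlock)
    {
            int ret;

            mutex_lock(&efuse_mutex);
            /* HWLOCK_RAW: interrupts and preemption stay enabled */
            ret = hwspin_lock_timeout_raw(hwlock, 100);
            if (ret) {
                    mutex_unlock(&efuse_mutex);
                    return ret;
            }

            /* the time-consuming (~5 ms) efuse access may happen here */

            hwspin_unlock_raw(hwlock);
            mutex_unlock(&efuse_mutex);
            return 0;
    }

The mutex is what keeps two local callers from racing for the hardware lock while it is held across a sleepable section.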
Signed-off-by: Baolin Wang Signed-off-by: Bjorn Andersson --- drivers/hwspinlock/hwspinlock_core.c | 34 ++++++++++++++++----- include/linux/hwspinlock.h | 58 ++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c index f4a59f5631e4..5278d0560a4a 100644 --- a/drivers/hwspinlock/hwspinlock_core.c +++ b/drivers/hwspinlock/hwspinlock_core.c @@ -71,10 +71,16 @@ static DEFINE_MUTEX(hwspinlock_tree_lock); * This function attempts to lock an hwspinlock, and will immediately * fail if the hwspinlock is already taken. * - * Upon a successful return from this function, preemption (and possibly - * interrupts) is disabled, so the caller must not sleep, and is advised to - * release the hwspinlock as soon as possible. This is required in order to - * minimize remote cores polling on the hardware interconnect. + * Caution: If the mode is HWLOCK_RAW, that means user must protect the routine + * of getting hardware lock with mutex or spinlock. Since in some scenarios, + * user need some time-consuming or sleepable operations under the hardware + * lock, they need one sleepable lock (like mutex) to protect the operations. + * + * If the mode is not HWLOCK_RAW, upon a successful return from this function, + * preemption (and possibly interrupts) is disabled, so the caller must not + * sleep, and is advised to release the hwspinlock as soon as possible. This is + * required in order to minimize remote cores polling on the hardware + * interconnect. * * The user decides whether local interrupts are disabled or not, and if yes, * whether he wants their previous state to be saved. It is up to the user @@ -113,6 +119,9 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags) case HWLOCK_IRQ: ret = spin_trylock_irq(&hwlock->lock); break; + case HWLOCK_RAW: + ret = 1; + break; default: ret = spin_trylock(&hwlock->lock); break; @@ -134,6 +143,9 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags) case HWLOCK_IRQ: spin_unlock_irq(&hwlock->lock); break; + case HWLOCK_RAW: + /* Nothing to do */ + break; default: spin_unlock(&hwlock->lock); break; @@ -170,9 +182,14 @@ EXPORT_SYMBOL_GPL(__hwspin_trylock); * is already taken, the function will busy loop waiting for it to * be released, but give up after @timeout msecs have elapsed. * - * Upon a successful return from this function, preemption is disabled - * (and possibly local interrupts, too), so the caller must not sleep, - * and is advised to release the hwspinlock as soon as possible. + * Caution: If the mode is HWLOCK_RAW, that means user must protect the routine + * of getting hardware lock with mutex or spinlock. Since in some scenarios, + * user need some time-consuming or sleepable operations under the hardware + * lock, they need one sleepable lock (like mutex) to protect the operations. 
+ * + * If the mode is not HWLOCK_RAW, upon a successful return from this function, + * preemption is disabled (and possibly local interrupts, too), so the caller + * must not sleep, and is advised to release the hwspinlock as soon as possible. * This is required in order to minimize remote cores polling on the * hardware interconnect. * @@ -266,6 +283,9 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags) case HWLOCK_IRQ: spin_unlock_irq(&hwlock->lock); break; + case HWLOCK_RAW: + /* Nothing to do */ + break; default: spin_unlock(&hwlock->lock); break; diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index 859d673d98c8..fe450ee58d55 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -24,6 +24,7 @@ /* hwspinlock mode argument */ #define HWLOCK_IRQSTATE 0x01 /* Disable interrupts, save state */ #define HWLOCK_IRQ 0x02 /* Disable interrupts, don't save state */ +#define HWLOCK_RAW 0x03 struct device; struct device_node; @@ -175,6 +176,25 @@ static inline int hwspin_trylock_irq(struct hwspinlock *hwlock) return __hwspin_trylock(hwlock, HWLOCK_IRQ, NULL); } +/** + * hwspin_trylock_raw() - attempt to lock a specific hwspinlock + * @hwlock: an hwspinlock which we want to trylock + * + * This function attempts to lock an hwspinlock, and will immediately fail + * if the hwspinlock is already taken. + * + * Caution: User must protect the routine of getting hardware lock with mutex + * or spinlock to avoid dead-lock, that will let user can do some time-consuming + * or sleepable operations under the hardware lock. + * + * Returns 0 if we successfully locked the hwspinlock, -EBUSY if + * the hwspinlock was already taken, and -EINVAL if @hwlock is invalid. + */ +static inline int hwspin_trylock_raw(struct hwspinlock *hwlock) +{ + return __hwspin_trylock(hwlock, HWLOCK_RAW, NULL); +} + /** * hwspin_trylock() - attempt to lock a specific hwspinlock * @hwlock: an hwspinlock which we want to trylock @@ -242,6 +262,29 @@ int hwspin_lock_timeout_irq(struct hwspinlock *hwlock, unsigned int to) return __hwspin_lock_timeout(hwlock, to, HWLOCK_IRQ, NULL); } +/** + * hwspin_lock_timeout_raw() - lock an hwspinlock with timeout limit + * @hwlock: the hwspinlock to be locked + * @to: timeout value in msecs + * + * This function locks the underlying @hwlock. If the @hwlock + * is already taken, the function will busy loop waiting for it to + * be released, but give up when @timeout msecs have elapsed. + * + * Caution: User must protect the routine of getting hardware lock with mutex + * or spinlock to avoid dead-lock, that will let user can do some time-consuming + * or sleepable operations under the hardware lock. + * + * Returns 0 when the @hwlock was successfully taken, and an appropriate + * error code otherwise (most notably an -ETIMEDOUT if the @hwlock is still + * busy after @timeout msecs). The function will never sleep. + */ +static inline +int hwspin_lock_timeout_raw(struct hwspinlock *hwlock, unsigned int to) +{ + return __hwspin_lock_timeout(hwlock, to, HWLOCK_RAW, NULL); +} + /** * hwspin_lock_timeout() - lock an hwspinlock with timeout limit * @hwlock: the hwspinlock to be locked @@ -301,6 +344,21 @@ static inline void hwspin_unlock_irq(struct hwspinlock *hwlock) __hwspin_unlock(hwlock, HWLOCK_IRQ, NULL); } +/** + * hwspin_unlock_raw() - unlock hwspinlock + * @hwlock: a previously-acquired hwspinlock which we want to unlock + * + * This function will unlock a specific hwspinlock. + * + * @hwlock must be already locked (e.g. 
by hwspin_trylock()) before calling + * this function: it is a bug to call unlock on a @hwlock that is already + * unlocked. + */ +static inline void hwspin_unlock_raw(struct hwspinlock *hwlock) +{ + __hwspin_unlock(hwlock, HWLOCK_RAW, NULL); +} + /** * hwspin_unlock() - unlock hwspinlock * @hwlock: a previously-acquired hwspinlock which we want to unlock -- cgit v1.2.3 From ee6548d1d98df7df3b9c8103a42cf68b31c29417 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Apr 2018 07:52:04 +0300 Subject: RDMA/rdma_cm: Delete rdma_addr_client The only thing it does is block module unload while work is posted from rdma_resolve_ip(). However, this is not the right place to do this. The users of rdma_resolve_ip() must ensure their own module does not unload until rdma_resolve_ip() calls the callback, or until rdma_addr_cancel() is called. Similarly callers to rdma_addr_find_l2_eth_by_grh() must ensure their module does not unload while they are calling code. The only two users are already safe, so there is no need for this. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 38 ++++---------------------------------- drivers/infiniband/core/cma.c | 6 +----- include/rdma/ib_addr.h | 20 +------------------- 3 files changed, 6 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9756cfbdef0e..4f32c4062fb6 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -56,7 +56,6 @@ struct addr_req { struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; - struct rdma_addr_client *client; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); @@ -220,28 +219,6 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -static struct rdma_addr_client self; - -void rdma_addr_register_client(struct rdma_addr_client *client) -{ - atomic_set(&client->refcount, 1); - init_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_register_client); - -static inline void put_client(struct rdma_addr_client *client) -{ - if (atomic_dec_and_test(&client->refcount)) - complete(&client->comp); -} - -void rdma_addr_unregister_client(struct rdma_addr_client *client) -{ - put_client(client); - wait_for_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_unregister_client); - void rdma_copy_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev, const unsigned char *dst_dev_addr) @@ -605,14 +582,12 @@ static void process_one_req(struct work_struct *_work) */ cancel_delayed_work(&req->work); list_del_init(&req->list); - put_client(req->client); kfree(req); } spin_unlock_bh(&lock); } -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), @@ -644,8 +619,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->addr = addr; req->callback = callback; req->context = context; - req->client = client; - atomic_inc(&client->refcount); INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); @@ -661,7 +634,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, break; default: ret = req->status; - 
atomic_dec(&client->refcount); goto err; } return ret; @@ -722,7 +694,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, found->addr, found->context); - put_client(found->client); kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); @@ -761,8 +732,8 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, dev_addr.net = &init_net; init_completion(&ctx.comp); - ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); if (ret) return ret; @@ -806,14 +777,13 @@ int addr_init(void) return -ENOMEM; register_netevent_notifier(&nb); - rdma_addr_register_client(&self); return 0; } void addr_cleanup(void) { - rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); + WARN_ON(!list_empty(&req_list)); } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 51a641002e10..48300838e354 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -156,7 +156,6 @@ static struct ib_client cma_client = { }; static struct ib_sa_client sa_client; -static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); @@ -2910,7 +2909,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); } @@ -4547,7 +4546,6 @@ static int __init cma_init(void) goto err_wq; ib_sa_register_client(&sa_client); - rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); @@ -4561,7 +4559,6 @@ static int __init cma_init(void) err: unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); err_wq: destroy_workqueue(cma_wq); @@ -4574,7 +4571,6 @@ static void __exit cma_cleanup(void) rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index a08cc7278980..c2c8b1fdeead 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -49,22 +49,6 @@ #include #include -struct rdma_addr_client { - atomic_t refcount; - struct completion comp; -}; - -/** - * rdma_addr_register_client - Register an address client. - */ -void rdma_addr_register_client(struct rdma_addr_client *client); - -/** - * rdma_addr_unregister_client - Deregister an address client. - * @client: Client object to deregister. - */ -void rdma_addr_unregister_client(struct rdma_addr_client *client); - /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. @@ -99,7 +83,6 @@ int rdma_translate_ip(const struct sockaddr *addr, /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. - * @client: Address client associated with request. * @src_addr: An optional source address to use in the resolution. 
If a * source address is not provided, a usable address will be returned via * the callback. @@ -112,8 +95,7 @@ int rdma_translate_ip(const struct sockaddr *addr, * or been canceled. A status of 0 indicates success. * @context: User-specified context associated with the call. */ -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), -- cgit v1.2.3 From fe1bd78bf18a7cb3eb76fceea9193534fb6619e3 Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Mon, 2 Apr 2018 13:06:05 -0700 Subject: ARM: imx: Update spi_imx platform data to reflect current state The docs for the spi_imx platform data still refer to a -32 offset used to specify a native chip select. This was removed in commit 602c8f4485cd ("spi: imx: fix use of native chip-selects with devicetree") and no longer works as documented. Update documentation. The macro MXC_SPI_CS() is no longer needed. If a board uses all native chip selects, then it's not necessary to specify a chip select array at all, as all native is the default (this is how device-tree configured SPI masters work too). Most of the spi-imx platform data users have their chip select arrays removed by this patch. This patch also fixes a bug in mx31moboard introduced in the '602 commit. When that board was updated in commit 901f26bce64a ("ARM: imx: set correct chip_select in platform setup") to reflect the SPI change, only SPI bus 2 was updated and SPI bus 1 was left with non-sequential chip selects. The mc13783 spi device on bus 1 had its chip select updated as if it were on bus 2.
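As an illustration of the documented convention (hypothetical board code, not from this patch), a board with chip select 0 on GPIO 25 and chip select 1 on the native SS1 line would now pass:

    #include <linux/errno.h>
    #include <linux/kernel.h>
    #include <linux/platform_data/spi-imx.h>

    /* Hypothetical board file: CS0 on GPIO 25, CS1 on the native SS1 line */
    static int board_spi1_cs[] = { 25, -ENOENT };

    static const struct spi_imx_master board_spi1_pdata __initconst = {
            .chipselect     = board_spi1_cs,
            .num_chipselect = ARRAY_SIZE(board_spi1_cs),
    };

A board using only native chip selects would instead omit the array and set just .num_chipselect, as the converted board files below do.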
CC: Sascha Hauer CC: Fabio Estevam Acked-by: Greg Ungerer Reviewed-by: Oleksij Rempel Signed-off-by: Trent Piepho Signed-off-by: Shawn Guo --- arch/arm/mach-imx/mach-mx31_3ds.c | 18 ++---------------- arch/arm/mach-imx/mach-mx31lilly.c | 12 ++---------- arch/arm/mach-imx/mach-mx31lite.c | 16 ++-------------- arch/arm/mach-imx/mach-mx31moboard.c | 17 +++-------------- arch/arm/mach-imx/mach-pcm037_eet.c | 5 +---- include/linux/platform_data/spi-imx.h | 29 +++++++++++++++++------------ 6 files changed, 27 insertions(+), 70 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-imx/mach-mx31_3ds.c b/arch/arm/mach-imx/mach-mx31_3ds.c index 68c3f0799d5b..9d87f1dcf7bb 100644 --- a/arch/arm/mach-imx/mach-mx31_3ds.c +++ b/arch/arm/mach-imx/mach-mx31_3ds.c @@ -374,26 +374,12 @@ static struct imx_ssi_platform_data mx31_3ds_ssi_pdata = { }; /* SPI */ -static int spi0_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master spi0_pdata __initconst = { - .chipselect = spi0_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi0_internal_chipselect), -}; - -static int spi1_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), + .num_chipselect = 3, }; static const struct spi_imx_master spi1_pdata __initconst = { - .chipselect = spi1_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi1_internal_chipselect), + .num_chipselect = 3, }; static struct spi_board_info mx31_3ds_spi_devs[] __initdata = { diff --git a/arch/arm/mach-imx/mach-mx31lilly.c b/arch/arm/mach-imx/mach-mx31lilly.c index 6fd463642954..8bf52819d4d9 100644 --- a/arch/arm/mach-imx/mach-mx31lilly.c +++ b/arch/arm/mach-imx/mach-mx31lilly.c @@ -226,20 +226,12 @@ static void __init lilly1131_usb_init(void) /* SPI */ -static int spi_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master spi0_pdata __initconst = { - .chipselect = spi_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi_internal_chipselect), + .num_chipselect = 3, }; static const struct spi_imx_master spi1_pdata __initconst = { - .chipselect = spi_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi_internal_chipselect), + .num_chipselect = 3, }; static struct mc13xxx_platform_data mc13783_pdata __initdata = { diff --git a/arch/arm/mach-imx/mach-mx31lite.c b/arch/arm/mach-imx/mach-mx31lite.c index a3250bc7f114..a3cbba6c955b 100644 --- a/arch/arm/mach-imx/mach-mx31lite.c +++ b/arch/arm/mach-imx/mach-mx31lite.c @@ -83,15 +83,8 @@ static const struct imxuart_platform_data uart_pdata __initconst = { }; /* SPI */ -static int spi0_internal_chipselect[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(1), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master spi0_pdata __initconst = { - .chipselect = spi0_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi0_internal_chipselect), + .num_chipselect = 3, }; static const struct mxc_nand_platform_data @@ -133,13 +126,8 @@ static struct platform_device smsc911x_device = { * The MC13783 is the only hard-wired SPI device on the module. 
*/ -static int spi1_internal_chipselect[] = { - MXC_SPI_CS(0), -}; - static const struct spi_imx_master spi1_pdata __initconst = { - .chipselect = spi1_internal_chipselect, - .num_chipselect = ARRAY_SIZE(spi1_internal_chipselect), + .num_chipselect = 1, }; static struct mc13xxx_platform_data mc13783_pdata __initdata = { diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c index 7716f83aecdd..643a3d749703 100644 --- a/arch/arm/mach-imx/mach-mx31moboard.c +++ b/arch/arm/mach-imx/mach-mx31moboard.c @@ -152,14 +152,8 @@ static const struct imxi2c_platform_data moboard_i2c1_data __initconst = { .bitrate = 100000, }; -static int moboard_spi1_cs[] = { - MXC_SPI_CS(0), - MXC_SPI_CS(2), -}; - static const struct spi_imx_master moboard_spi1_pdata __initconst = { - .chipselect = moboard_spi1_cs, - .num_chipselect = ARRAY_SIZE(moboard_spi1_cs), + .num_chipselect = 3, }; static struct regulator_consumer_supply sdhc_consumers[] = { @@ -296,19 +290,14 @@ static struct spi_board_info moboard_spi_board_info[] __initdata = { /* irq number is run-time assigned */ .max_speed_hz = 300000, .bus_num = 1, - .chip_select = 1, + .chip_select = 0, .platform_data = &moboard_pmic, .mode = SPI_CS_HIGH, }, }; -static int moboard_spi2_cs[] = { - MXC_SPI_CS(0), MXC_SPI_CS(1), -}; - static const struct spi_imx_master moboard_spi2_pdata __initconst = { - .chipselect = moboard_spi2_cs, - .num_chipselect = ARRAY_SIZE(moboard_spi2_cs), + .num_chipselect = 2, }; #define SDHC1_CD IOMUX_TO_GPIO(MX31_PIN_ATA_CS0) diff --git a/arch/arm/mach-imx/mach-pcm037_eet.c b/arch/arm/mach-imx/mach-pcm037_eet.c index 95bd97710494..15bc956d466b 100644 --- a/arch/arm/mach-imx/mach-pcm037_eet.c +++ b/arch/arm/mach-imx/mach-pcm037_eet.c @@ -56,11 +56,8 @@ static struct spi_board_info pcm037_spi_dev[] = { }; /* Platform Data for MXC CSPI */ -static int pcm037_spi1_cs[] = { MXC_SPI_CS(0), MXC_SPI_CS(1), }; - static const struct spi_imx_master pcm037_spi1_pdata __initconst = { - .chipselect = pcm037_spi1_cs, - .num_chipselect = ARRAY_SIZE(pcm037_spi1_cs), + .num_chipselect = 2, }; /* GPIO-keys input device */ diff --git a/include/linux/platform_data/spi-imx.h b/include/linux/platform_data/spi-imx.h index 6f012fefa1a2..328f670d10bd 100644 --- a/include/linux/platform_data/spi-imx.h +++ b/include/linux/platform_data/spi-imx.h @@ -5,24 +5,29 @@ /* * struct spi_imx_master - device.platform_data for SPI controller devices. - * @chipselect: Array of chipselects for this master. Numbers >= 0 mean gpio - * pins, numbers < 0 mean internal CSPI chipselects according - * to MXC_SPI_CS(). Normally you want to use gpio based chip - * selects as the CSPI module tries to be intelligent about - * when to assert the chipselect: The CSPI module deasserts the - * chipselect once it runs out of input data. The other problem - * is that it is not possible to mix between high active and low - * active chipselects on one single bus using the internal - * chipselects. Unfortunately Freescale decided to put some + * @chipselect: Array of chipselects for this master or NULL. Numbers >= 0 + * mean GPIO pins, -ENOENT means internal CSPI chipselect + * matching the position in the array. E.g., if chipselect[1] = + * -ENOENT then a SPI slave using chip select 1 will use the + * native SS1 line of the CSPI. Omitting the array will use + * all native chip selects. 
+ + * Normally you want to use gpio based chip selects as the CSPI + * module tries to be intelligent about when to assert the + * chipselect: The CSPI module deasserts the chipselect once it + * runs out of input data. The other problem is that it is not + * possible to mix between high active and low active chipselects + * on one single bus using the internal chipselects. + * Unfortunately, on some SoCs, Freescale decided to put some * chipselects on dedicated pins which are not usable as gpios, * so we have to support the internal chipselects. - * @num_chipselect: ARRAY_SIZE(chipselect) + * + * @num_chipselect: If @chipselect is specified, ARRAY_SIZE(chipselect), + * otherwise the number of native chip selects. */ struct spi_imx_master { int *chipselect; int num_chipselect; }; -#define MXC_SPI_CS(no) ((no) - 32) - #endif /* __MACH_SPI_H_*/ -- cgit v1.2.3 From a919525ad832d2bb1388b2303832a2307b30aeff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:07 -0700 Subject: net: Move fib_convert_metrics to metrics file Move logic of fib_convert_metrics into ip_metrics_convert. This allows the code that converts netlink attributes into metrics struct to be re-used in a later patch by IPv6. This is mostly a code move with the following changes to variable names: - fi->fib_net becomes net - fc_mx and fc_mx_len are passed as inputs pulled from fib_config - metrics array is passed as an input from fi->fib_metrics->metrics Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 3 +++ net/ipv4/Makefile | 3 ++- net/ipv4/fib_semantics.c | 43 ++------------------------------------- net/ipv4/metrics.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 42 deletions(-) create mode 100644 net/ipv4/metrics.c (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index ecffd843e7b8..dc4a2d6e58a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -396,6 +396,9 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics); + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index a07b7dd06def..b379520f9133 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -13,7 +13,8 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ - inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ + metrics.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c27122f01b87..6608db23f54b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1019,47 +1019,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - - if (!cfg->fc_mx) - return 0; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (type > RTAX_MAX) - return -EINVAL; - - if (type == RTAX_CC_ALGO) { - char 
tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - return -EINVAL; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - return -EINVAL; - fi->fib_metrics->metrics[type - 1] = val; - } - - if (ecn_ca) - fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; - - return 0; + return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, + fi->fib_metrics->metrics); } struct fib_info *fib_create_info(struct fib_config *cfg, diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c new file mode 100644 index 000000000000..5121c6475e6b --- /dev/null +++ b/net/ipv4/metrics.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!fc_mx) + return 0; + + nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + metrics[type - 1] = val; + } + + if (ecn_ca) + metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_metrics_convert); -- cgit v1.2.3 From 7aef6859ee91ea867a3dff9ba47bca9b2de382f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:10 -0700 Subject: net/ipv6: Pass net to fib6_update_sernum Pass net namespace to fib6_update_sernum. It can not be marked const as fib6_new_sernum will change ipv6.fib6_sernum. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 3 +-- net/ipv6/route.c | 10 +++++----- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5e86fd9dc857..f0aaf1c8f1a8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -408,7 +408,7 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index deab2db6692e..74d2a3748e2f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -105,9 +105,8 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct rt6_info *rt) +void fib6_update_sernum(struct net *net, struct rt6_info *rt) { - struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; fn = rcu_dereference_protected(rt->rt6i_node, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1d738bfe893b..0a99cda9fd7b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1352,7 +1352,7 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { spin_lock_bh(&ort->rt6i_table->tb6_lock); - fib6_update_sernum(ort); + fib6_update_sernum(net, ort); spin_unlock_bh(&ort->rt6i_table->tb6_lock); fib6_force_start_gc(net); } @@ -3786,11 +3786,11 @@ void rt6_multipath_rebalance(struct rt6_info *rt) static int fib6_ifup(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; - const struct net *net = dev_net(arg->dev); + struct net *net = dev_net(arg->dev); if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { rt->rt6i_nh_flags &= ~arg->nh_flags; - fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3869,7 +3869,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; - const struct net *net = dev_net(dev); + struct net *net = dev_net(dev); if (rt == net->ipv6.ip6_null_entry) return 0; @@ -3892,7 +3892,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); - fib6_update_sernum(rt); + fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; -- cgit v1.2.3 From afb1d4b59311a8252f67c214b37ec69d8100cb55 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:11 -0700 Subject: net/ipv6: Pass net namespace to route functions Pass network namespace reference into route add, delete and get functions. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 12 ++++++----- net/ipv6/addrconf.c | 33 ++++++++++++++++-------------- net/ipv6/anycast.c | 10 +++++---- net/ipv6/ndisc.c | 12 ++++++----- net/ipv6/route.c | 54 +++++++++++++++++++++++++------------------------ 5 files changed, 66 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 08b132381984..1130a1144dfd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -101,8 +101,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct rt6_info *); -int ip6_del_rt(struct rt6_info *); +int ip6_ins_rt(struct net *net, struct rt6_info *rt); +int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); @@ -137,7 +137,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, @@ -147,9 +147,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b2c0175125db..7ff7466c52e5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1037,7 +1037,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -1187,7 +1187,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r 0, RTF_GATEWAY | RTF_DEFAULT); if (rt) { if (del_rt) - ip6_del_rt(rt); + ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) rt6_set_expires(rt, expires); @@ -2666,7 +2666,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3333,7 +3333,8 @@ static void addrconf_gre_config(struct net_device *dev) } #endif -static int fixup_permanent_addr(struct inet6_dev *idev, +static int fixup_permanent_addr(struct net *net, + struct inet6_dev *idev, struct inet6_ifaddr *ifp) { /* !rt6i_node means the host route was removed from the @@ -3343,7 +3344,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; - rt = addrconf_dst_alloc(idev, &ifp->addr, false); + rt = addrconf_dst_alloc(net, idev, &ifp->addr, false); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3367,7 +3368,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, 
return 0; } -static void addrconf_permanent_addr(struct net_device *dev) +static void addrconf_permanent_addr(struct net *net, struct net_device *dev) { struct inet6_ifaddr *ifp, *tmp; struct inet6_dev *idev; @@ -3380,7 +3381,7 @@ static void addrconf_permanent_addr(struct net_device *dev) list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { if ((ifp->flags & IFA_F_PERMANENT) && - fixup_permanent_addr(idev, ifp) < 0) { + fixup_permanent_addr(net, idev, ifp) < 0) { write_unlock_bh(&idev->lock); in6_ifa_hold(ifp); ipv6_del_addr(ifp); @@ -3449,7 +3450,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (event == NETDEV_UP) { /* restore routes for permanent addresses */ - addrconf_permanent_addr(dev); + addrconf_permanent_addr(net, dev); if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ @@ -3735,7 +3736,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -3853,6 +3854,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; bool bump_id, notify = false; + struct net *net; addrconf_join_solict(dev, &ifp->addr); @@ -3863,8 +3865,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) if (ifp->state == INET6_IFADDR_STATE_DEAD) goto out; + net = dev_net(dev); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + (net->ipv6.devconf_all->accept_dad < 1 && idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3900,8 +3903,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) * Frames right away */ if (ifp->flags & IFA_F_OPTIMISTIC) { - ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { + ip6_ins_rt(net, ifp->rt); + if (ipv6_use_optimistic_addr(net, idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. 
*/ @@ -5604,7 +5607,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * to do it again */ if (!rcu_access_pointer(ifp->rt->rt6i_node)) - ip6_ins_rt(ifp->rt); + ip6_ins_rt(net, ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) @@ -5621,11 +5624,11 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); } if (ifp->rt) { if (dst_hold_safe(&ifp->rt->dst)) - ip6_del_rt(ifp->rt); + ip6_del_rt(net, ifp->rt); } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index bbcabbba9bd8..1122fb299b86 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -247,6 +247,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct rt6_info *rt; + struct net *net; int err; ASSERT_RTNL(); @@ -265,7 +266,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } } - rt = addrconf_dst_alloc(idev, addr, true); + net = dev_net(idev->dev); + rt = addrconf_dst_alloc(net, idev, addr, true); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; @@ -286,7 +288,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); - ip6_ins_rt(rt); + ip6_ins_rt(net, rt); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -329,7 +331,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) addrconf_leave_solict(idev, &aca->aca_addr); dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); return 0; @@ -357,7 +359,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 9de4dfb126ba..3fbc3805e69b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1156,6 +1156,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; struct rt6_info *rt = NULL; + struct net *net; int lifetime; struct ndisc_options ndopts; int optlen; @@ -1253,9 +1254,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. 
*/ + net = dev_net(in6_dev->dev); if (!in6_dev->cnf.accept_ra_from_local && - ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - in6_dev->dev, 0)) { + ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1272,7 +1273,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); + rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); @@ -1285,7 +1286,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -1294,7 +1295,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); - rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); + rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, + skb->dev, pref); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0a99cda9fd7b..045811a3da76 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -850,13 +850,13 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } if (rinfo->prefix_len == 0) - rt = rt6_get_dflt_router(gwaddr, dev); + rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -1014,9 +1014,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, return err; } -int ip6_ins_rt(struct rt6_info *rt) +int ip6_ins_rt(struct net *net, struct rt6_info *rt) { - struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; + struct nl_info info = { .nl_net = net, }; struct mx6_config mxc = { .mx = NULL, }; /* Hold dst to account for the reference from the fib6 tree */ @@ -1121,14 +1121,13 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) return pcpu_rt; } -static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_make_pcpu_route(struct net *net, + struct rt6_info *rt) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); if (!pcpu_rt) { - struct net *net = dev_net(rt->dst.dev); - dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } @@ -1787,7 +1786,7 @@ uncached_rt_out: /* No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. 
*/ - pcpu_rt = rt6_make_pcpu_route(rt); + pcpu_rt = rt6_make_pcpu_route(net, rt); rt6_release(rt); } else { /* rt is already removed from tree */ @@ -2088,7 +2087,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (rt6_check_expired(rt)) { - ip6_del_rt(rt); + ip6_del_rt(dev_net(dst->dev), rt); dst = NULL; } } else { @@ -2109,7 +2108,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) - ip6_del_rt(rt); + ip6_del_rt(dev_net(rt->dst.dev), rt); } else { struct fib6_node *fn; @@ -3018,9 +3017,9 @@ out: static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) { - int err; + struct net *net = info->nl_net; struct fib6_table *table; - struct net *net = dev_net(rt->dst.dev); + int err; if (rt == net->ipv6.ip6_null_entry) { err = -ENOENT; @@ -3037,11 +3036,10 @@ out: return err; } -int ip6_del_rt(struct rt6_info *rt) +int ip6_del_rt(struct net *net, struct rt6_info *rt) { - struct nl_info info = { - .nl_net = dev_net(rt->dst.dev), - }; + struct nl_info info = { .nl_net = net }; + return __ip6_del_rt(rt, &info); } @@ -3376,13 +3374,15 @@ static struct rt6_info *rt6_add_route_info(struct net *net, } #endif -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, + struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; struct rt6_info *rt; struct fib6_table *table; - table = fib6_get_table(dev_net(dev), tb_id); + table = fib6_get_table(net, tb_id); if (!table) return NULL; @@ -3399,7 +3399,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev return rt; } -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { @@ -3412,7 +3413,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, - .fc_nlinfo.nl_net = dev_net(dev), + .fc_nlinfo.nl_net = net, }; cfg.fc_gateway = *gwaddr; @@ -3425,10 +3426,11 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } - return rt6_get_dflt_router(gwaddr, dev); + return rt6_get_dflt_router(net, gwaddr, dev); } -static void __rt6_purge_dflt_routers(struct fib6_table *table) +static void __rt6_purge_dflt_routers(struct net *net, + struct fib6_table *table) { struct rt6_info *rt; @@ -3439,7 +3441,7 @@ restart: (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { if (dst_hold_safe(&rt->dst)) { rcu_read_unlock(); - ip6_del_rt(rt); + ip6_del_rt(net, rt); } else { rcu_read_unlock(); } @@ -3463,7 +3465,7 @@ void rt6_purge_dflt_routers(struct net *net) head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) - __rt6_purge_dflt_routers(table); + __rt6_purge_dflt_routers(net, table); } } @@ -3583,12 +3585,12 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. 
*/ -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, + struct inet6_dev *idev, const struct in6_addr *addr, bool anycast) { u32 tb_id; - struct net *net = dev_net(idev->dev); struct net_device *dev = idev->dev; struct rt6_info *rt; -- cgit v1.2.3 From e8478e80e5a74f4ce47b043735f0066588fb64c7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:13 -0700 Subject: net/ipv6: Save route type in rt6_info The RTN_ type for IPv6 FIB entries is currently embedded in rt6i_flags and dst.error. Since dst is going to be removed, it can no longer be relied on for FIB dumps so save the route type as fib6_type. fc_type is set in current users based on the algorithm in rt6_fill_node: - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST - else fc_type = RTN_UNICAST Similarly, fib6_type is set in the rt6_info templates based on the RTF_REJECT section of rt6_fill_node converting dst.error to RTN type. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + net/ipv6/addrconf.c | 2 ++ net/ipv6/route.c | 46 ++++++++++++++++++++-------------------------- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0aaf1c8f1a8..0165820bbafb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,6 +174,7 @@ struct rt6_info { int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7ff7466c52e5..f71fcf2635d5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2331,6 +2331,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_flags = RTF_UP | flags, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, + .fc_type = RTN_UNICAST, }; cfg.fc_dst = *pfx; @@ -2394,6 +2395,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, + .fc_type = RTN_UNICAST, .fc_nlinfo.nl_net = dev_net(dev), }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0daf4c9c9f2b..1cc01f5bb773 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -307,6 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -324,6 +325,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_PROHIBIT, }; static const struct rt6_info ip6_blk_hole_entry_template = { @@ -339,6 +341,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_BLACKHOLE, }; #endif @@ -2802,6 +2805,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -2914,6 +2922,8 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_metric = cfg->fc_metric; rt->rt6i_nh_weight = 1; + rt->fib6_type = cfg->fc_type; + /* We cannot add 
true routes via loopback here, they would result in kernel looping; promote them to reject routes */ @@ -3354,6 +3364,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3410,6 +3421,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3485,6 +3497,7 @@ static void rtmsg_to_fib6_config(struct net *net, cfg->fc_dst_len = rtmsg->rtmsg_dst_len; cfg->fc_src_len = rtmsg->rtmsg_src_len; cfg->fc_flags = rtmsg->rtmsg_flags; + cfg->fc_type = rtmsg->rtmsg_type; cfg->fc_nlinfo.nl_net = net; @@ -3606,10 +3619,13 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (anycast) + if (anycast) { + rt->fib6_type = RTN_ANYCAST; rt->rt6i_flags |= RTF_ANYCAST; - else + } else { + rt->fib6_type = RTN_LOCAL; rt->rt6i_flags |= RTF_LOCAL; + } rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; @@ -4509,30 +4525,8 @@ static int rt6_fill_node(struct net *net, rtm->rtm_table = table; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; - if (rt->rt6i_flags & RTF_REJECT) { - switch (rt->dst.error) { - case -EINVAL: - rtm->rtm_type = RTN_BLACKHOLE; - break; - case -EACCES: - rtm->rtm_type = RTN_PROHIBIT; - break; - case -EAGAIN: - rtm->rtm_type = RTN_THROW; - break; - default: - rtm->rtm_type = RTN_UNREACHABLE; - break; - } - } - else if (rt->rt6i_flags & RTF_LOCAL) - rtm->rtm_type = RTN_LOCAL; - else if (rt->rt6i_flags & RTF_ANYCAST) - rtm->rtm_type = RTN_ANYCAST; - else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) - rtm->rtm_type = RTN_LOCAL; - else - rtm->rtm_type = RTN_UNICAST; + + rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; -- cgit v1.2.3 From 5e670d844b2a4e47d1b9b9aceb14dd3c12a6d4bf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:14 -0700 Subject: net/ipv6: Move nexthop data to fib6_nh Introduce fib6_nh structure and move nexthop related data from rt6_info and rt6_info.dst to fib6_nh. References to dev, gateway or lwtstate from a FIB lookup perspective are converted to use fib6_nh; datapath references to dst version are left as is. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 32 ++-- include/net/ip6_fib.h | 16 +- include/net/ip6_route.h | 6 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 6 +- net/ipv6/route.c | 162 +++++++++++---------- 6 files changed, 125 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 1904c0323d39..d995a0b52d7c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2770,9 +2770,9 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp, struct in6_addr *gw; int ifindex, weight; - ifindex = mlxsw_sp_rt6->rt->dst.dev->ifindex; - weight = mlxsw_sp_rt6->rt->rt6i_nh_weight; - gw = &mlxsw_sp_rt6->rt->rt6i_gateway; + ifindex = mlxsw_sp_rt6->rt->fib6_nh.nh_dev->ifindex; + weight = mlxsw_sp_rt6->rt->fib6_nh.nh_weight; + gw = &mlxsw_sp_rt6->rt->fib6_nh.nh_gw; if (!mlxsw_sp_nexthop6_group_has_nexthop(nh_grp, gw, ifindex, weight)) return false; @@ -2838,7 +2838,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed) struct net_device *dev; list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - dev = mlxsw_sp_rt6->rt->dst.dev; + dev = mlxsw_sp_rt6->rt->fib6_nh.nh_dev; val ^= dev->ifindex; } @@ -3836,9 +3836,9 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; struct rt6_info *rt = mlxsw_sp_rt6->rt; - if (nh->rif && nh->rif->dev == rt->dst.dev && + if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev && ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr, - &rt->rt6i_gateway)) + &rt->fib6_nh.nh_gw)) return nh; continue; } @@ -3895,7 +3895,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) { list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, - list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; + list)->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD; return; } @@ -3905,9 +3905,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); if (nh && nh->offloaded) - mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; + mlxsw_sp_rt6->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD; else - mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + mlxsw_sp_rt6->rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -3922,7 +3922,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { struct rt6_info *rt = mlxsw_sp_rt6->rt; - rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -4818,8 +4818,8 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, const struct rt6_info *rt, enum mlxsw_sp_ipip_type *ret) { - return rt->dst.dev && - mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->dst.dev, ret); + return rt->fib6_nh.nh_dev && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.nh_dev, ret); } static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, @@ -4829,7 +4829,7 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, { const struct mlxsw_sp_ipip_ops *ipip_ops; struct mlxsw_sp_ipip_entry *ipip_entry; - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; struct mlxsw_sp_rif *rif; int err; @@ -4872,11 +4872,11 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, 
struct mlxsw_sp_nexthop *nh, const struct rt6_info *rt) { - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; nh->nh_grp = nh_grp; - nh->nh_weight = rt->rt6i_nh_weight; - memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr)); + nh->nh_weight = rt->fib6_nh.nh_weight; + memcpy(&nh->gw_addr, &rt->fib6_nh.nh_gw, sizeof(nh->gw_addr)); mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0165820bbafb..f0a88370ba95 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -127,6 +127,16 @@ struct rt6_exception { #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 +struct fib6_nh { + struct in6_addr nh_gw; + struct net_device *nh_dev; + struct lwtunnel_state *nh_lwtstate; + + unsigned int nh_flags; + atomic_t nh_upper_bound; + int nh_weight; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -149,12 +159,9 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; - atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; - unsigned int rt6i_nh_flags; - /* These are in a separate cache line. */ struct rt6key rt6i_dst ____cacheline_aligned_in_smp; u32 rt6i_flags; @@ -171,13 +178,14 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ - int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; + + struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1130a1144dfd..655e13017a45 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -273,10 +273,10 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) { - return a->dst.dev == b->dst.dev && + return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && - ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && - !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); + ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && + !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } #endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f71fcf2635d5..483c8772e856 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2369,7 +2369,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != dev->ifindex) + if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) continue; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 74d2a3748e2f..64b73e65f114 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2221,6 +2221,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct rt6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + const struct net_device *dev; seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); @@ -2230,14 +2231,15 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (rt->rt6i_flags & RTF_GATEWAY) - seq_printf(seq, "%pi6", &rt->rt6i_gateway); + seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw); else seq_puts(seq, 
"00000000000000000000000000000000"); + dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), rt->dst.__use, rt->rt6i_flags, - rt->dst.dev ? rt->dst.dev->name : ""); + dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1cc01f5bb773..222af19d3403 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -466,12 +466,15 @@ static struct rt6_info *rt6_multipath_select(const struct net *net, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) return match; list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, rt6i_siblings) { - if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + int nh_upper_bound; + + nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); + if (fl6->mp_hash > nh_upper_bound) continue; if (rt6_score_route(sibling, oif, strict) < 0) break; @@ -495,13 +498,14 @@ static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *local = NULL; struct rt6_info *sprt; - if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + if (!oif && ipv6_addr_any(saddr) && + !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) return rt; for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { - struct net_device *dev = sprt->dst.dev; + const struct net_device *dev = sprt->fib6_nh.nh_dev; - if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; if (oif) { @@ -533,7 +537,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, return net->ipv6.ip6_null_entry; } - return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; + return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? 
net->ipv6.ip6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -558,7 +562,10 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct rt6_info *rt) { struct __rt6_probe_work *work; + const struct in6_addr *nh_gw; struct neighbour *neigh; + struct net_device *dev; + /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it @@ -569,8 +576,11 @@ static void rt6_probe(struct rt6_info *rt) */ if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) return; + + nh_gw = &rt->fib6_nh.nh_gw; + dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (neigh->nud_state & NUD_VALID) goto out; @@ -592,9 +602,9 @@ static void rt6_probe(struct rt6_info *rt) if (work) { INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; + work->target = *nh_gw; + dev_hold(dev); + work->dev = dev; schedule_work(&work->work); } @@ -612,7 +622,8 @@ static inline void rt6_probe(struct rt6_info *rt) */ static inline int rt6_check_dev(struct rt6_info *rt, int oif) { - struct net_device *dev = rt->dst.dev; + const struct net_device *dev = rt->fib6_nh.nh_dev; + if (!oif || dev->ifindex == oif) return 2; if ((dev->flags & IFF_LOOPBACK) && @@ -623,15 +634,16 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) { - struct neighbour *neigh; enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; + struct neighbour *neigh; if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) return RT6_NUD_SUCCEED; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, + &rt->fib6_nh.nh_gw); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -679,11 +691,11 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; if (idev->cnf.ignore_routes_with_linkdown && - rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -888,7 +900,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) { - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the @@ -928,7 +940,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->dst.lastuse = jiffies; - rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->rt6i_flags; rt6_set_from(rt, ort); rt->rt6i_metric = ort->rt6i_metric; @@ -937,7 +949,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); + rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, @@ -1308,7 +1320,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 
static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { - struct net *net = dev_net(ort->dst.dev); + struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -2313,7 +2325,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; if (rt6_check_expired(rt)) continue; @@ -2321,14 +2333,14 @@ restart: break; if (!(rt->rt6i_flags & RTF_GATEWAY)) continue; - if (fl6->flowi6_oif != rt->dst.dev->ifindex) + if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) continue; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. */ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); @@ -2905,7 +2917,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, &lwtstate, extack); if (err) goto out; - rt->dst.lwtstate = lwtstate_get(lwtstate); + rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); lwtunnel_set_redirect(&rt->dst); } @@ -2920,7 +2932,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; - rt->rt6i_nh_weight = 1; + rt->fib6_nh.nh_weight = 1; rt->fib6_type = cfg->fc_type; @@ -2975,7 +2987,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; - rt->rt6i_gateway = cfg->fc_gateway; + rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway; } err = -ENODEV; @@ -3010,9 +3022,9 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, install_route: if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; - rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->dst.dev = dev; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); + rt->fib6_nh.nh_dev = rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3171,11 +3183,11 @@ static int ip6_route_del(struct fib6_config *cfg, rt = rt_cache; } if (cfg->fc_ifindex && - (!rt->dst.dev || - rt->dst.dev->ifindex != cfg->fc_ifindex)) + (!rt->fib6_nh.nh_dev || + rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && - !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) continue; if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) continue; @@ -3337,11 +3349,11 @@ static struct rt6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != ifindex) + if (rt->fib6_nh.nh_dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; - if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; ip6_hold_safe(NULL, &rt, false); break; @@ -3398,9 +3410,9 @@ struct rt6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (dev == rt->dst.dev && + if (dev == rt->fib6_nh.nh_dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == 
(RTF_ADDRCONF | RTF_DEFAULT)) && - ipv6_addr_equal(&rt->rt6i_gateway, addr)) + ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } if (rt) @@ -3627,6 +3639,8 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, rt->rt6i_flags |= RTF_LOCAL; } + rt->fib6_nh.nh_gw = *addr; + rt->fib6_nh.nh_dev = dev; rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; @@ -3649,7 +3663,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->dst.dev == dev || !dev) && + if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3681,7 +3695,7 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) struct in6_addr *gateway = (struct in6_addr *)arg; if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { return -1; } @@ -3729,8 +3743,8 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) static bool rt6_is_dead(const struct rt6_info *rt) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD || - (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || + (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) return true; @@ -3743,11 +3757,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->rt6i_nh_weight; + total += rt->fib6_nh.nh_weight; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { if (!rt6_is_dead(iter)) - total += iter->rt6i_nh_weight; + total += iter->fib6_nh.nh_weight; } return total; @@ -3758,11 +3772,11 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->rt6i_nh_weight; + *weight += rt->fib6_nh.nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) @@ -3805,8 +3819,8 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { - rt->rt6i_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { + rt->fib6_nh.nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3834,10 +3848,10 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt, { struct rt6_info *iter; - if (rt->dst.dev == dev) + if (rt->fib6_nh.nh_dev == dev) return true; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) + if (iter->fib6_nh.nh_dev == dev) return true; return false; @@ -3858,11 +3872,12 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, struct rt6_info *iter; unsigned int dead = 0; - if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_dev == down_dev || + rt->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == down_dev || - iter->rt6i_nh_flags & RTNH_F_DEAD) + if 
(iter->fib6_nh.nh_dev == down_dev || + iter->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -3874,11 +3889,11 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt, { struct rt6_info *iter; - if (rt->dst.dev == dev) - rt->rt6i_nh_flags |= nh_flags; + if (rt->fib6_nh.nh_dev == dev) + rt->fib6_nh.nh_flags |= nh_flags; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) - iter->rt6i_nh_flags |= nh_flags; + if (iter->fib6_nh.nh_dev == dev) + iter->fib6_nh.nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -3893,12 +3908,12 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) switch (arg->event) { case NETDEV_UNREGISTER: - return rt->dst.dev == dev ? -1 : 0; + return rt->fib6_nh.nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->rt6i_nsiblings) - return rt->dst.dev == dev ? -1 : 0; + return rt->fib6_nh.nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -3914,10 +3929,10 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->dst.dev != dev || + if (rt->fib6_nh.nh_dev != dev || rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -3969,7 +3984,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. jumbo frame) */ - if (rt->dst.dev == arg->dev && + if (rt->fib6_nh.nh_dev == arg->dev && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); if (dst_metric_raw(&rt->dst, RTAX_MTU) && @@ -4255,7 +4270,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { @@ -4412,7 +4427,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->dst.lwtstate); + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); nexthop_len *= rt->rt6i_nsiblings; } @@ -4430,38 +4445,38 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->dst.lwtstate) + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) + nexthop_len; } static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; - if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { + if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; } if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) goto nla_put_failure; } - *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); - if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) + *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); + if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; /* not needed for multipath encoding b/c it has a rtnexthop 
struct */ - if (!skip_oif && rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + if (!skip_oif && rt->fib6_nh.nh_dev && + nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) goto nla_put_failure; - if (rt->dst.lwtstate && - lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + if (rt->fib6_nh.nh_lwtstate && + lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) goto nla_put_failure; return 0; @@ -4473,6 +4488,7 @@ nla_put_failure: /* add multipath next hop */ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) { + const struct net_device *dev = rt->fib6_nh.nh_dev; struct rtnexthop *rtnh; unsigned int flags = 0; @@ -4480,8 +4496,8 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; - rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; + rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; + rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) goto nla_put_failure; -- cgit v1.2.3 From d4ead6b34b67fd711639324b6465a050bcb197d4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:16 -0700 Subject: net/ipv6: move metrics from dst to rt6_info Similar to IPv4, add fib metrics to the fib struct, which at the moment is rt6_info. Will be moved to fib6_info in a later patch. Copy metrics into dst by reference using refcount. To make the transition: - add dst_metrics to rt6_info. Default to dst_default_metrics if no metrics are passed during route add. No need for a separate pmtu entry; it can reference the MTU slot in fib6_metrics - ip6_convert_metrics allocates memory in the FIB entry and uses ip_metrics_convert to copy from netlink attribute to metrics entry - the convert metrics call is done in ip6_route_info_create simplifying the route add path + fib6_commit_metrics and fib6_copy_metrics and the temporary mx6_config are no longer needed - add fib6_metric_set helper to change the value of a metric in the fib entry since dst_metric_set can no longer be used - cow_metrics for IPv6 can drop to dst_cow_metrics_generic - rt6_dst_from_metrics_check is no longer needed - rt6_fill_node needs the FIB entry and dst as separate arguments to keep compatibility with existing output. Current dst address is renamed to dest. (to be consistent with IPv4 rt6_fill_node really should be split into 2 functions similar to fib_dump_info and rt_fill_info) - rt6_fill_node no longer needs the temporary metrics variable Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 17 ++-- net/core/dst.c | 1 + net/ipv6/ip6_fib.c | 66 +++++-------- net/ipv6/ndisc.c | 10 +- net/ipv6/route.c | 257 +++++++++++++++++++------------------------------- 5 files changed, 133 insertions(+), 218 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0a88370ba95..1f8dc9d12abb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -94,11 +94,6 @@ struct fib6_gc_args { #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif -struct mx6_config { - const u32 *mx; - DECLARE_BITMAP(mx_valid, RTAX_MAX); -}; - /* * routing information * @@ -176,7 +171,6 @@ struct rt6_info { struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; - u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; @@ -185,6 +179,8 @@ struct rt6_info { should_flush:1, unused:6; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; }; @@ -390,8 +386,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack); + struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct rt6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, @@ -420,6 +415,12 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb); void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +{ + return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); +} + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/net/core/dst.c b/net/core/dst.c index 007aa0b08291..2d9b37f8944a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = { */ .refcnt = REFCOUNT_INIT(1), }; +EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 64b73e65f114..0d94c56c3e41 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -578,6 +578,24 @@ out: return res; } +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val) +{ + if (!f6i) + return; + + if (f6i->fib6_metrics == &dst_default_metrics) { + struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); + + if (!p) + return; + + refcount_set(&p->refcnt, 1); + f6i->fib6_metrics = p; + } + + f6i->fib6_metrics->metrics[metric - 1] = val; +} + /* * Routing Table * @@ -801,38 +819,6 @@ insert_above: return ln; } -static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) -{ - int i; - - for (i = 0; i < RTAX_MAX; i++) { - if (test_bit(i, mxc->mx_valid)) - mp[i] = mxc->mx[i]; - } -} - -static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) -{ - if (!mxc->mx) - return 0; - - if (dst->flags & DST_HOST) { - u32 *mp = dst_metrics_write_ptr(dst); - - if (unlikely(!mp)) - return -ENOMEM; - - fib6_copy_metrics(mp, mxc); - } else { - dst_init_metrics(dst, mxc->mx, false); - - /* We've stolen mx now. 
*/ - mxc->mx = NULL; - } - - return 0; -} - static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { @@ -866,7 +852,7 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, + struct nl_info *info, struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, @@ -923,7 +909,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); - iter->rt6i_pmtu = rt->rt6i_pmtu; + fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, @@ -1002,9 +988,6 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, @@ -1035,10 +1018,6 @@ add: return -ENOENT; } - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; - err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1135,8 +1114,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; @@ -1244,7 +1222,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc, extack); + err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3fbc3805e69b..b058ea9ecec0 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1323,9 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); + fib6_metric_set(rt, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } @@ -1477,10 +1476,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; - - if (rt) - dst_metric_set(&rt->dst, RTAX_MTU, mtu); - + fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3b301aafd2ed..62aafd06c35f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -96,12 +96,11 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); static size_t rt6_nlmsg_size(struct rt6_info *rt); -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, struct in6_addr *src, +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct rt6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct 
in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, @@ -183,23 +182,6 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) -{ - return dst_metrics_write_ptr(&rt->from->dst); -} - -static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - struct rt6_info *rt = (struct rt6_info *)dst; - - if (rt->rt6i_flags & RTF_PCPU) - return rt6_pcpu_cow_metrics(rt); - else if (rt->rt6i_flags & RTF_CACHE) - return NULL; - else - return dst_cow_metrics_generic(dst, old); -} - static inline const void *choose_neigh_daddr(struct rt6_info *rt, struct sk_buff *skb, const void *daddr) @@ -249,7 +231,7 @@ static struct dst_ops ip6_dst_ops_template = { .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, - .cow_metrics = ipv6_cow_metrics, + .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, @@ -353,6 +335,7 @@ static void rt6_info_init(struct rt6_info *rt) memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); + rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; } /* allocate dst with ip6_dst_ops */ @@ -395,6 +378,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; struct inet6_dev *idev; + struct dst_metrics *m; dst_destroy_metrics_generic(dst); free_percpu(rt->rt6i_pcpu); @@ -411,6 +395,10 @@ static void ip6_dst_destroy(struct dst_entry *dst) kfree(bucket); } + m = rt->fib6_metrics; + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) + kfree(m); + rt->from = NULL; dst_release(&from->dst); } @@ -996,7 +984,11 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); rt->from = from; - dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); + dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); + if (from->fib6_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&from->fib6_metrics->refcnt); + } } static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) @@ -1140,7 +1132,6 @@ EXPORT_SYMBOL(rt6_lookup); */ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, - struct mx6_config *mxc, struct netlink_ext_ack *extack) { int err; @@ -1148,7 +1139,7 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, table = rt->rt6i_table; spin_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info, mxc, extack); + err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; @@ -1157,11 +1148,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, int ip6_ins_rt(struct net *net, struct rt6_info *rt) { struct nl_info info = { .nl_net = net, }; - struct mx6_config mxc = { .mx = NULL, }; /* Hold dst to account for the reference from the fib6 tree */ dst_hold(&rt->dst); - return __ip6_ins_rt(rt, &info, &mxc, NULL); + return __ip6_ins_rt(rt, &info, NULL); } static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, @@ -1232,8 +1222,8 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) - 
rt6_dst_from_metrics_check(pcpu_rt); + if (pcpu_rt) + ip6_hold_safe(NULL, &pcpu_rt, false); return pcpu_rt; } @@ -1254,7 +1244,6 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); - rt6_dst_from_metrics_check(pcpu_rt); return pcpu_rt; } @@ -1384,6 +1373,16 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } +static unsigned int fib6_mtu(const struct rt6_info *rt) +{ + unsigned int mtu; + + mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6; + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); + + return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); +} + static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { @@ -1436,7 +1435,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, * Only insert this exception route if its mtu * is less than ort's mtu value. */ - if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { err = -EINVAL; goto out; } @@ -1673,12 +1672,12 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected - * route), the metrics of its rt->dst.from have already + * route), the metrics of its rt->from have already * been updated. */ - if (entry->rt6i_pmtu && + if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) - entry->rt6i_pmtu = mtu; + dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } @@ -1844,10 +1843,9 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (rt->rt6i_flags & RTF_CACHE) { - if (ip6_hold_safe(net, &rt, true)) { + if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - rt6_dst_from_metrics_check(rt); - } + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table, fl6); return rt; @@ -2147,13 +2145,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ -static void rt6_dst_from_metrics_check(struct rt6_info *rt) -{ - if (rt->from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); -} - static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; @@ -2188,8 +2179,6 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * into this function always. 
*/ - rt6_dst_from_metrics_check(rt); - if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); @@ -2242,8 +2231,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); + dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; - rt->rt6i_pmtu = mtu; rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } @@ -2289,10 +2278,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, } else if (daddr) { struct rt6_info *nrt6; - nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, rt6)) + if (rt6_insert_exception(nrt6, rt6->from)) dst_release_immediate(&nrt6->dst); } } @@ -2533,12 +2522,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - - if (mtu) - goto out; + unsigned int mtu; mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) @@ -2622,60 +2607,24 @@ out: return entries > rt_max_size; } -static int ip6_convert_metrics(struct mx6_config *mxc, - const struct fib6_config *cfg) +static int ip6_convert_metrics(struct net *net, struct rt6_info *rt, + struct fib6_config *cfg) { - struct net *net = cfg->fc_nlinfo.nl_net; - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - u32 *mp; - - if (!cfg->fc_mx) - return 0; - - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); - if (unlikely(!mp)) - return -ENOMEM; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (unlikely(type > RTAX_MAX)) - goto err; - - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; + int err = 0; - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - goto err; + if (cfg->fc_mx) { + rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics), + GFP_KERNEL); + if (unlikely(!rt->fib6_metrics)) + return -ENOMEM; - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); - } + refcount_set(&rt->fib6_metrics->refcnt, 1); - if (ecn_ca) { - __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); - mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, + rt->fib6_metrics->metrics); } - mxc->mx = mp; - return 0; - err: - kfree(mp); - return -EINVAL; + return err; } static struct rt6_info *ip6_nh_lookup_table(struct net *net, @@ -2955,6 +2904,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + err = ip6_convert_metrics(net, rt, cfg); + if (err < 0) + goto out; + if (cfg->fc_flags & RTF_EXPIRES) rt6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); @@ -3078,32 +3031,16 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg, - struct netlink_ext_ack *extack) +int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct mx6_config mxc = { .mx = NULL, }; struct rt6_info *rt; int err; rt = ip6_route_info_create(cfg, extack); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto out; - } - - err = 
ip6_convert_metrics(&mxc, cfg); - if (err) - goto out; - - err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); - - kfree(mxc.mx); + if (IS_ERR(rt)) + return PTR_ERR(rt); - return err; -out: - if (rt) - dst_release_immediate(&rt->dst); + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); return err; } @@ -3157,7 +3094,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; - if (rt6_fill_node(net, skb, rt, + if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); @@ -3348,7 +3285,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * a cached route because rt6_insert_exception() will * takes care of it */ - if (rt6_insert_exception(nrt, rt)) { + if (rt6_insert_exception(nrt, rt->from)) { dst_release_immediate(&nrt->dst); goto out; } @@ -4018,11 +3955,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) update PMTU increase is a MUST. (i.e. jumbo frame) */ if (rt->fib6_nh.nh_dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU)) { + !fib6_metric_locked(rt, RTAX_MTU)) { + u32 mtu = rt->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(rt, RTAX_MTU, arg->mtu); + spin_lock_bh(&rt6_exception_lock); - if (dst_metric_raw(&rt->dst, RTAX_MTU) && - rt6_mtu_change_route_allowed(idev, rt, arg->mtu)) - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); rt6_exceptions_update_pmtu(idev, rt, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } @@ -4183,7 +4123,6 @@ errout: struct rt6_nh { struct rt6_info *rt6_info; struct fib6_config r_cfg; - struct mx6_config mxc; struct list_head next; }; @@ -4198,7 +4137,8 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) } } -static int ip6_route_info_append(struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct net *net, + struct list_head *rt6_nh_list, struct rt6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; @@ -4214,7 +4154,7 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, if (!nh) return -ENOMEM; nh->rt6_info = rt; - err = ip6_convert_metrics(&nh->mxc, r_cfg); + err = ip6_convert_metrics(net, rt, r_cfg); if (err) { kfree(nh); return err; @@ -4305,7 +4245,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); + err = ip6_route_info_append(info->nl_net, &rt6_nh_list, + rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); goto cleanup; @@ -4323,7 +4264,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { rt_last = nh->rt6_info; - err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); + err = __ip6_ins_rt(nh->rt6_info, info, extack); /* save reference to first route for notification */ if (!rt_notif && !err) rt_notif = nh->rt6_info; @@ -4372,7 +4313,6 @@ cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) dst_release_immediate(&nh->rt6_info->dst); - kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); } @@ -4546,16 +4486,16 @@ nla_put_failure: return -EMSGSIZE; } -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, struct in6_addr *src, +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct rt6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct 
in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; - long expires; + long expires = 0; + u32 *pmetrics; u32 table; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); @@ -4583,8 +4523,8 @@ static int rt6_fill_node(struct net *net, if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; - if (dst) { - if (nla_put_in6_addr(skb, RTA_DST, dst)) + if (dest) { + if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) @@ -4612,9 +4552,9 @@ static int rt6_fill_node(struct net *net, #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; - } else if (dst) { + } else if (dest) { struct in6_addr saddr_buf; - if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && + if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -4626,10 +4566,8 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); - if (rt->rt6i_pmtu) - metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; - if (rtnetlink_put_metrics(skb, metrics) < 0) + pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; + if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) @@ -4661,9 +4599,10 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; + if (rt->rt6i_flags & RTF_EXPIRES && dst) + expires = dst->expires - jiffies; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) + if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) goto nla_put_failure; if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) @@ -4697,10 +4636,9 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) } } - return rt6_fill_node(net, - arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, - NLM_F_MULTI); + return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, @@ -4814,13 +4752,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_dst_set(skb, &rt->dst); if (fibmatch) - err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); + err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr, + iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + 0); if (err < 0) { kfree_skb(skb); goto errout; @@ -4846,8 +4785,8 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, if (!skb) goto errout; - err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, nlm_flags); + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + event, info->portid, seq, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); -- cgit v1.2.3 From 14895687d36805f051bb54014c32e48e5937f7e1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:17 -0700 Subject: net/ipv6: move expires into rt6_info Add expires to rt6_info for FIB entries, and add fib6 helpers to manage it. Data path use of dst.expires remains. The transition is fairly straightforward: when working with fib entries, rt->dst.expires is just rt->expires, rt6_clean_expires is replaced with fib6_clean_expires, rt6_set_expires becomes fib6_set_expires, and rt6_check_expired becomes fib6_check_expired, where the fib6 versions are added by this patch. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 27 +++++++++++++++++++++++---- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ip6_fib.c | 8 ++++---- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 20 +++++++++++--------- 5 files changed, 42 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1f8dc9d12abb..c73b985734f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -179,6 +179,7 @@ struct rt6_info { should_flush:1, unused:6; + unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; @@ -197,6 +198,26 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline void fib6_clean_expires(struct rt6_info *f6i) +{ + f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_set_expires(struct rt6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->rt6i_flags |= RTF_EXPIRES; +} + +static inline bool fib6_check_expired(const struct rt6_info *f6i) +{ + if (f6i->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, f6i->expires); + return false; +} + static inline void rt6_clean_expires(struct rt6_info *rt) { rt->rt6i_flags &= ~RTF_EXPIRES; @@ -211,11 +232,9 @@ static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { - struct rt6_info *rt; + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = rt0->from->expires; - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); - if (rt && rt != rt0) - rt0->dst.expires = rt->dst.expires; dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 483c8772e856..a156f1a0b1a7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1190,7 +1190,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_set_expires(rt, expires); + fib6_set_expires(rt, expires); ip6_rt_put(rt); } } @@ -2672,9 +2672,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ - rt6_set_expires(rt, jiffies + rt_expires); + fib6_set_expires(rt, jiffies + rt_expires); } else { - rt6_clean_expires(rt); + fib6_clean_expires(rt); } } else if (valid_lft) { clock_t expires = 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0d94c56c3e41..f25f4d9831e8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -906,9 +906,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_clean_expires(iter); + fib6_clean_expires(iter); else - rt6_set_expires(iter, rt->dst.expires); + fib6_set_expires(iter, rt->expires); fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } @@ -2003,8 +2003,8 @@ static int fib6_age(struct rt6_info *rt, void *arg) * Routes are expired even if they are in use. 
 */
-	if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
-		if (time_after(now, rt->dst.expires)) {
+	if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) {
+		if (time_after(now, rt->expires)) {
 			RT6_TRACE("expiring %p\n", rt);
 			return -1;
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b058ea9ecec0..e4d9eea92139 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1318,7 +1318,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	}
 
 	if (rt)
-		rt6_set_expires(rt, jiffies + (HZ * lifetime));
+		fib6_set_expires(rt, jiffies + (HZ * lifetime));
 	if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
 	    ra_msg->icmph.icmp6_hop_limit) {
 		if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 62aafd06c35f..f6ca55d21fac 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -435,7 +435,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 			return true;
 	} else if (rt->from) {
 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
-			rt6_check_expired(rt->from);
+			fib6_check_expired(rt->from);
 	}
 	return false;
 }
@@ -687,7 +687,7 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
 
-	if (rt6_check_expired(rt))
+	if (fib6_check_expired(rt))
 		goto out;
 
 	m = rt6_score_route(rt, oif, strict);
@@ -871,9 +871,9 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 
 	if (rt) {
 		if (!addrconf_finite_timeout(lifetime))
-			rt6_clean_expires(rt);
+			fib6_clean_expires(rt);
 		else
-			rt6_set_expires(rt, jiffies + HZ * lifetime);
+			fib6_set_expires(rt, jiffies + HZ * lifetime);
 
 		ip6_rt_put(rt);
 	}
@@ -2383,7 +2383,7 @@ restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 			continue;
-		if (rt6_check_expired(rt))
+		if (fib6_check_expired(rt))
 			continue;
 		if (rt->rt6i_flags & RTF_REJECT)
 			break;
@@ -2909,10 +2909,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	if (cfg->fc_flags & RTF_EXPIRES)
-		rt6_set_expires(rt, jiffies +
+		fib6_set_expires(rt, jiffies +
 				clock_t_to_jiffies(cfg->fc_expires));
 	else
-		rt6_clean_expires(rt);
+		fib6_clean_expires(rt);
 
 	if (cfg->fc_protocol == RTPROT_UNSPEC)
 		cfg->fc_protocol = RTPROT_BOOT;
@@ -4599,8 +4599,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (rt->rt6i_flags & RTF_EXPIRES && dst)
-		expires = dst->expires - jiffies;
+	if (rt->rt6i_flags & RTF_EXPIRES) {
+		expires = dst ? dst->expires : rt->expires;
+		expires -= jiffies;
+	}
 
 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
 		goto nla_put_failure;
-- cgit v1.2.3


From 421842edeaf62c4e180b687f5a4efca8c19c49ad Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:18 -0700
Subject: net/ipv6: Add fib6_null_entry

ip6_null_entry will stay a dst-based return for lookups that fail to
match an entry. Add a new fib6_null_entry which constitutes the root
node and leaves for FIBs. Replace existing references to ip6_null_entry
with the new fib6_null_entry when dealing with FIBs.

Signed-off-by: David Ahern
Signed-off-by: David S.
Miller --- include/net/netns/ipv6.h | 3 ++- net/ipv6/ip6_fib.c | 26 ++++++++++---------- net/ipv6/route.c | 62 +++++++++++++++++++++++++++++++++--------------- 3 files changed, 58 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c29f09cfc9d7..74e4e1e449d5 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,8 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *ip6_null_entry; + struct rt6_info *fib6_null_entry; + struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f25f4d9831e8..280b69497ad0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -231,7 +231,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -369,7 +369,7 @@ struct fib6_dump_arg { static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.ip6_null_entry) + if (rt == arg->net->ipv6.fib6_null_entry) return; call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); } @@ -658,7 +658,7 @@ static struct fib6_node *fib6_add_1(struct net *net, /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == - net->ipv6.ip6_null_entry) { + net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } @@ -1171,9 +1171,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref); rcu_assign_pointer(sfn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ @@ -1212,7 +1212,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); } else { atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(fn->leaf, rt); @@ -1251,7 +1251,7 @@ out: if (!pn_leaf) { WARN_ON(!pn_leaf); pn_leaf = - info->nl_net->ipv6.ip6_null_entry; + info->nl_net->ipv6.fib6_null_entry; } #endif atomic_inc(&pn_leaf->rt6i_ref); @@ -1494,7 +1494,7 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, @@ -1531,7 +1531,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, /* Set fn->leaf to null_entry for root node. 
*/ if (fn->fn_flags & RTN_TL_ROOT) { - rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } @@ -1576,7 +1576,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); - new_fn_leaf = net->ipv6.ip6_null_entry; + new_fn_leaf = net->ipv6.fib6_null_entry; } #endif atomic_inc(&new_fn_leaf->rt6i_ref); @@ -1726,7 +1726,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) return -ENOENT; } #endif - if (!fn || rt == net->ipv6.ip6_null_entry) + if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); @@ -2087,7 +2087,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -2099,7 +2099,7 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f6ca55d21fac..7c141394d4f1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -276,6 +276,15 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; +static const struct rt6_info fib6_null_entry_template = { + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32)0, + .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, + .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, +}; + static const struct rt6_info ip6_null_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), @@ -522,10 +531,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net, return local; if (flags & RT6_LOOKUP_F_IFACE) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; } - return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; + return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -758,8 +767,8 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, bool do_rr = false; int key_plen; - if (!leaf || leaf == net->ipv6.ip6_null_entry) - return net->ipv6.ip6_null_entry; + if (!leaf || leaf == net->ipv6.fib6_null_entry) + return net->ipv6.fib6_null_entry; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) @@ -776,7 +785,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, key_plen = rt0->rt6i_src.plen; #endif if (fn->fn_bit != key_plen) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); @@ -797,7 +806,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, } } - return match ? match : net->ipv6.ip6_null_entry; + return match ? 
match : net->ipv6.fib6_null_entry; } static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) @@ -1063,7 +1072,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, restart: rt = rcu_dereference(fn->leaf); if (!rt) { - rt = net->ipv6.ip6_null_entry; + rt = net->ipv6.fib6_null_entry; } else { rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); @@ -1071,7 +1080,7 @@ restart: rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, skb, flags); } - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; @@ -1820,7 +1829,7 @@ redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1837,7 +1846,8 @@ redo_rt6_select: if (rt_cache) rt = rt_cache; - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { + rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table, fl6); @@ -2412,13 +2422,13 @@ restart: } if (!rt) - rt = net->ipv6.ip6_null_entry; + rt = net->ipv6.fib6_null_entry; else if (rt->rt6i_flags & RTF_REJECT) { rt = net->ipv6.ip6_null_entry; goto out; } - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; @@ -3051,7 +3061,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; int err; - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } @@ -3081,7 +3091,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) struct fib6_table *table; int err = -ENOENT; - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->rt6i_table; spin_lock_bh(&table->tb6_lock); @@ -3634,7 +3644,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && - rt != net->ipv6.ip6_null_entry && + rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ @@ -3789,7 +3799,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { + if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { rt->fib6_nh.nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); @@ -3873,7 +3883,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; switch (arg->event) { @@ -4624,7 +4634,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct net *net = arg->net; - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { @@ -4813,6 +4823,8 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == 
NETDEV_REGISTER) { + net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; + net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4826,6 +4838,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ + in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5009,11 +5022,17 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; + net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry), + GFP_KERNEL); + if (!net->ipv6.fib6_null_entry) + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) - goto out_ip6_dst_entries; + goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -5060,6 +5079,8 @@ out_ip6_prohibit_entry: out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif +out_fib6_null_entry: + kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: @@ -5068,6 +5089,7 @@ out_ip6_dst_ops: static void __net_exit ip6_route_net_exit(struct net *net) { + kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); @@ -5138,6 +5160,8 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ + init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 3b6761d18bc11f2af2a6fc494e9026d39593f22c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:20 -0700 Subject: net/ipv6: Move dst flags to booleans in fib entries Continuing to wean FIB paths off of dst_entry, use a bool to hold requests for certain dst settings. Add a helper to convert the flags to DST flags when a FIB entry is converted to a dst_entry. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 5 ++++- net/ipv6/addrconf.c | 4 ++-- net/ipv6/route.c | 29 ++++++++++++++++++++++++----- 3 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c73b985734f5..159f651dee55 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,7 +177,10 @@ struct rt6_info { u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, - unused:6; + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; unsigned long expires; struct dst_metrics *fib6_metrics; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a156f1a0b1a7..c8df5c6499db 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1046,7 +1046,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, if (net->ipv6.devconf_all->disable_policy || idev->cnf.disable_policy) - rt->dst.flags |= DST_NOPOLICY; + rt->dst_nopolicy = true; neigh_parms_data_state_setall(idev->nd_parms); @@ -5981,7 +5981,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) int cpu; rcu_read_lock(); - addrconf_set_nopolicy(ifa->rt, val); + ifa->rt->dst_nopolicy = val ? true : false; if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e293692174ba..1a3e0db31b34 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -937,6 +937,20 @@ static int ip6_rt_type_to_error(u8 fib6_type) return fib6_prop[fib6_type]; } +static unsigned short fib6_info_dst_flags(struct rt6_info *rt) +{ + unsigned short flags = 0; + + if (rt->dst_nocount) + flags |= DST_NOCOUNT; + if (rt->dst_nopolicy) + flags |= DST_NOPOLICY; + if (rt->dst_host) + flags |= DST_HOST; + + return flags; +} + static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) { rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); @@ -961,6 +975,8 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) { + rt->dst.flags |= fib6_info_dst_flags(ort); + if (ort->rt6i_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; @@ -970,7 +986,6 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) rt->dst.output = ip6_output; if (ort->fib6_type == RTN_LOCAL) { - rt->dst.flags |= DST_HOST; rt->dst.input = ip6_input; } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; @@ -1058,10 +1073,11 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; - nrt = __ip6_dst_alloc(dev_net(dev), dev, 0); + nrt = __ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); @@ -1229,12 +1245,13 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; struct rt6_info *pcpu_rt; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags); + pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; @@ -2965,7 +2982,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, 
cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) - rt->dst.flags |= DST_HOST; + rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -3626,10 +3643,12 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, if (!rt) return ERR_PTR(-ENOMEM); + rt->dst_nocount = true; + in6_dev_hold(idev); rt->rt6i_idev = idev; - rt->dst.flags |= DST_HOST; + rt->dst_host = true; rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { -- cgit v1.2.3 From f8a1b43b709d8ef33a8de2f8f35095b4a4413713 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:21 -0700 Subject: net/ipv6: Create a neigh_lookup for FIB entries The router discovery code has a FIB entry and wants to validate the gateway has a neighbor entry. Refactor the existing dst_neigh_lookup for IPv6 and create a new function that takes the gateway and device and returns a neighbor entry. Use the new function in ndisc_router_discovery to validate the gateway. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +++ net/ipv6/ndisc.c | 8 ++++++-- net/ipv6/route.c | 33 ++++++++++++++++++++------------- 3 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 655e13017a45..cb6fb7e16a28 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,4 +279,7 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, struct sk_buff *skb, + const void *daddr); #endif diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e4d9eea92139..556717154fa3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1276,7 +1276,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", @@ -1304,7 +1306,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1a3e0db31b34..d635d71f7d51 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -182,12 +182,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static inline const void *choose_neigh_daddr(struct rt6_info *rt, +static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { - struct in6_addr *p = &rt->rt6i_gateway; - if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) @@ -195,18 +193,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt, return daddr; } -static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, + struct sk_buff *skb, + const void *daddr) { - struct 
rt6_info *rt = (struct rt6_info *) dst; struct neighbour *n; - daddr = choose_neigh_daddr(rt, skb, daddr); - n = __ipv6_neigh_lookup(dst->dev, daddr); + daddr = choose_neigh_daddr(gw, skb, daddr); + n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dst->dev); + return neigh_create(&nd_tbl, daddr, dev); +} + +static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + + return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) @@ -214,7 +221,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; struct rt6_info *rt = (struct rt6_info *)dst; - daddr = choose_neigh_daddr(rt, NULL, daddr); + daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) @@ -239,7 +246,7 @@ static struct dst_ops ip6_dst_ops_template = { .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; @@ -269,7 +276,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .update_pmtu = ip6_rt_blackhole_update_pmtu, .redirect = ip6_rt_blackhole_redirect, .cow_metrics = dst_cow_metrics_generic, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, }; static const u32 ip6_template_metrics[RTAX_MAX] = { -- cgit v1.2.3 From acb54e3cba404c20f07733f3222c0418a7724a5b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:22 -0700 Subject: net/ipv6: Add gfp_flags to route add functions Most FIB entries can be added using memory allocated with GFP_KERNEL. Add gfp_flags to ip6_route_add and addrconf_dst_alloc. Code paths that can be reached from the packet path (e.g., ndisc and autoconfig) or atomic notifiers use GFP_ATOMIC; paths from user context (adding addresses and routes) use GFP_KERNEL. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 6 ++++-- net/ipv6/addrconf.c | 39 +++++++++++++++++++++++---------------- net/ipv6/anycast.c | 2 +- net/ipv6/route.c | 18 ++++++++++-------- 4 files changed, 38 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index cb6fb7e16a28..ff70266e30d7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -100,7 +100,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); @@ -138,7 +139,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast); + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c8df5c6499db..915cd0734b27 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1037,7 +1037,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(net, idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -2320,7 +2320,7 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad static void addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, - unsigned long expires, u32 flags) + unsigned long expires, u32 flags, gfp_t gfp_flags) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_PREFIX, @@ -2345,7 +2345,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, cfg.fc_flags |= RTF_NONEXTHOP; #endif - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, gfp_flags, NULL); } @@ -2401,7 +2401,7 @@ static void addrconf_add_mroute(struct net_device *dev) ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) @@ -2685,7 +2685,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) expires = jiffies_to_clock_t(rt_expires); } addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, expires, flags); + dev, expires, flags, GFP_ATOMIC); } ip6_rt_put(rt); } @@ -2900,7 +2900,7 @@ static int inet6_addr_add(struct net *net, int ifindex, if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - expires, flags); + expires, flags, GFP_KERNEL); } /* Send a netlink notification if DAD is enabled and @@ -3053,7 +3053,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (addr.s6_addr32[3]) { add_addr(idev, &addr, plen, scope); - addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags); + addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags, + GFP_ATOMIC); return; } @@ -3078,7 +3079,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) add_addr(idev, &addr, plen, flag); addrconf_prefix_route(&addr, plen, idev->dev, 0, - pflags); + pflags, GFP_ATOMIC); } } } @@ -3118,7 +3119,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); if (!IS_ERR(ifp)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, + 0, 0, GFP_ATOMIC); addrconf_dad_start(ifp); in6_ifa_put(ifp); } @@ -3233,7 +3235,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_KERNEL); break; case IN6_ADDR_GEN_MODE_EUI64: /* addrconf_add_linklocal also adds a prefix_route and we @@ -3243,7 +3246,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_ATOMIC); break; case IN6_ADDR_GEN_MODE_NONE: default: @@ -3346,7 +3350,8 @@ static int fixup_permanent_addr(struct net *net, if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; - rt = addrconf_dst_alloc(net, idev, &ifp->addr, false); + rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, + GFP_ATOMIC); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3361,7 +3366,7 @@ static int fixup_permanent_addr(struct net *net, if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, - idev->dev, 0, 0); + idev->dev, 0, 0, GFP_ATOMIC); } if (ifp->state == INET6_IFADDR_STATE_PREDAD) @@ -4572,8 +4577,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, ipv6_ifa_notify(0, ifp); if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - expires, 
flags); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->idev->dev, expires, flags, + GFP_KERNEL); } else if (had_prefixroute) { enum cleanup_prefix_rt_t action; unsigned long rt_expires; @@ -5614,7 +5620,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) addrconf_prefix_route(&ifp->peer_addr, 128, - ifp->idev->dev, 0, 0); + ifp->idev->dev, 0, 0, + GFP_KERNEL); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 1122fb299b86..e456386fe4d5 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -267,7 +267,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - rt = addrconf_dst_alloc(net, idev, addr, true); + rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d635d71f7d51..56a854b426a6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2866,6 +2866,7 @@ out: } static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, + gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -3086,12 +3087,13 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct rt6_info *rt; int err; - rt = ip6_route_info_create(cfg, extack); + rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3418,7 +3420,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } @@ -3469,7 +3471,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, cfg.fc_gateway = *gwaddr; - if (!ip6_route_add(&cfg, NULL)) { + if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); @@ -3567,7 +3569,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&cfg, NULL); + err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); @@ -3640,7 +3642,7 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, - bool anycast) + bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; @@ -4293,7 +4295,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, } r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); - rt = ip6_route_info_create(&r_cfg, extack); + rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -4446,7 +4448,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else - return ip6_route_add(&cfg, extack); + return ip6_route_add(&cfg, GFP_KERNEL, extack); } static size_t rt6_nlmsg_size(struct rt6_info *rt) -- cgit v1.2.3 From 23fb93a4d3f118a900790066d03368a296dce0d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:23 -0700 Subject: net/ipv6: Cleanup exception and cache 
route handling

IPv6 FIB will only contain FIB entries, with exception routes attached
to the FIB entry. Once this transformation is complete, FIB lookups
will return a fib6_info while the lookup functions still return a
dst-based rt6_info. The current code uses rt6_info for both paths and
overloads the rt6_info variable usually called 'rt'. This patch
introduces a new 'f6i' variable name for the result of the FIB lookup
and keeps 'rt' as the dst-based return variable. 'f6i' becomes a
fib6_info in a later patch, which is why it is introduced as f6i now;
this avoids additional churn in the later patch.

In addition, remove the RTF_CACHE and dst checks from fib6 add and
delete, since they cannot happen now and will never happen after the
data type flip.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_route.h |   1 -
 net/ipv6/ip6_fib.c      |  16 +-----
 net/ipv6/route.c        | 142 +++++++++++++++++++++++++++---------------------
 3 files changed, 81 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index ff70266e30d7..686cdc7f356a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -106,7 +106,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt);
 int ip6_del_rt(struct net *net, struct rt6_info *rt);
 
 void rt6_flush_exceptions(struct rt6_info *rt);
-int rt6_remove_exception_rt(struct rt6_info *rt);
 void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
 			unsigned long now);
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 280b69497ad0..53df4a98a7f7 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1074,7 +1074,7 @@ add:
 static void fib6_start_gc(struct net *net, struct rt6_info *rt)
 {
 	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
-	    (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
+	    (rt->rt6i_flags & RTF_EXPIRES))
 		mod_timer(&net->ipv6.ip6_fib_timer,
 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
@@ -1125,8 +1125,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 	if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
 		return -EINVAL;
-	if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
-		return -EINVAL;
 
 	if (info->nlh) {
 		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1650,8 +1648,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
 	RT6_TRACE("fib6_del_route\n");
 
-	WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
-
 	/* Unlink it */
 	*rtp = rt->rt6_next;
 	rt->rt6i_node = NULL;
@@ -1720,21 +1716,11 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 	struct rt6_info __rcu **rtp;
 	struct rt6_info __rcu **rtp_next;
 
-#if RT6_DEBUG >= 2
-	if (rt->dst.obsolete > 0) {
-		WARN_ON(fn);
-		return -ENOENT;
-	}
-#endif
 	if (!fn || rt == net->ipv6.fib6_null_entry)
 		return -ENOENT;
 
 	WARN_ON(!(fn->fn_flags & RTN_RTINFO));
 
-	/* remove cached dst from exception table */
-	if (rt->rt6i_flags & RTF_CACHE)
-		return rt6_remove_exception_rt(rt);
-
 	/*
 	 *	Walk the leaf entries looking for ourself
 	 */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 56a854b426a6..ad9eaecf539c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1013,8 +1013,8 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 	BUG_ON(from->from);
 
 	rt->rt6i_flags &= ~RTF_EXPIRES;
-	dst_hold(&from->dst);
-	rt->from = from;
+	if (dst_hold_safe(&from->dst))
+		rt->from = from;
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
@@ -1097,8 +1097,9 @@ static struct rt6_info
*ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *f6i; struct fib6_node *fn; + struct rt6_info *rt; if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) flags &= ~RT6_LOOKUP_F_IFACE; @@ -1106,36 +1107,36 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = rcu_dereference(fn->leaf); - if (!rt) { - rt = net->ipv6.fib6_null_entry; + f6i = rcu_dereference(fn->leaf); + if (!f6i) { + f6i = net->ipv6.fib6_null_entry; } else { - rt = rt6_device_match(net, rt, &fl6->saddr, + f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, - skb, flags); + if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0) + f6i = rt6_multipath_select(net, f6i, fl6, + fl6->flowi6_oif, skb, flags); } - if (rt == net->ipv6.fib6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } + /* Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) { - rt = rt_cache; + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - } else if (dst_hold_safe(&rt->dst)) { - struct rt6_info *nrt; - - nrt = ip6_create_rt_rcu(rt); - dst_release(&rt->dst); - rt = nrt; - } else { + } else if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); + } else { + rt = ip6_create_rt_rcu(f6i); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } } rcu_read_unlock(); @@ -1218,9 +1219,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, * Clone the route. 
*/ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); rt = __ip6_dst_alloc(dev_net(dev), dev, 0); @@ -1446,11 +1444,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_exception *rt6_ex; int err = 0; - /* ort can't be a cache or pcpu route */ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); - spin_lock_bh(&rt6_exception_lock); if (ort->exception_bucket_flushed) { @@ -1589,7 +1582,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, } /* Remove the passed in cached rt from the hash table that contains it */ -int rt6_remove_exception_rt(struct rt6_info *rt) +static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; @@ -1854,7 +1847,8 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *rt_cache; + struct rt6_info *f6i; + struct rt6_info *rt; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1871,10 +1865,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(net, fn, oif, strict); - if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); - if (rt == net->ipv6.fib6_null_entry) { + f6i = rt6_select(net, fn, oif, strict); + if (f6i->rt6i_nsiblings) + f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1886,18 +1880,17 @@ redo_rt6_select: } } - /*Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) - rt = rt_cache; - - if (rt == net->ipv6.fib6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table, fl6); return rt; - } else if (rt->rt6i_flags & RTF_CACHE) { + } + + /*Search through exception table */ + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); @@ -1905,7 +1898,7 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(rt->rt6i_flags & RTF_GATEWAY))) { + !(f6i->rt6i_flags & RTF_GATEWAY))) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. 
It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
@@ -1914,16 +1907,16 @@ redo_rt6_select:
 		struct rt6_info *uncached_rt;
 
-		if (ip6_hold_safe(net, &rt, true)) {
-			dst_use_noref(&rt->dst, jiffies);
+		if (ip6_hold_safe(net, &f6i, true)) {
+			dst_use_noref(&f6i->dst, jiffies);
 		} else {
 			rcu_read_unlock();
-			uncached_rt = rt;
+			uncached_rt = f6i;
 			goto uncached_rt_out;
 		}
 		rcu_read_unlock();
 
-		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
 		dst_release(&rt->dst);
 
 		if (uncached_rt) {
@@ -1946,18 +1939,18 @@ uncached_rt_out:
 		struct rt6_info *pcpu_rt;
 
-		dst_use_noref(&rt->dst, jiffies);
+		dst_use_noref(&f6i->dst, jiffies);
 		local_bh_disable();
-		pcpu_rt = rt6_get_pcpu_route(rt);
+		pcpu_rt = rt6_get_pcpu_route(f6i);
 
 		if (!pcpu_rt) {
 			/* atomic_inc_not_zero() is needed when using rcu */
-			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
+			if (atomic_inc_not_zero(&f6i->rt6i_ref)) {
 				/* No dst_hold() on rt is needed because grabbing
 				 * rt->rt6i_ref makes sure rt can't be released.
 				 */
-				pcpu_rt = rt6_make_pcpu_route(net, rt);
-				rt6_release(rt);
+				pcpu_rt = rt6_make_pcpu_route(net, f6i);
+				rt6_release(f6i);
 			} else {
 				/* rt is already removed from tree */
 				pcpu_rt = net->ipv6.ip6_null_entry;
@@ -2419,7 +2412,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 					     int flags)
 {
 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
-	struct rt6_info *rt, *rt_cache;
+	struct rt6_info *ret = NULL, *rt_cache;
+	struct rt6_info *rt;
 	struct fib6_node *fn;
 
 	/* Get the "current" route for this destination and
@@ -2458,7 +2452,7 @@ restart:
 			if (rt_cache &&
 			    ipv6_addr_equal(&rdfl->gateway,
 					    &rt_cache->rt6i_gateway)) {
-				rt = rt_cache;
+				ret = rt_cache;
 				break;
 			}
 			continue;
@@ -2469,7 +2463,7 @@ restart:
 	if (!rt)
 		rt = net->ipv6.fib6_null_entry;
 	else if (rt->rt6i_flags & RTF_REJECT) {
-		rt = net->ipv6.ip6_null_entry;
+		ret = net->ipv6.ip6_null_entry;
 		goto out;
 	}
 
@@ -2480,12 +2474,15 @@ restart:
 	}
 
 out:
-	ip6_hold_safe(net, &rt, true);
+	if (ret)
+		dst_hold(&ret->dst);
+	else
+		ret = ip6_create_rt_rcu(rt);
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-	return rt;
+	trace_fib6_table_lookup(net, ret, table, fl6);
+	return ret;
 };
 
 static struct dst_entry *ip6_route_redirect(struct net *net,
@@ -3182,6 +3179,22 @@ out_put:
 	return err;
 }
 
+static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+{
+	int rc = -ESRCH;
+
+	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
+		goto out;
+
+	if (cfg->fc_flags & RTF_GATEWAY &&
+	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+		goto out;
+	if (dst_hold_safe(&rt->dst))
+		rc = rt6_remove_exception_rt(rt);
+out:
+	return rc;
+}
+
 static int ip6_route_del(struct fib6_config *cfg,
 			 struct netlink_ext_ack *extack)
 {
@@ -3206,11 +3219,16 @@ static int ip6_route_del(struct fib6_config *cfg,
 	if (fn) {
 		for_each_fib6_node_rt_rcu(fn) {
 			if (cfg->fc_flags & RTF_CACHE) {
+				int rc;
+
 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
 							      &cfg->fc_src);
-				if (!rt_cache)
-					continue;
-				rt = rt_cache;
+				if (rt_cache) {
+					rc = ip6_del_cached_rt(rt_cache, cfg);
+					if (rc != -ESRCH)
+						return rc;
+				}
+				continue;
 			}
 			if (cfg->fc_ifindex &&
 			    (!rt->fib6_nh.nh_dev ||
@@ -3327,7 +3345,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 		     NEIGH_UPDATE_F_ISROUTER)),
 		     NDISC_REDIRECT, &ndopts);
 
-	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
+	nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
 	if (!nrt)
 		goto out;
-- 
cgit v1.2.3
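
The lookup rework in the patch above is easier to follow outside of diff
context. After the change, ip6_pol_route_lookup() resolves a FIB entry
(f6i) first, then consults the exception table for a cached dst-based
clone, and only creates a fresh rt6_info under RCU when no cached entry
exists; the null entry is detected before any dst work happens. Below is a
minimal, self-contained userspace sketch of that control flow. Only the
names fib6_info, rt6_info, rt6_find_cached_rt and ip6_create_rt_rcu are
modeled on the patch; the types, the helper bodies and the _sketch suffix
are simplified stand-ins for illustration, not kernel code.

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the two objects the patch separates: the FIB entry
 * (routing table data) and the dst-based clone handed to callers.
 */
struct fib6_info {
	const char *name;
	int is_null;
};

struct rt6_info {
	struct fib6_info *from;	/* clone points back at its FIB entry */
	int refcnt;
};

static struct fib6_info fib6_null_entry = { "fib6_null_entry", 1 };

/* Hypothetical: resolve a FIB entry; may return the null entry. */
static struct fib6_info *fib6_lookup_sketch(int have_route)
{
	static struct fib6_info route = { "fib entry", 0 };

	return have_route ? &route : &fib6_null_entry;
}

/* Hypothetical: search the exception (cache) table for a clone. */
static struct rt6_info *rt6_find_cached_rt_sketch(struct fib6_info *f6i)
{
	(void)f6i;
	return NULL;	/* model a cache miss */
}

/* Hypothetical: create a dst clone of the FIB entry (done under RCU
 * in the kernel).
 */
static struct rt6_info *ip6_create_rt_rcu_sketch(struct fib6_info *f6i)
{
	struct rt6_info *rt = calloc(1, sizeof(*rt));

	if (rt) {
		rt->from = f6i;
		rt->refcnt = 1;
	}
	return rt;
}

/* The reordered lookup: FIB entry, then cache, then a new clone. */
static struct rt6_info *lookup_sketch(int have_route)
{
	struct fib6_info *f6i = fib6_lookup_sketch(have_route);
	struct rt6_info *rt;

	rt = rt6_find_cached_rt_sketch(f6i);
	if (rt)
		return rt;		/* cached clone, reuse it */

	if (f6i->is_null)
		return NULL;		/* kernel returns ip6_null_entry */

	return ip6_create_rt_rcu_sketch(f6i);	/* fresh clone */
}

int main(void)
{
	struct rt6_info *rt = lookup_sketch(1);

	if (rt) {
		printf("clone of %s\n", rt->from->name);
		free(rt);
	}
	return 0;
}

The point of the ordering is that the fib6_null_entry case is decided
before any clone is allocated, which is what lets the backtracking loops
(restart / redo_rt6_select) above operate purely on FIB entries.
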
From a64efe142f5e70b7e39276a414bbb3b96691c608 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:24 -0700
Subject: net/ipv6: introduce fib6_info struct and helpers

Add fib6_info struct and alloc, destroy, hold and release helpers.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_fib.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_fib.c    | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 159f651dee55..630392ae12d8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -38,6 +38,7 @@
 #endif
 
 struct rt6_info;
+struct fib6_info;
 
 struct fib6_config {
 	u32		fc_table;
@@ -132,6 +133,46 @@ struct fib6_nh {
 	int			nh_weight;
 };
 
+struct fib6_info {
+	struct fib6_table		*rt6i_table;
+	struct fib6_info __rcu		*rt6_next;
+	struct fib6_node __rcu		*rt6i_node;
+
+	/* Multipath routes:
+	 * siblings is a list of fib6_info that have the same metric/weight,
+	 * destination, but not the same gateway. nsiblings is just a cache
+	 * to speed up lookup.
+	 */
+	struct list_head		rt6i_siblings;
+	unsigned int			rt6i_nsiblings;
+
+	atomic_t			rt6i_ref;
+	struct inet6_dev		*rt6i_idev;
+	unsigned long			expires;
+	struct dst_metrics		*fib6_metrics;
+#define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
+
+	struct rt6key			rt6i_dst;
+	u32				rt6i_flags;
+	struct rt6key			rt6i_src;
+	struct rt6key			rt6i_prefsrc;
+
+	struct rt6_info * __percpu	*rt6i_pcpu;
+	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
+
+	u32				rt6i_metric;
+	u8				rt6i_protocol;
+	u8				fib6_type;
+	u8				exception_bucket_flushed:1,
+					should_flush:1,
+					dst_nocount:1,
+					dst_nopolicy:1,
+					dst_host:1,
+					unused:3;
+
+	struct fib6_nh			fib6_nh;
+};
+
 struct rt6_info {
 	struct dst_entry		dst;
 	struct rt6_info __rcu		*rt6_next;
@@ -291,6 +332,20 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 
 void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
 
+struct rt6_info *fib6_info_alloc(gfp_t gfp_flags);
+void fib6_info_destroy(struct rt6_info *f6i);
+
+static inline void fib6_info_hold(struct rt6_info *f6i)
+{
+	atomic_inc(&f6i->rt6i_ref);
+}
+
+static inline void fib6_info_release(struct rt6_info *f6i)
+{
+	if (f6i && atomic_dec_and_test(&f6i->rt6i_ref))
+		fib6_info_destroy(f6i);
+}
+
 static inline void rt6_hold(struct rt6_info *rt)
 {
 	atomic_inc(&rt->rt6i_ref);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 53df4a98a7f7..d07578d84db0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -145,6 +145,66 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
 	       addr[fn_bit >> 5];
 }
 
+struct rt6_info *fib6_info_alloc(gfp_t gfp_flags)
+{
+	struct rt6_info *f6i;
+
+	f6i = kzalloc(sizeof(*f6i), gfp_flags);
+	if (!f6i)
+		return NULL;
+
+	f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+	if (!f6i->rt6i_pcpu) {
+		kfree(f6i);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&f6i->rt6i_siblings);
+	f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
+
+	atomic_inc(&f6i->rt6i_ref);
+
+	return f6i;
+}
+
+void fib6_info_destroy(struct rt6_info *f6i)
+{
+	struct rt6_exception_bucket *bucket;
+
+	WARN_ON(f6i->rt6i_node);
+
+	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
+	if (bucket) {
+		f6i->rt6i_exception_bucket = NULL;
+		kfree(bucket);
+	}
+
+	if (f6i->rt6i_pcpu) {
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct rt6_info **ppcpu_rt;
+			struct rt6_info *pcpu_rt;
+
+			ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+			pcpu_rt = *ppcpu_rt;
+			if (pcpu_rt) {
+				dst_dev_put(&pcpu_rt->dst);
+				dst_release(&pcpu_rt->dst);
+				*ppcpu_rt = NULL;
+			}
+		}
+	}
+
+	if (f6i->rt6i_idev)
+		in6_dev_put(f6i->rt6i_idev);
+	if (f6i->fib6_nh.nh_dev)
+		dev_put(f6i->fib6_nh.nh_dev);
+
+	kfree(f6i);
+}
+EXPORT_SYMBOL_GPL(fib6_info_destroy);
+
 static struct fib6_node *node_alloc(struct net *net)
 {
 	struct fib6_node *fn;
-- 
cgit v1.2.3

From 93531c6743157d7e8c5792f8ed1a57641149d62c Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:25 -0700
Subject: net/ipv6: separate handling of FIB entries from dst based routes

Last step before flipping the data type for FIB entries:
- use fib6_info_alloc to create FIB entries in ip6_route_info_create
  and addrconf_dst_alloc
- use fib6_info_release in place of dst_release, ip6_rt_put and
  rt6_release
- remove the dst_hold before calling __ip6_ins_rt or ip6_del_rt
- when purging routes, drop per-cpu routes
- replace inc and dec of rt6i_ref with fib6_info_hold and
  fib6_info_release
- use rt->from since it points to the FIB entry
- drop references to exception bucket, fib6_metrics and per-cpu from
  dst entries (those are relevant for fib entries only)

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_fib.h   |   4 +-
 include/net/ip6_route.h |   3 +-
 net/ipv6/addrconf.c     |  18 +++--
 net/ipv6/anycast.c      |   7 +-
 net/ipv6/ip6_fib.c      |  55 ++++++++++------
 net/ipv6/ip6_output.c   |   3 +-
 net/ipv6/ndisc.c        |   6 +-
 net/ipv6/route.c        | 171 +++++++++++++++++------------------------------
 8 files changed, 115 insertions(+), 152 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 630392ae12d8..6c3d92bb3459 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -314,9 +314,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 	if (rt->rt6i_flags & RTF_PCPU ||
 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
-		rt = rt->from;
-
-	rt6_get_cookie_safe(rt, &cookie);
+		rt6_get_cookie_safe(rt->from, &cookie);
 
 	return cookie;
 }
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 686cdc7f356a..57d0d45667f1 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -114,8 +114,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
 					 unsigned int prefs,
 					 struct in6_addr *saddr)
 {
-	struct inet6_dev *idev =
-			rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
+	struct inet6_dev *idev = rt ?
rt->rt6i_idev : NULL; int err = 0; if (rt && rt->rt6i_prefsrc.plen) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 915cd0734b27..e533a447f68c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -916,7 +916,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) pr_warn("Freeing alive inet6 address %p\n", ifp); return; } - ip6_rt_put(ifp->rt); kfree_rcu(ifp, rcu); } @@ -1102,8 +1101,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: if (unlikely(err < 0)) { - if (rt) - ip6_rt_put(rt); + fib6_info_release(rt); + if (ifa) { if (ifa->idev) in6_dev_put(ifa->idev); @@ -1191,7 +1190,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r else { if (!(rt->rt6i_flags & RTF_EXPIRES)) fib6_set_expires(rt, expires); - ip6_rt_put(rt); + fib6_info_release(rt); } } } @@ -2375,8 +2374,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & noflags) != 0) continue; - if (!dst_hold_safe(&rt->dst)) - rt = NULL; + fib6_info_hold(rt); break; } out: @@ -2687,7 +2685,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, dev, expires, flags, GFP_ATOMIC); } - ip6_rt_put(rt); + fib6_info_release(rt); } /* Try to figure out our local address for this prefix */ @@ -3361,7 +3359,7 @@ static int fixup_permanent_addr(struct net *net, ifp->rt = rt; spin_unlock(&ifp->lock); - ip6_rt_put(prev); + fib6_info_release(prev); } if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { @@ -5636,8 +5634,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_del_rt(net, rt); } if (ifp->rt) { - if (dst_hold_safe(&ifp->rt->dst)) - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt); + ifp->rt = NULL; } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index e456386fe4d5..3db8fe10322b 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -213,7 +213,7 @@ static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { in6_dev_put(ac->aca_idev); - dst_release(&ac->aca_rt->dst); + fib6_info_release(ac->aca_rt); kfree(ac); } } @@ -231,6 +231,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, aca->aca_addr = *addr; in6_dev_hold(idev); aca->aca_idev = idev; + fib6_info_hold(rt); aca->aca_rt = rt; aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ @@ -274,7 +275,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } aca = aca_alloc(rt, addr); if (!aca) { - ip6_rt_put(rt); + fib6_info_release(rt); err = -ENOMEM; goto out; } @@ -330,7 +331,6 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); @@ -358,7 +358,6 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d07578d84db0..4d6bd033dccd 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -170,6 +170,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy(struct rt6_info *f6i) { struct rt6_exception_bucket *bucket; + struct dst_metrics *m; WARN_ON(f6i->rt6i_node); @@ -201,6 +202,10 @@ 
void fib6_info_destroy(struct rt6_info *f6i) if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); + m = f6i->fib6_metrics; + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) + kfree(m); + kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy); @@ -714,7 +719,7 @@ static struct fib6_node *fib6_add_1(struct net *net, /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); - rt6_release(leaf); + fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == @@ -898,12 +903,32 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); - rt6_release(rt); + fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } + + if (rt->rt6i_pcpu) { + int cpu; + + /* release the reference to this fib entry from + * all of its cached pcpu routes + */ + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(rt->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + fib6_info_release(pcpu_rt->from); + pcpu_rt->from = NULL; + } + } + } } } @@ -1099,7 +1124,7 @@ add: fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1115,7 +1140,7 @@ add: fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1183,9 +1208,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); - if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) - return -EINVAL; - if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1300,7 +1322,7 @@ out: if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); + fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, @@ -1312,7 +1334,7 @@ out: info->nl_net->ipv6.fib6_null_entry; } #endif - atomic_inc(&pn_leaf->rt6i_ref); + fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } @@ -1334,10 +1356,6 @@ failure: (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); - /* Always release dst as dst->__refcnt is guaranteed - * to be taken before entering this function - */ - dst_release_immediate(&rt->dst); return err; } @@ -1637,7 +1655,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, new_fn_leaf = net->ipv6.fib6_null_entry; } #endif - atomic_inc(&new_fn_leaf->rt6i_ref); + fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } @@ -1693,7 +1711,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, return pn; RCU_INIT_POINTER(pn->leaf, NULL); - rt6_release(pn_leaf); + fib6_info_release(pn_leaf); fn = pn; } } @@ -1763,7 +1781,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, 
info, 0); - rt6_release(rt); + fib6_info_release(rt); } /* Need to own table->tb6_lock */ @@ -2261,9 +2279,8 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), - rt->dst.__use, rt->rt6i_flags, - dev ? dev->name : ""); + rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0, + rt->rt6i_flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a39b04f9fa6e..3db47986ef38 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -968,7 +968,8 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, if (!had_dst) *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; - err = ip6_route_get_saddr(net, rt, &fl6->daddr, + err = ip6_route_get_saddr(net, rt ? rt->from : NULL, + &fl6->daddr, sk ? inet6_sk(sk)->srcprefs : 0, &fl6->saddr); if (err) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 556717154fa3..a28857088bff 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1283,7 +1283,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } } @@ -1313,7 +1313,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } neigh->flags |= NTF_ROUTER; @@ -1499,7 +1499,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: - ip6_rt_put(rt); + fib6_info_release(rt); if (neigh) neigh_release(neigh); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad9eaecf539c..0d861bd07673 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -351,13 +351,11 @@ static void rt6_info_init(struct rt6_info *rt) memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; } /* allocate dst with ip6_dst_ops */ -static struct rt6_info *__ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, + int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); @@ -369,35 +367,15 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } - -struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) -{ - struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); - - if (rt) { - rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (!rt->rt6i_pcpu) { - dst_release_immediate(&rt->dst); - return NULL; - } - } - - return rt; -} EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; struct inet6_dev *idev; - struct dst_metrics *m; dst_destroy_metrics_generic(dst); - free_percpu(rt->rt6i_pcpu); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; @@ -405,18 +383,9 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); - if (bucket) { - rt->rt6i_exception_bucket = NULL; - kfree(bucket); - } - - m = rt->fib6_metrics; - if (m != 
&dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); rt->from = NULL; - dst_release(&from->dst); + fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -891,7 +860,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, else fib6_set_expires(rt, jiffies + HZ * lifetime); - ip6_rt_put(rt); + fib6_info_release(rt); } return 0; } @@ -1010,11 +979,9 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->from); - rt->rt6i_flags &= ~RTF_EXPIRES; - if (dst_hold_safe(&from->dst)) - rt->from = from; + fib6_info_hold(from); + rt->from = from; dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; @@ -1084,7 +1051,7 @@ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; - nrt = __ip6_dst_alloc(dev_net(dev), dev, flags); + nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); @@ -1203,8 +1170,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt) { struct nl_info info = { .nl_net = net, }; - /* Hold dst to account for the reference from the fib6 tree */ - dst_hold(&rt->dst); return __ip6_ins_rt(rt, &info, NULL); } @@ -1221,7 +1186,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); - rt = __ip6_dst_alloc(dev_net(dev), dev, 0); + rt = ip6_dst_alloc(dev_net(dev), dev, 0); rcu_read_unlock(); if (!rt) return NULL; @@ -1256,7 +1221,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags); + pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; @@ -1317,7 +1282,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, net = dev_net(rt6_ex->rt6i->dst.dev); rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); - rt6_release(rt6_ex->rt6i); + ip6_rt_put(rt6_ex->rt6i); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; @@ -1907,17 +1872,11 @@ redo_rt6_select: struct rt6_info *uncached_rt; - if (ip6_hold_safe(net, &f6i, true)) { - dst_use_noref(&f6i->dst, jiffies); - } else { - rcu_read_unlock(); - uncached_rt = f6i; - goto uncached_rt_out; - } + fib6_info_hold(f6i); rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); - dst_release(&rt->dst); + fib6_info_release(f6i); if (uncached_rt) { /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() @@ -1930,7 +1889,6 @@ redo_rt6_select: dst_hold(&uncached_rt->dst); } -uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; @@ -1939,24 +1897,12 @@ uncached_rt_out: struct rt6_info *pcpu_rt; - dst_use_noref(&f6i->dst, jiffies); local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(f6i); - if (!pcpu_rt) { - /* atomic_inc_not_zero() is needed when using rcu */ - if (atomic_inc_not_zero(&f6i->rt6i_ref)) { - /* No dst_hold() on rt is needed because grabbing - * rt->rt6i_ref makes sure rt can't be released. 
- */ - pcpu_rt = rt6_make_pcpu_route(net, f6i); - rt6_release(f6i); - } else { - /* rt is already removed from tree */ - pcpu_rt = net->ipv6.ip6_null_entry; - dst_hold(&pcpu_rt->dst); - } - } + if (!pcpu_rt) + pcpu_rt = rt6_make_pcpu_route(net, f6i); + local_bh_enable(); rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table, fl6); @@ -2193,11 +2139,26 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ +static bool fib6_check(struct rt6_info *f6i, u32 cookie) +{ + u32 rt_cookie = 0; + + if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) || + rt_cookie != cookie) + return false; + + if (fib6_check_expired(f6i)) + return false; + + return true; +} + static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; - if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) + if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) || + rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -2210,7 +2171,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check(rt->from, cookie)) + fib6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@ -2241,7 +2202,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (rt6_check_expired(rt)) { - ip6_del_rt(dev_net(dst->dev), rt); + rt6_remove_exception_rt(rt); dst = NULL; } } else { @@ -2262,12 +2223,12 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) - ip6_del_rt(dev_net(rt->dst.dev), rt); - } else { + rt6_remove_exception_rt(rt); + } else if (rt->from) { struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(rt->from->rt6i_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) fn->fn_sernum = -1; rcu_read_unlock(); @@ -2949,13 +2910,13 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, - (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT); - - if (!rt) { - err = -ENOMEM; + err = -ENOMEM; + rt = fib6_info_alloc(gfp_flags); + if (!rt) goto out; - } + + if (cfg->fc_flags & RTF_ADDRCONF) + rt->dst_nocount = true; err = ip6_convert_metrics(net, rt, cfg); if (err < 0) @@ -3029,7 +2990,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; - rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway; + rt->fib6_nh.nh_gw = cfg->fc_gateway; } err = -ENODEV; @@ -3066,7 +3027,7 @@ install_route: !netif_carrier_ok(dev)) rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->fib6_nh.nh_dev = rt->dst.dev = dev; + rt->fib6_nh.nh_dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3078,9 +3039,8 @@ out: dev_put(dev); if (idev) in6_dev_put(idev); - if (rt) - dst_release_immediate(&rt->dst); + fib6_info_release(rt); return ERR_PTR(err); } @@ -3095,6 +3055,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, return PTR_ERR(rt); err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); + fib6_info_release(rt); return err; } @@ -3116,7 +3077,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) spin_unlock_bh(&table->tb6_lock); out: - ip6_rt_put(rt); + fib6_info_release(rt); return err; } @@ -3170,7 +3131,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: - ip6_rt_put(rt); + fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, @@ -3241,8 +3202,7 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - if (!dst_hold_safe(&rt->dst)) - break; + fib6_info_hold(rt); rcu_read_unlock(); /* if gateway was specified only delete the one hop */ @@ -3510,12 +3470,9 @@ restart: for_each_fib6_node_rt_rcu(&table->tb6_root) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { - if (dst_hold_safe(&rt->dst)) { - rcu_read_unlock(); - ip6_del_rt(net, rt); - } else { - rcu_read_unlock(); - } + fib6_info_hold(rt); + rcu_read_unlock(); + ip6_del_rt(net, rt); goto restart; } } @@ -3666,7 +3623,7 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, struct net_device *dev = idev->dev; struct rt6_info *rt; - rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); + rt = fib6_info_alloc(gfp_flags); if (!rt) return ERR_PTR(-ENOMEM); @@ -3687,8 +3644,8 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, } rt->fib6_nh.nh_gw = *addr; + dev_hold(dev); rt->fib6_nh.nh_dev = dev; - rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; @@ -4325,7 +4282,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); if (err) { - dst_release_immediate(&rt->dst); + fib6_info_release(rt); goto cleanup; } @@ -4342,6 +4299,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, list_for_each_entry(nh, &rt6_nh_list, next) { rt_last = nh->rt6_info; err = __ip6_ins_rt(nh->rt6_info, info, extack); + fib6_info_release(nh->rt6_info); + /* save reference to first route for notification */ if (!rt_notif && !err) rt_notif = nh->rt6_info; @@ -4389,7 +4348,7 @@ add_errout: cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) - dst_release_immediate(&nh->rt6_info->dst); + fib6_info_release(nh->rt6_info); list_del(&nh->next); kfree(nh); } @@ -4814,14 +4773,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } - if (fibmatch && rt->from) { - struct rt6_info *ort = rt->from; - - dst_hold(&ort->dst); - ip6_rt_put(rt); - rt = ort; - } - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); @@ -4831,12 +4782,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_dst_set(skb, &rt->dst); if (fibmatch) - err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif, + err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr, - iif, RTM_NEWROUTE, + err = rt6_fill_node(net, skb, rt->from, dst, + &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); if (err < 0) { -- cgit v1.2.3 From 8d1c802b2815edc97af8a58c5045ebaf3848621a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:26 -0700 Subject: net/ipv6: Flip FIB entries to fib6_info Convert all code paths referencing a FIB entry from rt6_info to fib6_info. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 64 ++--- include/net/if_inet6.h | 4 +- include/net/ip6_fib.h | 42 ++-- include/net/ip6_route.h | 28 +-- include/net/netns/ipv6.h | 2 +- net/ipv6/addrconf.c | 20 +- net/ipv6/anycast.c | 4 +- net/ipv6/ip6_fib.c | 116 ++++----- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 259 +++++++++++---------- 10 files changed, 271 insertions(+), 270 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index d995a0b52d7c..b85b15851841 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -442,7 +442,7 @@ struct mlxsw_sp_fib6_entry { struct mlxsw_sp_rt6 { struct list_head list; - struct rt6_info *rt; + struct fib6_info *rt; }; struct mlxsw_sp_lpm_tree { @@ -3834,7 +3834,7 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, for (i = 0; i < nh_grp->count; i++) { struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev && ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr, @@ -3920,7 +3920,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, common); list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } @@ -4699,7 +4699,7 @@ static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); } -static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt) +static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt) { /* Packets with link-local destination IP arriving to the router * are trapped to the CPU, so no need to program specific routes @@ -4721,7 +4721,7 @@ static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt) return false; } -static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt) +static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; @@ -4734,18 +4734,18 @@ static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt) * memory. 
*/ mlxsw_sp_rt6->rt = rt; - rt6_hold(rt); + fib6_info_hold(rt); return mlxsw_sp_rt6; } #if IS_ENABLED(CONFIG_IPV6) -static void mlxsw_sp_rt6_release(struct rt6_info *rt) +static void mlxsw_sp_rt6_release(struct fib6_info *rt) { - rt6_release(rt); + fib6_info_release(rt); } #else -static void mlxsw_sp_rt6_release(struct rt6_info *rt) +static void mlxsw_sp_rt6_release(struct fib6_info *rt) { } #endif @@ -4756,13 +4756,13 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) kfree(mlxsw_sp_rt6); } -static bool mlxsw_sp_fib6_rt_can_mp(const struct rt6_info *rt) +static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) { /* RTF_CACHE routes are ignored */ return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; } -static struct rt6_info * +static struct fib6_info * mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) { return list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, @@ -4771,7 +4771,7 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct rt6_info *nrt, bool replace) + const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; @@ -4779,7 +4779,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { - struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); /* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same * virtual router. @@ -4802,7 +4802,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, static struct mlxsw_sp_rt6 * mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry, - const struct rt6_info *rt) + const struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; @@ -4815,7 +4815,7 @@ mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry, } static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, - const struct rt6_info *rt, + const struct fib6_info *rt, enum mlxsw_sp_ipip_type *ret) { return rt->fib6_nh.nh_dev && @@ -4825,7 +4825,7 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh, - const struct rt6_info *rt) + const struct fib6_info *rt) { const struct mlxsw_sp_ipip_ops *ipip_ops; struct mlxsw_sp_ipip_entry *ipip_entry; @@ -4870,7 +4870,7 @@ static void mlxsw_sp_nexthop6_type_fini(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh, - const struct rt6_info *rt) + const struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh.nh_dev; @@ -4897,7 +4897,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, } static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, - const struct rt6_info *rt) + const struct fib6_info *rt) { return rt->rt6i_flags & RTF_GATEWAY || mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL); @@ -4928,7 +4928,7 @@ mlxsw_sp_nexthop6_group_create(struct mlxsw_sp *mlxsw_sp, nh_grp->gateway = mlxsw_sp_rt6_is_gateway(mlxsw_sp, mlxsw_sp_rt6->rt); nh_grp->count = fib6_entry->nrt6; for (i = 0; i < nh_grp->count; i++) { - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; nh = 
&nh_grp->nexthops[i]; err = mlxsw_sp_nexthop6_init(mlxsw_sp, nh_grp, nh, rt); @@ -5040,7 +5040,7 @@ err_nexthop6_group_get: static int mlxsw_sp_fib6_entry_nexthop_add(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib6_entry *fib6_entry, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; int err; @@ -5068,7 +5068,7 @@ err_nexthop6_group_update: static void mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib6_entry *fib6_entry, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; @@ -5084,7 +5084,7 @@ mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp, static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib_entry *fib_entry, - const struct rt6_info *rt) + const struct fib6_info *rt) { /* Packets hitting RTF_REJECT routes need to be discarded by the * stack. We can rely on their destination device not having a @@ -5118,7 +5118,7 @@ mlxsw_sp_fib6_entry_rt_destroy_all(struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_entry_create(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib_node *fib_node, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_entry *fib_entry; @@ -5168,12 +5168,12 @@ static void mlxsw_sp_fib6_entry_destroy(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct rt6_info *nrt, bool replace) + const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { - struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) continue; @@ -5198,7 +5198,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry, bool replace) { struct mlxsw_sp_fib_node *fib_node = new6_entry->common.fib_node; - struct rt6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry); + struct fib6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry); struct mlxsw_sp_fib6_entry *fib6_entry; fib6_entry = mlxsw_sp_fib6_node_entry_find(fib_node, nrt, replace); @@ -5213,7 +5213,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry, struct mlxsw_sp_fib6_entry *last; list_for_each_entry(last, &fib_node->entry_list, common.list) { - struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(last); + struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last); if (nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id) break; @@ -5268,7 +5268,7 @@ mlxsw_sp_fib6_node_entry_unlink(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, - const struct rt6_info *rt) + const struct fib6_info *rt) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5287,7 +5287,7 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { - struct rt6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry); if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id && rt->rt6i_metric == iter_rt->rt6i_metric && @@ -5316,7 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, - struct rt6_info *rt, bool 
replace) + struct fib6_info *rt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5373,7 +5373,7 @@ err_fib6_entry_nexthop_add: } static void mlxsw_sp_router_fib6_del(struct mlxsw_sp *mlxsw_sp, - struct rt6_info *rt) + struct fib6_info *rt) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5836,7 +5836,7 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work, fen6_info = container_of(info, struct fib6_entry_notifier_info, info); fib_work->fen6_info = *fen6_info; - rt6_hold(fib_work->fen6_info.rt); + fib6_info_hold(fib_work->fen6_info.rt); break; } } diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d4088d1a688d..d6089b2e64fe 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -64,7 +64,7 @@ struct inet6_ifaddr { struct delayed_work dad_work; struct inet6_dev *idev; - struct rt6_info *rt; + struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; @@ -144,7 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; struct inet6_dev *aca_idev; - struct rt6_info *aca_rt; + struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; refcount_t aca_refcnt; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6c3d92bb3459..d41b7bd69fb3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -75,12 +75,12 @@ struct fib6_node { #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif - struct rt6_info __rcu *leaf; + struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info __rcu *rr_ptr; + struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -176,7 +176,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; - struct rt6_info *from; + struct fib6_info *from; /* * Tail elements of dst_entry (__refcnt etc.) 
@@ -242,20 +242,20 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } -static inline void fib6_clean_expires(struct rt6_info *f6i) +static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->rt6i_flags &= ~RTF_EXPIRES; f6i->expires = 0; } -static inline void fib6_set_expires(struct rt6_info *f6i, +static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->rt6i_flags |= RTF_EXPIRES; } -static inline bool fib6_check_expired(const struct rt6_info *f6i) +static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); @@ -288,7 +288,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, u32 *cookie) { struct fib6_node *fn; @@ -330,15 +330,15 @@ static inline void ip6_rt_put(struct rt6_info *rt) void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); -struct rt6_info *fib6_info_alloc(gfp_t gfp_flags); -void fib6_info_destroy(struct rt6_info *f6i); +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct fib6_info *f6i); -static inline void fib6_info_hold(struct rt6_info *f6i) +static inline void fib6_info_hold(struct fib6_info *f6i) { atomic_inc(&f6i->rt6i_ref); } -static inline void fib6_info_release(struct rt6_info *f6i) +static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) fib6_info_destroy(f6i); @@ -371,7 +371,7 @@ enum fib6_walk_state { struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; - struct rt6_info *leaf; + struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; @@ -435,7 +435,7 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ - struct rt6_info *rt; + struct fib6_info *rt; }; /* @@ -457,14 +457,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *saddr, int src_len, bool exact_match); -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); -int fib6_add(struct fib6_node *root, struct rt6_info *rt, +int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); -int fib6_del(struct rt6_info *rt, struct nl_info *info); +int fib6_del(struct fib6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); @@ -487,11 +487,11 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct net *net, struct rt6_info *rt); -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct fib6_info *rt); +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); -void fib6_metric_set(struct rt6_info *f6i, int metric, u32 
val); -static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 57d0d45667f1..d5fb1e4ae7ac 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,7 +66,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; @@ -102,14 +102,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct rt6_info *rt); -int ip6_del_rt(struct net *net, struct rt6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *rt); +int ip6_del_rt(struct net *net, struct fib6_info *rt); -void rt6_flush_exceptions(struct rt6_info *rt); -void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *rt); +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -136,9 +136,9 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast, - gfp_t gfp_flags); +struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); @@ -147,10 +147,10 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(struct net *net, +struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(struct net *net, +struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct rt6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *rt, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct rt6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *rt); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); @@ -271,7 +271,7 @@ static inline struct in6_addr 
*rt6_nexthop(struct rt6_info *rt, return daddr; } -static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) +static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 74e4e1e449d5..97b3a54579c8 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,7 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *fib6_null_entry; + struct fib6_info *fib6_null_entry; struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e533a447f68c..a294e86a9b25 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -170,7 +170,7 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event); static int addrconf_ifdown(struct net_device *dev, int how); -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags); @@ -994,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1178,7 +1178,7 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) { - struct rt6_info *rt; + struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, @@ -2348,13 +2348,13 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, } -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags) { struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX; @@ -2641,7 +2641,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) */ if (pinfo->onlink) { - struct rt6_info *rt; + struct fib6_info *rt; unsigned long rt_expires; /* Avoid arithmetic overflow. Really, we could @@ -3346,7 +3346,7 @@ static int fixup_permanent_addr(struct net *net, * case regenerate the host route. 
*/ if (!ifp->rt || !ifp->rt->rt6i_node) { - struct rt6_info *rt, *prev; + struct fib6_info *rt, *prev; rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, GFP_ATOMIC); @@ -3713,7 +3713,7 @@ restart: keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; bool keep; addrconf_del_dad_work(ifa); @@ -5626,7 +5626,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); if (!ipv6_addr_any(&ifp->peer_addr)) { - struct rt6_info *rt; + struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); @@ -5982,7 +5982,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct rt6_info *rt = ifa->rt; + struct fib6_info *rt = ifa->rt; int cpu; rcu_read_lock(); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 3db8fe10322b..0e35657501a7 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -218,7 +218,7 @@ static void aca_put(struct ifacaddr6 *ac) } } -static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, +static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, const struct in6_addr *addr) { struct inet6_dev *idev = rt->rt6i_idev; @@ -247,7 +247,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct rt6_info *rt; + struct fib6_info *rt; struct net *net; int err; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4d6bd033dccd..77cf43b2d858 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; - int (*func)(struct rt6_info *, void *arg); + int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; }; @@ -54,7 +54,7 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, @@ -105,7 +105,7 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct net *net, struct rt6_info *rt) +void fib6_update_sernum(struct net *net, struct fib6_info *rt) { struct fib6_node *fn; @@ -145,9 +145,9 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) { - struct rt6_info *f6i; + struct fib6_info *f6i; f6i = kzalloc(sizeof(*f6i), gfp_flags); if (!f6i) @@ -167,7 +167,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) return f6i; } -void fib6_info_destroy(struct rt6_info *f6i) +void fib6_info_destroy(struct fib6_info *f6i) { struct rt6_exception_bucket *bucket; struct dst_metrics *m; @@ -404,7 +404,7 @@ unsigned int fib6_tables_seq_read(struct net *net) static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, @@ -415,7 +415,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, 
enum fib_event_type event_type, - struct rt6_info *rt, + struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { @@ -432,7 +432,7 @@ struct fib6_dump_arg { struct notifier_block *nb; }; -static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) +static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) return; @@ -441,7 +441,7 @@ static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) static int fib6_node_dump(struct fib6_walker *w) { - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); @@ -490,7 +490,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) static int fib6_dump_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); @@ -507,7 +507,7 @@ static int fib6_dump_node(struct fib6_walker *w) */ if (rt->rt6i_nsiblings) rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, + struct fib6_info, rt6i_siblings); } w->leaf = NULL; @@ -643,7 +643,7 @@ out: return res; } -void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val) +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; @@ -690,7 +690,7 @@ static struct fib6_node *fib6_add_1(struct net *net, fn = root; do { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); @@ -884,7 +884,7 @@ insert_above: return ln; } -static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, +static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->rt6i_table; @@ -897,9 +897,9 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. */ while (fn) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_leaf; + struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); atomic_inc(&new_leaf->rt6i_ref); @@ -936,15 +936,15 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * Insert routing information in a node. */ -static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, +static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->rt6i_table->tb6_lock)); - struct rt6_info *iter = NULL; - struct rt6_info __rcu **ins; - struct rt6_info __rcu **fallback_ins = NULL; + struct fib6_info *iter = NULL; + struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -1035,7 +1035,7 @@ next_iter: /* Link this route to others same route. 
*/ if (rt->rt6i_nsiblings) { unsigned int rt6i_nsiblings; - struct rt6_info *sibling, *temp_sibling; + struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; @@ -1156,7 +1156,7 @@ add: return 0; } -static void fib6_start_gc(struct net *net, struct rt6_info *rt) +static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->rt6i_flags & RTF_EXPIRES)) @@ -1171,7 +1171,7 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void __fib6_update_sernum_upto_root(struct rt6_info *rt, +static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, @@ -1186,7 +1186,7 @@ static void __fib6_update_sernum_upto_root(struct rt6_info *rt, } } -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } @@ -1198,7 +1198,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) * Need to own table->tb6_lock */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, +int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->rt6i_table; @@ -1219,7 +1219,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(info->nl_net, table, root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, - offsetof(struct rt6_info, rt6i_dst), allow_create, + offsetof(struct fib6_info, rt6i_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); @@ -1260,7 +1260,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(info->nl_net, table, sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1279,7 +1279,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1316,7 +1316,7 @@ out: * super-tree leaf node we have to find a new one for it. 
*/ if (pn != fn) { - struct rt6_info *pn_leaf = + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { @@ -1365,7 +1365,7 @@ failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ + int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; @@ -1403,7 +1403,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1443,12 +1443,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad struct fib6_node *fn; struct lookup_args args[] = { { - .offset = offsetof(struct rt6_info, rt6i_dst), + .offset = offsetof(struct fib6_info, rt6i_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { - .offset = offsetof(struct rt6_info, rt6i_src), + .offset = offsetof(struct fib6_info, rt6i_src), .addr = saddr, }, #endif @@ -1484,7 +1484,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1533,7 +1533,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst), + offsetof(struct fib6_info, rt6i_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES @@ -1544,7 +1544,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), exact_match); } } @@ -1563,7 +1563,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { @@ -1622,11 +1622,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_fn_leaf; + struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; @@ -1717,10 +1717,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, - struct rt6_info __rcu **rtp, struct nl_info *info) + struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = rcu_dereference_protected(*rtp, + struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; @@ -1741,7 +1741,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Remove this entry from other siblings */ if (rt->rt6i_nsiblings) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, 
*next_sibling; list_for_each_entry_safe(sibling, next_sibling, &rt->rt6i_siblings, rt6i_siblings) @@ -1785,14 +1785,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } /* Need to own table->tb6_lock */ -int fib6_del(struct rt6_info *rt, struct nl_info *info) +int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info __rcu **rtp; - struct rt6_info __rcu **rtp_next; + struct fib6_info __rcu **rtp; + struct fib6_info __rcu **rtp_next; if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; @@ -1804,7 +1804,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { - struct rt6_info *cur = rcu_dereference_protected(*rtp, + struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { fib6_del_route(table, fn, rtp, info); @@ -1948,7 +1948,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w) static int fib6_clean_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, @@ -1983,7 +1983,7 @@ static int fib6_clean_node(struct fib6_walker *w) if (WARN_ON(!rt->rt6i_nsiblings)) continue; rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, rt6i_siblings); + struct fib6_info, rt6i_siblings); continue; } WARN_ON(res != 0); @@ -2002,7 +2002,7 @@ static int fib6_clean_node(struct fib6_walker *w) */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, - int (*func)(struct rt6_info *, void *arg), + int (*func)(struct fib6_info *, void *arg), int sernum, void *arg) { struct fib6_cleaner c; @@ -2020,7 +2020,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, } static void __fib6_clean_all(struct net *net, - int (*func)(struct rt6_info *, void *), + int (*func)(struct fib6_info *, void *), int sernum, void *arg) { struct fib6_table *table; @@ -2040,7 +2040,7 @@ static void __fib6_clean_all(struct net *net, rcu_read_unlock(); } -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); @@ -2057,7 +2057,7 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct rt6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; @@ -2261,7 +2261,7 @@ struct ipv6_route_iter { static int ipv6_route_seq_show(struct seq_file *seq, void *v) { - struct rt6_info *rt = v; + struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; const struct net_device *dev; @@ -2353,14 +2353,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; - struct rt6_info *n; + struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); + n = rcu_dereference_bh(((struct fib6_info *)v)->rt6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 
a28857088bff..102645298692 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1155,7 +1155,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct net *net; int lifetime; struct ndisc_options ndopts; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0d861bd07673..2ccf939e1a20 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -96,24 +96,24 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt6_score_route(struct rt6_info *rt, int oif, int strict); -static size_t rt6_nlmsg_size(struct rt6_info *rt); +static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static size_t rt6_nlmsg_size(struct fib6_info *rt); static int rt6_fill_node(struct net *net, struct sk_buff *skb, - struct rt6_info *rt, struct dst_entry *dst, + struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); @@ -283,7 +283,7 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; -static const struct rt6_info fib6_null_entry_template = { +static const struct fib6_info fib6_null_entry_template = { .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32)0, @@ -372,7 +372,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -425,13 +425,13 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(const struct net *net, - struct rt6_info *match, +static struct fib6_info *rt6_multipath_select(const struct net *net, + struct fib6_info *match, struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, *next_sibling; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -462,14 +462,14 @@ static struct rt6_info *rt6_multipath_select(const struct net *net, * Route lookup. rcu_read_lock() should be held. 
*/ -static inline struct rt6_info *rt6_device_match(struct net *net, - struct rt6_info *rt, +static inline struct fib6_info *rt6_device_match(struct net *net, + struct fib6_info *rt, const struct in6_addr *saddr, int oif, int flags) { - struct rt6_info *local = NULL; - struct rt6_info *sprt; + struct fib6_info *local = NULL; + struct fib6_info *sprt; if (!oif && ipv6_addr_any(saddr) && !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) @@ -532,7 +532,7 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct rt6_info *rt) +static void rt6_probe(struct fib6_info *rt) { struct __rt6_probe_work *work; const struct in6_addr *nh_gw; @@ -585,7 +585,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct rt6_info *rt) +static inline void rt6_probe(struct fib6_info *rt) { } #endif @@ -593,7 +593,7 @@ static inline void rt6_probe(struct rt6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct rt6_info *rt, int oif) +static inline int rt6_check_dev(struct fib6_info *rt, int oif) { const struct net_device *dev = rt->fib6_nh.nh_dev; @@ -605,7 +605,7 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) return 0; } -static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) +static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; @@ -637,8 +637,7 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) return ret; } -static int rt6_score_route(struct rt6_info *rt, int oif, - int strict) +static int rt6_score_route(struct fib6_info *rt, int oif, int strict) { int m; @@ -656,8 +655,8 @@ static int rt6_score_route(struct rt6_info *rt, int oif, return m; } -static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, - int *mpri, struct rt6_info *match, +static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, + int *mpri, struct fib6_info *match, bool *do_rr) { int m; @@ -696,13 +695,13 @@ out: return match; } -static struct rt6_info *find_rr_leaf(struct fib6_node *fn, - struct rt6_info *leaf, - struct rt6_info *rr_head, +static struct fib6_info *find_rr_leaf(struct fib6_node *fn, + struct fib6_info *leaf, + struct fib6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match, *cont; + struct fib6_info *rt, *match, *cont; int mpri = -1; match = NULL; @@ -735,11 +734,11 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, return match; } -static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, +static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); - struct rt6_info *match, *rt0; + struct fib6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *match, *rt0; bool do_rr = false; int key_plen; @@ -767,7 +766,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->rt6_next); + struct fib6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -785,7 +784,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, return match ? 
match : net->ipv6.fib6_null_entry; } -static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) { return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); } @@ -799,7 +798,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct in6_addr prefix_buf, *prefix; unsigned int pref; unsigned long lifetime; - struct rt6_info *rt; + struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; @@ -871,7 +870,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, */ /* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) +static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh.nh_dev; @@ -913,7 +912,7 @@ static int ip6_rt_type_to_error(u8 fib6_type) return fib6_prop[fib6_type]; } -static unsigned short fib6_info_dst_flags(struct rt6_info *rt) +static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; @@ -927,7 +926,7 @@ static unsigned short fib6_info_dst_flags(struct rt6_info *rt) return flags; } -static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); @@ -949,7 +948,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) } } -static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.flags |= fib6_info_dst_flags(ort); @@ -977,7 +976,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) rt->dst.lastuse = jiffies; } -static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) +static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; fib6_info_hold(from); @@ -989,7 +988,7 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) } } -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { ip6_rt_init_dst(rt, ort); @@ -1045,7 +1044,7 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, } /* called with rcu_lock held */ -static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) +static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) { unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev = rt->fib6_nh.nh_dev; @@ -1064,7 +1063,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct rt6_info *f6i; + struct fib6_info *f6i; struct fib6_node *fn; struct rt6_info *rt; @@ -1152,7 +1151,7 @@ EXPORT_SYMBOL(rt6_lookup); * Caller must hold dst before calling it. 
*/ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, +static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; @@ -1166,14 +1165,14 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, return err; } -int ip6_ins_rt(struct net *net, struct rt6_info *rt) +int ip6_ins_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net, }; return __ip6_ins_rt(rt, &info, NULL); } -static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, +static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1213,7 +1212,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, return rt; } -static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) +static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) { unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; @@ -1232,7 +1231,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) } /* It should be called with rcu_read_lock() acquired */ -static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1246,7 +1245,7 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) } static struct rt6_info *rt6_make_pcpu_route(struct net *net, - struct rt6_info *rt) + struct fib6_info *rt) { struct rt6_info *pcpu_rt, *prev, **p; @@ -1390,7 +1389,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } -static unsigned int fib6_mtu(const struct rt6_info *rt) +static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; @@ -1401,7 +1400,7 @@ static unsigned int fib6_mtu(const struct rt6_info *rt) } static int rt6_insert_exception(struct rt6_info *nrt, - struct rt6_info *ort) + struct fib6_info *ort) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; @@ -1487,7 +1486,7 @@ out: return err; } -void rt6_flush_exceptions(struct rt6_info *rt) +void rt6_flush_exceptions(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1517,7 +1516,7 @@ out: /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr) { @@ -1550,7 +1549,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@ -1595,7 +1594,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1625,7 +1624,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } -static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1667,7 +1666,7 @@ static bool rt6_mtu_change_route_allowed(struct 
inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct rt6_info *rt, int mtu) + struct fib6_info *rt, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1697,7 +1696,7 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct rt6_info *rt, +static void rt6_exceptions_clean_tohost(struct fib6_info *rt, struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; @@ -1776,7 +1775,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct rt6_info *rt, +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now) { @@ -1812,7 +1811,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *f6i; + struct fib6_info *f6i; struct rt6_info *rt; int strict = 0; @@ -2139,7 +2138,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ -static bool fib6_check(struct rt6_info *f6i, u32 cookie) +static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; @@ -2374,7 +2373,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL, *rt_cache; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2620,7 +2619,7 @@ out: return entries > rt_max_size; } -static int ip6_convert_metrics(struct net *net, struct rt6_info *rt, +static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, struct fib6_config *cfg) { int err = 0; @@ -2823,12 +2822,12 @@ out: return err; } -static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, +static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; @@ -3047,7 +3046,7 @@ out: int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { - struct rt6_info *rt; + struct fib6_info *rt; int err; rt = ip6_route_info_create(cfg, gfp_flags, extack); @@ -3060,7 +3059,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, return err; } -static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; @@ -3081,14 +3080,14 @@ out: return err; } -int ip6_del_rt(struct net *net, struct rt6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net }; return __ip6_del_rt(rt, &info); } -static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; @@ -3102,7 +3101,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, 
*next_sibling; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); @@ -3159,8 +3158,9 @@ out: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *rt_cache; struct fib6_table *table; + struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; @@ -3336,7 +3336,7 @@ out: } #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) @@ -3344,7 +3344,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; int ifindex = dev->ifindex; struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); @@ -3363,7 +3363,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); break; } out: @@ -3371,7 +3371,7 @@ out: return rt; } -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, @@ -3404,12 +3404,12 @@ static struct rt6_info *rt6_add_route_info(struct net *net, } #endif -struct rt6_info *rt6_get_dflt_router(struct net *net, +struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_table *table; table = fib6_get_table(net, tb_id); @@ -3424,12 +3424,12 @@ struct rt6_info *rt6_get_dflt_router(struct net *net, break; } if (rt) - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); rcu_read_unlock(); return rt; } -struct rt6_info *rt6_add_dflt_router(struct net *net, +struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) @@ -3463,7 +3463,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, static void __rt6_purge_dflt_routers(struct net *net, struct fib6_table *table) { - struct rt6_info *rt; + struct fib6_info *rt; restart: rcu_read_lock(); @@ -3614,14 +3614,14 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. */ -struct rt6_info *addrconf_dst_alloc(struct net *net, +struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; - struct rt6_info *rt; + struct fib6_info *rt; rt = fib6_info_alloc(gfp_flags); if (!rt) @@ -3661,7 +3661,7 @@ struct arg_dev_net_ip { struct in6_addr *addr; }; -static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; struct net *net = ((struct arg_dev_net_ip *)arg)->net; @@ -3694,7 +3694,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) /* Remove routers and update dst entries when gateway turn into host. 
*/ -static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; @@ -3725,9 +3725,9 @@ struct arg_netdev_event { }; }; -static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->rt6i_node, @@ -3745,7 +3745,7 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) return NULL; } -static bool rt6_is_dead(const struct rt6_info *rt) +static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && @@ -3755,9 +3755,9 @@ static bool rt6_is_dead(const struct rt6_info *rt) return false; } -static int rt6_multipath_total_weight(const struct rt6_info *rt) +static int rt6_multipath_total_weight(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) @@ -3771,7 +3771,7 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) return total; } -static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; @@ -3783,9 +3783,9 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); } -static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { - struct rt6_info *iter; + struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); @@ -3794,9 +3794,9 @@ static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) rt6_upper_bound_set(iter, &weight, total); } -void rt6_multipath_rebalance(struct rt6_info *rt) +void rt6_multipath_rebalance(struct fib6_info *rt) { - struct rt6_info *first; + struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, @@ -3818,7 +3818,7 @@ void rt6_multipath_rebalance(struct rt6_info *rt) rt6_multipath_upper_bound_set(first, total); } -static int fib6_ifup(struct rt6_info *rt, void *p_arg) +static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); @@ -3847,10 +3847,10 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } -static bool rt6_multipath_uses_dev(const struct rt6_info *rt, +static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { - struct rt6_info *iter; + struct fib6_info *iter; if (rt->fib6_nh.nh_dev == dev) return true; @@ -3861,19 +3861,19 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt, return false; } -static void rt6_multipath_flush(struct rt6_info *rt) +static void rt6_multipath_flush(struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) iter->should_flush = 1; } -static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, +static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { - struct rt6_info *iter; + struct fib6_info *iter; unsigned int dead = 0; if 
(rt->fib6_nh.nh_dev == down_dev || @@ -3887,11 +3887,11 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, return dead; } -static void rt6_multipath_nh_flags_set(struct rt6_info *rt, +static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned int nh_flags) { - struct rt6_info *iter; + struct fib6_info *iter; if (rt->fib6_nh.nh_dev == dev) rt->fib6_nh.nh_flags |= nh_flags; @@ -3901,7 +3901,7 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt, } /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *p_arg) +static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; @@ -3968,7 +3968,7 @@ struct rt6_mtu_change_arg { unsigned int mtu; }; -static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4155,7 +4155,7 @@ errout: } struct rt6_nh { - struct rt6_info *rt6_info; + struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head next; }; @@ -4173,21 +4173,22 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) static int ip6_route_info_append(struct net *net, struct list_head *rt6_nh_list, - struct rt6_info *rt, struct fib6_config *r_cfg) + struct fib6_info *rt, + struct fib6_config *r_cfg) { struct rt6_nh *nh; int err = -EEXIST; list_for_each_entry(nh, rt6_nh_list, next) { - /* check if rt6_info already exists */ - if (rt6_duplicate_nexthop(nh->rt6_info, rt)) + /* check if fib6_info already exists */ + if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return err; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; - nh->rt6_info = rt; + nh->fib6_info = rt; err = ip6_convert_metrics(net, rt, r_cfg); if (err) { kfree(nh); @@ -4199,8 +4200,8 @@ static int ip6_route_info_append(struct net *net, return 0; } -static void ip6_route_mpath_notify(struct rt6_info *rt, - struct rt6_info *rt_last, +static void ip6_route_mpath_notify(struct fib6_info *rt, + struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { @@ -4212,7 +4213,7 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, */ if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { rt = list_first_entry(&rt_last->rt6i_siblings, - struct rt6_info, + struct fib6_info, rt6i_siblings); } @@ -4223,11 +4224,11 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_notif = NULL, *rt_last = NULL; + struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct rt6_info *rt; + struct fib6_info *rt; struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; __u16 nlflags; @@ -4247,7 +4248,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of - * rt6_info structs per nexthop + * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); @@ -4297,16 +4298,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { - rt_last = nh->rt6_info; - err = __ip6_ins_rt(nh->rt6_info, info, extack); - 
fib6_info_release(nh->rt6_info);
+		rt_last = nh->fib6_info;
+		err = __ip6_ins_rt(nh->fib6_info, info, extack);
+		fib6_info_release(nh->fib6_info);

 		/* save reference to first route for notification */
 		if (!rt_notif && !err)
-			rt_notif = nh->rt6_info;
+			rt_notif = nh->fib6_info;

-		/* nh->rt6_info is used or freed at this point, reset to NULL*/
-		nh->rt6_info = NULL;
+		/* nh->fib6_info is used or freed at this point, reset to NULL*/
+		nh->fib6_info = NULL;
 		if (err) {
 			if (replace && nhn)
 				ip6_print_replace_route_err(&rt6_nh_list);
@@ -4347,8 +4348,8 @@ add_errout:
 cleanup:
 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
-		if (nh->rt6_info)
-			fib6_info_release(nh->rt6_info);
+		if (nh->fib6_info)
+			fib6_info_release(nh->fib6_info);
 		list_del(&nh->next);
 		kfree(nh);
 	}
@@ -4428,7 +4429,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return ip6_route_add(&cfg, GFP_KERNEL, extack);
 }

-static size_t rt6_nlmsg_size(struct rt6_info *rt)
+static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
 	int nexthop_len = 0;

@@ -4458,7 +4459,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 	       + nexthop_len;
 }

-static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
+static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
@@ -4495,7 +4496,7 @@ nla_put_failure:
 }

 /* add multipath next hop */
-static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
+static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
 {
 	const struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rtnexthop *rtnh;
@@ -4523,7 +4524,7 @@ nla_put_failure:
 }

 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
-			 struct rt6_info *rt, struct dst_entry *dst,
+			 struct fib6_info *rt, struct dst_entry *dst,
 			 struct in6_addr *dest, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
 			 unsigned int flags)
@@ -4613,7 +4614,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	 * each as a nexthop within RTA_MULTIPATH.
 	 */
 	if (rt->rt6i_nsiblings) {
-		struct rt6_info *sibling, *next_sibling;
+		struct fib6_info *sibling, *next_sibling;
 		struct nlattr *mp;

 		mp = nla_nest_start(skb, RTA_MULTIPATH);
@@ -4655,7 +4656,7 @@ nla_put_failure:
 	return -EMSGSIZE;
 }

-int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
 	struct net *net = arg->net;
@@ -4800,7 +4801,7 @@ errout:
 	return err;
 }

-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int nlm_flags)
 {
 	struct sk_buff *skb;
--
cgit v1.2.3


From 77634cc67dc1ffc8ae6d869af6dee4b2ea6025ee Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 17 Apr 2018 17:33:27 -0700
Subject: net/ipv6: Remove unused code and variables for rt6_info

Drop unneeded elements from rt6_info struct and rearrange layout to
something more relevant for the data path.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_fib.h   | 60 +++----------------------------------------------
 net/ipv6/ip6_fib.c      | 22 ------------------
 net/ipv6/route.c        | 27 ++--------------------
 net/ipv6/xfrm6_policy.c |  2 --
 4 files changed, 5 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index d41b7bd69fb3..a36116b92100 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -175,58 +175,20 @@ struct fib6_info {

 struct rt6_info {
 	struct dst_entry		dst;
-	struct rt6_info __rcu		*rt6_next;
 	struct fib6_info		*from;

-	/*
-	 * Tail elements of dst_entry (__refcnt etc.)
-	 * and these elements (rarely used in hot path) are in
-	 * the same cache line.
-	 */
-	struct fib6_table		*rt6i_table;
-	struct fib6_node __rcu		*rt6i_node;
-
+	struct rt6key			rt6i_dst;
+	struct rt6key			rt6i_src;
 	struct in6_addr			rt6i_gateway;
-
-	/* Multipath routes:
-	 * siblings is a list of rt6_info that have the the same metric/weight,
-	 * destination, but not the same gateway. nsiblings is just a cache
-	 * to speed up lookup.
-	 */
-	struct list_head		rt6i_siblings;
-	unsigned int			rt6i_nsiblings;
-
-	atomic_t			rt6i_ref;
-
-	/* These are in a separate cache line. */
-	struct rt6key			rt6i_dst ____cacheline_aligned_in_smp;
+	struct inet6_dev		*rt6i_idev;
 	u32				rt6i_flags;
-	struct rt6key			rt6i_src;
 	struct rt6key			rt6i_prefsrc;

 	struct list_head		rt6i_uncached;
 	struct uncached_list		*rt6i_uncached_list;
-	struct inet6_dev		*rt6i_idev;
-	struct rt6_info * __percpu	*rt6i_pcpu;
-	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
-
-	u32				rt6i_metric;

 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
-	u8				rt6i_protocol;
-	u8				fib6_type;
-	u8				exception_bucket_flushed:1,
-					should_flush:1,
-					dst_nocount:1,
-					dst_nopolicy:1,
-					dst_host:1,
-					unused:3;
-
-	unsigned long			expires;
-	struct dst_metrics		*fib6_metrics;
-#define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
-
-	struct fib6_nh			fib6_nh;
 };

 #define for_each_fib6_node_rt_rcu(fn)	\
@@ -328,8 +290,6 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 	dst_release(&rt->dst);
 }

-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
-
 struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
 void fib6_info_destroy(struct fib6_info *f6i);

@@ -344,20 +304,6 @@ static inline void fib6_info_release(struct fib6_info *f6i)
 		fib6_info_destroy(f6i);
 }

-static inline void rt6_hold(struct rt6_info *rt)
-{
-	atomic_inc(&rt->rt6i_ref);
-}
-
-static inline void rt6_release(struct rt6_info *rt)
-{
-	if (atomic_dec_and_test(&rt->rt6i_ref)) {
-		rt6_free_pcpu(rt);
-		dst_dev_put(&rt->dst);
-		dst_release(&rt->dst);
-	}
-}
-
 enum fib6_walk_state {
 #ifdef CONFIG_IPV6_SUBTREES
 	FWS_S,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 77cf43b2d858..2ab49b7cac22 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -240,28 +240,6 @@ static void node_free(struct net *net, struct fib6_node *fn)
 	net->ipv6.rt6_stats->fib_nodes--;
 }

-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
-{
-	int cpu;
-
-	if (!non_pcpu_rt->rt6i_pcpu)
-		return;
-
-	for_each_possible_cpu(cpu) {
-		struct rt6_info **ppcpu_rt;
-		struct rt6_info *pcpu_rt;
-
-		ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
-		pcpu_rt = *ppcpu_rt;
-		if (pcpu_rt) {
-			dst_dev_put(&pcpu_rt->dst);
-			dst_release(&pcpu_rt->dst);
-			*ppcpu_rt = NULL;
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(rt6_free_pcpu);
-
 static void fib6_free_table(struct fib6_table *table)
 {
 	inetpeer_invalidate_tree(&table->tb6_peers);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2ccf939e1a20..f9c363327d62 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -302,10 +302,6 @@ static const struct rt6_info ip6_null_entry_template = {
 		.output		= ip6_pkt_discard_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_UNREACHABLE,
 };

 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -320,10 +316,6 @@ static const struct rt6_info ip6_prohibit_entry_template = {
 		.output		= ip6_pkt_prohibit_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_PROHIBIT,
 };

 static const struct rt6_info ip6_blk_hole_entry_template = {
@@ -336,10 +328,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 		.output		= dst_discard_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_BLACKHOLE,
 };

 #endif
@@ -349,7 +337,6 @@ static void rt6_info_init(struct rt6_info *rt)
 	struct dst_entry *dst = &rt->dst;

 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
-	INIT_LIST_HEAD(&rt->rt6i_siblings);
 	INIT_LIST_HEAD(&rt->rt6i_uncached);
 }

@@ -999,12 +986,10 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
 	rt->rt6i_flags = ort->rt6i_flags;
 	rt6_set_from(rt, ort);
-	rt->rt6i_metric = ort->rt6i_metric;
 #ifdef CONFIG_IPV6_SUBTREES
 	rt->rt6i_src = ort->rt6i_src;
 #endif
 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
-	rt->rt6i_table = ort->rt6i_table;
 	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 }

@@ -1192,7 +1177,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,

 	ip6_rt_copy_init(rt, ort);
 	rt->rt6i_flags |= RTF_CACHE;
-	rt->rt6i_metric = 0;
 	rt->dst.flags |= DST_HOST;
 	rt->rt6i_dst.addr = *daddr;
 	rt->rt6i_dst.plen = 128;
@@ -1225,7 +1209,6 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
 	if (!pcpu_rt)
 		return NULL;
 	ip6_rt_copy_init(pcpu_rt, rt);
-	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
 	pcpu_rt->rt6i_flags |= RTF_PCPU;
 	return pcpu_rt;
 }
@@ -1279,9 +1262,8 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
 		return;

 	net = dev_net(rt6_ex->rt6i->dst.dev);
-	rt6_ex->rt6i->rt6i_node = NULL;
 	hlist_del_rcu(&rt6_ex->hlist);
-	ip6_rt_put(rt6_ex->rt6i);
+	dst_release(&rt6_ex->rt6i->dst);
 	kfree_rcu(rt6_ex, rcu);
 	WARN_ON_ONCE(!bucket->depth);
 	bucket->depth--;
@@ -1463,8 +1445,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	}
 	rt6_ex->rt6i = nrt;
 	rt6_ex->stamp = jiffies;
-	atomic_inc(&nrt->rt6i_ref);
-	nrt->rt6i_node = ort->rt6i_node;
 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
 	bucket->depth++;
 	net->ipv6.rt6_stats->fib_rt_cache++;
@@ -2122,7 +2102,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 		rt->rt6i_idev = in6_dev_get(loopback_dev);
 		rt->rt6i_gateway = ort->rt6i_gateway;
 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
-		rt->rt6i_metric = 0;

 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 #ifdef CONFIG_IPV6_SUBTREES
@@ -2247,8 +2226,7 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
 {
 	return !(rt->rt6i_flags & RTF_CACHE) &&
-		(rt->rt6i_flags & RTF_PCPU ||
-		 rcu_access_pointer(rt->rt6i_node));
+		(rt->rt6i_flags & RTF_PCPU || rt->from);
 }

 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@ -3313,7 +3291,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	if (on_link)
 		nrt->rt6i_flags &= ~RTF_GATEWAY;

-	nrt->rt6i_protocol = RTPROT_REDIRECT;
 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

 	/* No need to remove rt from the exception table if rt is
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 416fe67271a9..2cff209d0fc1 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -107,8 +107,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	 * it was magically lost, so this code needs audit */
 	xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL);
-	xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
-	xdst->u.rt6.rt6i_node = rt->rt6i_node;
 	xdst->route_cookie = rt6_get_cookie(rt);
 	xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
 	xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
--
cgit v1.2.3


From 24ada03555505205b0c8b8b796d52926600bf947 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Wed, 18 Apr 2018 15:40:41 +0100
Subject: ASoC: topology: Fix build errors

The two commits:

81e9b0a07889 ASoC: topology: Give more data to clients via callbacks
28aa6f7779f7 ASoC: topology: Add callback for DAPM route load/unload

break the build so revert them.

Reported-by: Stephen Rothwell
Signed-off-by: Mark Brown
---
 include/sound/soc-topology.h           | 30 +++++++----------------
 sound/soc/intel/skylake/skl-pcm.c      |  7 +++---
 sound/soc/intel/skylake/skl-topology.c |  5 ++--
 sound/soc/intel/skylake/skl-topology.h | 20 ++++++++++++----
 sound/soc/soc-topology.c               | 44 ++++++++++------------------------
 5 files changed, 42 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-topology.h b/include/sound/soc-topology.h
index 401ef2c45d6c..f552c3f56368 100644
--- a/include/sound/soc-topology.h
+++ b/include/sound/soc-topology.h
@@ -30,9 +30,6 @@ struct snd_soc_dapm_context;
 struct snd_soc_card;
 struct snd_kcontrol_new;
 struct snd_soc_dai_link;
-struct snd_soc_dai_driver;
-struct snd_soc_dai;
-struct snd_soc_dapm_route;

 /* object scan be loaded and unloaded in groups with identfying indexes */
 #define SND_SOC_TPLG_INDEX_ALL	0	/* ID that matches all FW objects */
@@ -112,44 +109,35 @@ struct snd_soc_tplg_widget_events {
 struct snd_soc_tplg_ops {

 	/* external kcontrol init - used for any driver specific init */
-	int (*control_load)(struct snd_soc_component *, int index,
+	int (*control_load)(struct snd_soc_component *,
 		struct snd_kcontrol_new *, struct snd_soc_tplg_ctl_hdr *);
 	int (*control_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

-	/* DAPM graph route element loading and unloading */
-	int (*dapm_route_load)(struct snd_soc_component *, int index,
-		struct snd_soc_dapm_route *route);
-	int (*dapm_route_unload)(struct snd_soc_component *,
-		struct snd_soc_dobj *);
-
 	/* external widget init - used for any driver specific init */
-	int (*widget_load)(struct snd_soc_component *, int index,
+	int (*widget_load)(struct snd_soc_component *,
 		struct snd_soc_dapm_widget *,
 		struct snd_soc_tplg_dapm_widget *);
-	int (*widget_ready)(struct snd_soc_component *, int index,
+	int (*widget_ready)(struct snd_soc_component *,
 		struct snd_soc_dapm_widget *,
 		struct snd_soc_tplg_dapm_widget *);
 	int (*widget_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

 	/* FE DAI - used for any driver specific init */
-	int (*dai_load)(struct snd_soc_component *, int index,
-		struct snd_soc_dai_driver *dai_drv,
-		struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai);
-
+	int (*dai_load)(struct snd_soc_component *,
+		struct snd_soc_dai_driver *dai_drv);
 	int (*dai_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

 	/* DAI link - used for any driver specific init */
-	int (*link_load)(struct snd_soc_component *, int index,
-		struct snd_soc_dai_link *link,
-		struct snd_soc_tplg_link_config *cfg);
+	int (*link_load)(struct snd_soc_component *,
+		struct snd_soc_dai_link *link);
 	int (*link_unload)(struct snd_soc_component *,
 		struct snd_soc_dobj *);

 	/* callback to handle vendor bespoke data */
-	int (*vendor_load)(struct snd_soc_component *, int index,
+	int (*vendor_load)(struct snd_soc_component *,
 		struct snd_soc_tplg_hdr *);
 	int (*vendor_unload)(struct snd_soc_component *,
 		struct snd_soc_tplg_hdr *);
@@ -158,7 +146,7 @@ struct snd_soc_tplg_ops {
 	void (*complete)(struct snd_soc_component *);

 	/* manifest - optional to inform component of manifest */
-	int (*manifest)(struct snd_soc_component *, int index,
+	int (*manifest)(struct snd_soc_component *,
 		struct snd_soc_tplg_manifest *);

 	/* vendor specific kcontrol handlers available for binding */
diff --git a/sound/soc/intel/skylake/skl-pcm.c b/sound/soc/intel/skylake/skl-pcm.c
index 1f4dd08d36c5..afa86b9e4dcf 100644
--- a/sound/soc/intel/skylake/skl-pcm.c
+++ b/sound/soc/intel/skylake/skl-pcm.c
@@ -1017,11 +1017,10 @@ static struct snd_soc_dai_driver skl_platform_dai[] = {
 	},
 };

-int skl_dai_load(struct snd_soc_component *cmp, int index,
-		 struct snd_soc_dai_driver *dai_drv,
-		 struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai)
+int skl_dai_load(struct snd_soc_component *cmp,
+		 struct snd_soc_dai_driver *pcm_dai)
 {
-	dai_drv->ops = &skl_pcm_dai_ops;
+	pcm_dai->ops = &skl_pcm_dai_ops;

 	return 0;
 }
diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c
index 6ac081f1f215..3b1dca419883 100644
--- a/sound/soc/intel/skylake/skl-topology.c
+++ b/sound/soc/intel/skylake/skl-topology.c
@@ -2851,7 +2851,7 @@ void skl_cleanup_resources(struct skl *skl)
 * information to the driver about module and pipeline parameters which DSP
 * FW expects like ids, resource values, formats etc
 */
-static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, int index,
+static int skl_tplg_widget_load(struct snd_soc_component *cmpnt,
 				struct snd_soc_dapm_widget *w,
 				struct snd_soc_tplg_dapm_widget *tplg_w)
 {
@@ -2958,7 +2958,6 @@ static int skl_init_enum_data(struct device *dev, struct soc_enum *se,
 }

 static int skl_tplg_control_load(struct snd_soc_component *cmpnt,
-				int index,
 				struct snd_kcontrol_new *kctl,
 				struct snd_soc_tplg_ctl_hdr *hdr)
 {
@@ -3447,7 +3446,7 @@ static int skl_tplg_get_manifest_data(struct snd_soc_tplg_manifest *manifest,
 	return 0;
 }

-static int skl_manifest_load(struct snd_soc_component *cmpnt, int index,
+static int skl_manifest_load(struct snd_soc_component *cmpnt,
 			struct snd_soc_tplg_manifest *manifest)
 {
 	struct hdac_ext_bus *ebus = snd_soc_component_get_drvdata(cmpnt);
diff --git a/sound/soc/intel/skylake/skl-topology.h b/sound/soc/intel/skylake/skl-topology.h
index 77857c598eed..b1e0667c0ae0 100644
--- a/sound/soc/intel/skylake/skl-topology.h
+++ b/sound/soc/intel/skylake/skl-topology.h
@@ -221,9 +221,18 @@ struct skl_mod_inst_map {
 	u16 inst_id;
 };

+struct skl_uuid_inst_map {
+	u16 inst_id;
+	u16 reserved;
+	uuid_le mod_uuid;
+} __packed;
+
 struct skl_kpb_params {
 	u32 num_modules;
-	struct skl_mod_inst_map map[0];
+	union {
+		struct skl_mod_inst_map map[0];
+		struct skl_uuid_inst_map map_uuid[0];
+	} u;
 };

 struct skl_module_inst_id {
@@ -460,7 +469,7 @@ int skl_dsp_set_dma_control(struct skl_sst *ctx, u32 *caps,
 			u32 caps_size, u32 node_id);
 void skl_tplg_set_be_dmic_config(struct snd_soc_dai *dai,
 	struct skl_pipe_params *params, int stream);
-int skl_tplg_init(struct snd_soc_platform *platform,
+int skl_tplg_init(struct snd_soc_component *component,
 		struct hdac_ext_bus *ebus);
 struct skl_module_cfg *skl_tplg_fe_get_cpr_module(
 		struct snd_soc_dai *dai, int stream);
@@ -503,7 +512,8 @@ int skl_pcm_host_dma_prepare(struct device *dev,
 int skl_pcm_link_dma_prepare(struct device *dev,
 		struct skl_pipe_params *params);

-int skl_dai_load(struct snd_soc_component *, int index,
-		 struct snd_soc_dai_driver *dai_drv,
-		 struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai);
+int skl_dai_load(struct snd_soc_component *cmp,
+		 struct snd_soc_dai_driver *pcm_dai);
+void skl_tplg_add_moduleid_in_bind_params(struct skl *skl,
+				struct snd_soc_dapm_widget *w);
 #endif
diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index b95a9ab0b526..de08693be9e1 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -315,7 +315,7 @@ static int soc_tplg_vendor_load_(struct soc_tplg *tplg,
 	int ret = 0;

 	if (tplg->comp && tplg->ops && tplg->ops->vendor_load)
-		ret = tplg->ops->vendor_load(tplg->comp, tplg->index, hdr);
+		ret = tplg->ops->vendor_load(tplg->comp, hdr);
 	else {
 		dev_err(tplg->dev, "ASoC: no vendor load callback for ID %d\n",
 			hdr->vendor_type);
@@ -347,8 +347,7 @@ static int soc_tplg_widget_load(struct soc_tplg *tplg,
 	struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->widget_load)
-		return tplg->ops->widget_load(tplg->comp, tplg->index, w,
-					      tplg_w);
+		return tplg->ops->widget_load(tplg->comp, w, tplg_w);

 	return 0;
 }
@@ -359,30 +358,27 @@ static int soc_tplg_widget_ready(struct soc_tplg *tplg,
 	struct snd_soc_dapm_widget *w, struct snd_soc_tplg_dapm_widget *tplg_w)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->widget_ready)
-		return tplg->ops->widget_ready(tplg->comp, tplg->index, w,
-					       tplg_w);
+		return tplg->ops->widget_ready(tplg->comp, w, tplg_w);

 	return 0;
 }

 /* pass DAI configurations to component driver for extra initialization */
 static int soc_tplg_dai_load(struct soc_tplg *tplg,
-	struct snd_soc_dai_driver *dai_drv,
-	struct snd_soc_tplg_pcm *pcm, struct snd_soc_dai *dai)
+	struct snd_soc_dai_driver *dai_drv)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->dai_load)
-		return tplg->ops->dai_load(tplg->comp, tplg->index, dai_drv,
-					   pcm, dai);
+		return tplg->ops->dai_load(tplg->comp, dai_drv);

 	return 0;
 }

 /* pass link configurations to component driver for extra initialization */
 static int soc_tplg_dai_link_load(struct soc_tplg *tplg,
-	struct snd_soc_dai_link *link, struct snd_soc_tplg_link_config *cfg)
+	struct snd_soc_dai_link *link)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->link_load)
-		return tplg->ops->link_load(tplg->comp, tplg->index, link, cfg);
+		return tplg->ops->link_load(tplg->comp, link);

 	return 0;
 }
@@ -703,8 +699,7 @@ static int soc_tplg_init_kcontrol(struct soc_tplg *tplg,
 	struct snd_kcontrol_new *k, struct snd_soc_tplg_ctl_hdr *hdr)
 {
 	if (tplg->comp && tplg->ops && tplg->ops->control_load)
-		return tplg->ops->control_load(tplg->comp, tplg->index, k,
-					       hdr);
+		return tplg->ops->control_load(tplg->comp, k, hdr);

 	return 0;
 }
@@ -1161,17 +1156,6 @@ static int soc_tplg_kcontrol_elems_load(struct soc_tplg *tplg,
 	return 0;
 }

-/* optionally pass new dynamic kcontrol to component driver. */
*/ -static int soc_tplg_add_route(struct soc_tplg *tplg, - struct snd_soc_dapm_route *route) -{ - if (tplg->comp && tplg->ops && tplg->ops->dapm_route_load) - return tplg->ops->dapm_route_load(tplg->comp, tplg->index, - route); - - return 0; -} - static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, struct snd_soc_tplg_hdr *hdr) { @@ -1220,8 +1204,6 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, else route.control = elem->control; - soc_tplg_add_route(tplg, &route); - /* add route, but keep going if some fail */ snd_soc_dapm_add_routes(dapm, &route, 1); } @@ -1776,7 +1758,7 @@ static int soc_tplg_dai_create(struct soc_tplg *tplg, dai_drv->compress_new = snd_soc_new_compress; /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv, pcm, NULL); + ret = soc_tplg_dai_load(tplg, dai_drv); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); kfree(dai_drv); @@ -1846,7 +1828,7 @@ static int soc_tplg_fe_link_create(struct soc_tplg *tplg, set_link_flags(link, pcm->flag_mask, pcm->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link, NULL); + ret = soc_tplg_dai_link_load(tplg, link); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: FE link loading failed\n"); kfree(link); @@ -2154,7 +2136,7 @@ static int soc_tplg_link_config(struct soc_tplg *tplg, set_link_flags(link, cfg->flag_mask, cfg->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_link_load(tplg, link, cfg); + ret = soc_tplg_dai_link_load(tplg, link); if (ret < 0) { dev_err(tplg->dev, "ASoC: physical link loading failed\n"); return ret; @@ -2276,7 +2258,7 @@ static int soc_tplg_dai_config(struct soc_tplg *tplg, set_dai_flags(dai_drv, d->flag_mask, d->flags); /* pass control to component driver for optional further init */ - ret = soc_tplg_dai_load(tplg, dai_drv, NULL, dai); + ret = soc_tplg_dai_load(tplg, dai_drv); if (ret < 0) { dev_err(tplg->comp->dev, "ASoC: DAI loading failed\n"); return ret; @@ -2382,7 +2364,7 @@ static int soc_tplg_manifest_load(struct soc_tplg *tplg, /* pass control to component driver for optional further init */ if (tplg->comp && tplg->ops && tplg->ops->manifest) - return tplg->ops->manifest(tplg->comp, tplg->index, _manifest); + return tplg->ops->manifest(tplg->comp, _manifest); if (!abi_match) /* free the duplicated one */ kfree(_manifest); -- cgit v1.2.3 From 91e2dd0a47bae19600f13dcc9e0761082c50afa6 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 28 Mar 2018 13:19:25 -0500 Subject: ipmi: Add a panic handler for IPMI users Users of the IPMI code had their own panic handlers, but the ordering was not necessarily right: the base IPMI code would need to handle the panic first, and the user had no way to know whether the IPMI interface could run at panic time. Add a panic handler to the user interface; it is called if non-NULL and if the interface the user is on is capable of panic handling. It also cleans up the panic log handling a bit to reuse the existing interface loop in the main panic handler.
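To make the new hook concrete, here is a minimal sketch of a user registering a panic handler (not part of the patch; the function names and log message are invented for illustration):

	/* Illustrative only: a user that wants to act at panic time. */
	static void my_panic_handler(void *handler_data)
	{
		/*
		 * Called after the interface has been put into
		 * run-to-completion mode; keep this short and do not
		 * sleep or take locks that may be held at panic time.
		 */
		pr_emerg("example: IPMI interface usable at panic\n");
	}

	static void my_recv_handler(struct ipmi_recv_msg *msg,
				    void *user_msg_data)
	{
		ipmi_free_recv_msg(msg);
	}

	static const struct ipmi_user_hndl my_user_hndl = {
		.ipmi_recv_hndl     = my_recv_handler,
		.ipmi_panic_handler = my_panic_handler, /* NULL = not interested */
	};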
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 214 ++++++++++++++++++------------------ include/linux/ipmi.h | 6 + 2 files changed, 110 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index dcfbf2e3c8c5..9ffbb5f9c7bd 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4956,13 +4956,15 @@ static void device_id_fetcher(ipmi_smi_t intf, struct ipmi_recv_msg *msg) } } -static void send_panic_events(char *str) +static void send_panic_events(ipmi_smi_t intf, char *str) { - struct kernel_ipmi_msg msg; - ipmi_smi_t intf; - unsigned char data[16]; + struct kernel_ipmi_msg msg; + unsigned char data[16]; struct ipmi_system_interface_addr *si; - struct ipmi_addr addr; + struct ipmi_addr addr; + char *p = str; + struct ipmi_ipmb_addr *ipmb; + int j; if (ipmi_send_panic_event == IPMI_SEND_PANIC_EVENT_NONE) return; @@ -4993,15 +4995,8 @@ static void send_panic_events(char *str) data[7] = str[2]; } - /* For every registered interface, send the event. */ - list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - if (!intf->handlers || !intf->handlers->poll) - /* Interface is not ready or can't run at panic time. */ - continue; - - /* Send the event announcing the panic. */ - ipmi_panic_request_and_wait(intf, &addr, &msg); - } + /* Send the event announcing the panic. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); /* * On every interface, dump a bunch of OEM event holding the @@ -5010,111 +5005,100 @@ static void send_panic_events(char *str) if (ipmi_send_panic_event != IPMI_SEND_PANIC_EVENT_STRING || !str) return; - /* For every registered interface, send the event. */ - list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - char *p = str; - struct ipmi_ipmb_addr *ipmb; - int j; - - if (intf->intf_num == -1) - /* Interface was not ready yet. */ - continue; + /* + * intf_num is used as an marker to tell if the + * interface is valid. Thus we need a read barrier to + * make sure data fetched before checking intf_num + * won't be used. + */ + smp_rmb(); - /* - * intf_num is used as an marker to tell if the - * interface is valid. Thus we need a read barrier to - * make sure data fetched before checking intf_num - * won't be used. - */ - smp_rmb(); + /* + * First job here is to figure out where to send the + * OEM events. There's no way in IPMI to send OEM + * events using an event send command, so we have to + * find the SEL to put them in and stick them in + * there. + */ - /* - * First job here is to figure out where to send the - * OEM events. There's no way in IPMI to send OEM - * events using an event send command, so we have to - * find the SEL to put them in and stick them in - * there. - */ + /* Get capabilities from the get device id. */ + intf->local_sel_device = 0; + intf->local_event_generator = 0; + intf->event_receiver = 0; - /* Get capabilities from the get device id. */ - intf->local_sel_device = 0; - intf->local_event_generator = 0; - intf->event_receiver = 0; + /* Request the device info from the local MC. */ + msg.netfn = IPMI_NETFN_APP_REQUEST; + msg.cmd = IPMI_GET_DEVICE_ID_CMD; + msg.data = NULL; + msg.data_len = 0; + intf->null_user_handler = device_id_fetcher; + ipmi_panic_request_and_wait(intf, &addr, &msg); - /* Request the device info from the local MC. */ - msg.netfn = IPMI_NETFN_APP_REQUEST; - msg.cmd = IPMI_GET_DEVICE_ID_CMD; + if (intf->local_event_generator) { + /* Request the event receiver from the local MC. 
*/ + msg.netfn = IPMI_NETFN_SENSOR_EVENT_REQUEST; + msg.cmd = IPMI_GET_EVENT_RECEIVER_CMD; msg.data = NULL; msg.data_len = 0; - intf->null_user_handler = device_id_fetcher; + intf->null_user_handler = event_receiver_fetcher; ipmi_panic_request_and_wait(intf, &addr, &msg); + } + intf->null_user_handler = NULL; - if (intf->local_event_generator) { - /* Request the event receiver from the local MC. */ - msg.netfn = IPMI_NETFN_SENSOR_EVENT_REQUEST; - msg.cmd = IPMI_GET_EVENT_RECEIVER_CMD; - msg.data = NULL; - msg.data_len = 0; - intf->null_user_handler = event_receiver_fetcher; - ipmi_panic_request_and_wait(intf, &addr, &msg); - } - intf->null_user_handler = NULL; + /* + * Validate the event receiver. The low bit must not + * be 1 (it must be a valid IPMB address), it cannot + * be zero, and it must not be my address. + */ + if (((intf->event_receiver & 1) == 0) + && (intf->event_receiver != 0) + && (intf->event_receiver != intf->addrinfo[0].address)) { + /* + * The event receiver is valid, send an IPMB + * message. + */ + ipmb = (struct ipmi_ipmb_addr *) &addr; + ipmb->addr_type = IPMI_IPMB_ADDR_TYPE; + ipmb->channel = 0; /* FIXME - is this right? */ + ipmb->lun = intf->event_receiver_lun; + ipmb->slave_addr = intf->event_receiver; + } else if (intf->local_sel_device) { + /* + * The event receiver was not valid (or was + * me), but I am an SEL device, just dump it + * in my SEL. + */ + si = (struct ipmi_system_interface_addr *) &addr; + si->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; + si->channel = IPMI_BMC_CHANNEL; + si->lun = 0; + } else + return; /* No where to send the event. */ + msg.netfn = IPMI_NETFN_STORAGE_REQUEST; /* Storage. */ + msg.cmd = IPMI_ADD_SEL_ENTRY_CMD; + msg.data = data; + msg.data_len = 16; + + j = 0; + while (*p) { + int size = strlen(p); + + if (size > 11) + size = 11; + data[0] = 0; + data[1] = 0; + data[2] = 0xf0; /* OEM event without timestamp. */ + data[3] = intf->addrinfo[0].address; + data[4] = j++; /* sequence # */ /* - * Validate the event receiver. The low bit must not - * be 1 (it must be a valid IPMB address), it cannot - * be zero, and it must not be my address. + * Always give 11 bytes, so strncpy will fill + * it with zeroes for me. */ - if (((intf->event_receiver & 1) == 0) - && (intf->event_receiver != 0) - && (intf->event_receiver != intf->addrinfo[0].address)) { - /* - * The event receiver is valid, send an IPMB - * message. - */ - ipmb = (struct ipmi_ipmb_addr *) &addr; - ipmb->addr_type = IPMI_IPMB_ADDR_TYPE; - ipmb->channel = 0; /* FIXME - is this right? */ - ipmb->lun = intf->event_receiver_lun; - ipmb->slave_addr = intf->event_receiver; - } else if (intf->local_sel_device) { - /* - * The event receiver was not valid (or was - * me), but I am an SEL device, just dump it - * in my SEL. - */ - si = (struct ipmi_system_interface_addr *) &addr; - si->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; - si->channel = IPMI_BMC_CHANNEL; - si->lun = 0; - } else - continue; /* No where to send the event. */ - - msg.netfn = IPMI_NETFN_STORAGE_REQUEST; /* Storage. */ - msg.cmd = IPMI_ADD_SEL_ENTRY_CMD; - msg.data = data; - msg.data_len = 16; - - j = 0; - while (*p) { - int size = strlen(p); - - if (size > 11) - size = 11; - data[0] = 0; - data[1] = 0; - data[2] = 0xf0; /* OEM event without timestamp. */ - data[3] = intf->addrinfo[0].address; - data[4] = j++; /* sequence # */ - /* - * Always give 11 bytes, so strncpy will fill - * it with zeroes for me. 
- */ - strncpy(data+5, p, 11); - p += size; + strncpy(data+5, p, 11); + p += size; - ipmi_panic_request_and_wait(intf, &addr, &msg); - } + ipmi_panic_request_and_wait(intf, &addr, &msg); } } @@ -5125,6 +5109,7 @@ static int panic_event(struct notifier_block *this, void *ptr) { ipmi_smi_t intf; + ipmi_user_t user; if (has_panicked) return NOTIFY_DONE; @@ -5132,10 +5117,13 @@ static int panic_event(struct notifier_block *this, /* For every registered interface, set it to run to completion. */ list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - if (!intf->handlers) + if (!intf->handlers || intf->intf_num == -1) /* Interface is not ready. */ continue; + if (!intf->handlers->poll) + continue; + /* * If we were interrupted while locking xmit_msgs_lock or * waiting_rcv_msgs_lock, the corresponding list may be @@ -5157,9 +5145,15 @@ static int panic_event(struct notifier_block *this, if (intf->handlers->set_run_to_completion) intf->handlers->set_run_to_completion(intf->send_info, 1); - } - send_panic_events(ptr); + list_for_each_entry_rcu(user, &intf->users, link) { + if (user->handler->ipmi_panic_handler) + user->handler->ipmi_panic_handler( + user->handler_data); + } + + send_panic_events(intf, ptr); + } return NOTIFY_DONE; } diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 8b0626cec980..39a29fb3131b 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -77,6 +77,12 @@ struct ipmi_user_hndl { /* Called when the interface detects a watchdog pre-timeout. If this is NULL, it will be ignored for the user. */ void (*ipmi_watchdog_pretimeout)(void *handler_data); + + /* + * If not NULL, called at panic time after the interface has + * been set up to handle run to completion. + */ + void (*ipmi_panic_handler)(void *handler_data); }; /* Create a new user of the IPMI layer on the given interface number. */ -- cgit v1.2.3 From 6dc1181f9fbcf7ba0e62adfaea41666f00ee9d18 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 4 Apr 2018 08:54:05 -0500 Subject: ipmi: Clean up comments in include files. Make the comments correct and consistent. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 105 ++++++++++++++++++++++++++----------------- include/linux/ipmi_smi.h | 115 ++++++++++++++++++++++++++++------------------- 2 files changed, 134 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 39a29fb3131b..3474f04cf9aa 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -23,8 +23,10 @@ struct module; struct device; -/* Opaque type for a IPMI message user. One of these is needed to - send and receive messages. */ +/* + * Opaque type for a IPMI message user. One of these is needed to + * send and receive messages. + */ typedef struct ipmi_user *ipmi_user_t; /* @@ -37,8 +39,10 @@ typedef struct ipmi_user *ipmi_user_t; struct ipmi_recv_msg { struct list_head link; - /* The type of message as defined in the "Receive Types" - defines above. */ + /* + * The type of message as defined in the "Receive Types" + * defines above. + */ int recv_type; ipmi_user_t user; @@ -46,19 +50,25 @@ struct ipmi_recv_msg { long msgid; struct kernel_ipmi_msg msg; - /* The user_msg_data is the data supplied when a message was - sent, if this is a response to a sent message. If this is - not a response to a sent message, then user_msg_data will - be NULL. If the user above is NULL, then this will be the - intf. */ + /* + * The user_msg_data is the data supplied when a message was + * sent, if this is a response to a sent message. 
If this is + * not a response to a sent message, then user_msg_data will + * be NULL. If the user above is NULL, then this will be the + * intf. + */ void *user_msg_data; - /* Call this when done with the message. It will presumably free - the message and do any other necessary cleanup. */ + /* + * Call this when done with the message. It will presumably free + * the message and do any other necessary cleanup. + */ void (*done)(struct ipmi_recv_msg *msg); - /* Place-holder for the data, don't make any assumptions about - the size or existence of this, since it may change. */ + /* + * Place-holder for the data, don't make any assumptions about + * the size or existence of this, since it may change. + */ unsigned char msg_data[IPMI_MAX_MSG_LENGTH]; }; @@ -66,16 +76,20 @@ struct ipmi_recv_msg { void ipmi_free_recv_msg(struct ipmi_recv_msg *msg); struct ipmi_user_hndl { - /* Routine type to call when a message needs to be routed to - the upper layer. This will be called with some locks held, - the only IPMI routines that can be called are ipmi_request - and the alloc/free operations. The handler_data is the - variable supplied when the receive handler was registered. */ + /* + * Routine type to call when a message needs to be routed to + * the upper layer. This will be called with some locks held, + * the only IPMI routines that can be called are ipmi_request + * and the alloc/free operations. The handler_data is the + * variable supplied when the receive handler was registered. + */ void (*ipmi_recv_hndl)(struct ipmi_recv_msg *msg, void *user_msg_data); - /* Called when the interface detects a watchdog pre-timeout. If - this is NULL, it will be ignored for the user. */ + /* + * Called when the interface detects a watchdog pre-timeout. If + * this is NULL, it will be ignored for the user. + */ void (*ipmi_watchdog_pretimeout)(void *handler_data); /* @@ -91,12 +105,14 @@ int ipmi_create_user(unsigned int if_num, void *handler_data, ipmi_user_t *user); -/* Destroy the given user of the IPMI layer. Note that after this - function returns, the system is guaranteed to not call any - callbacks for the user. Thus as long as you destroy all the users - before you unload a module, you will be safe. And if you destroy - the users before you destroy the callback structures, it should be - safe, too. */ +/* + * Destroy the given user of the IPMI layer. Note that after this + * function returns, the system is guaranteed to not call any + * callbacks for the user. Thus as long as you destroy all the users + * before you unload a module, you will be safe. And if you destroy + * the users before you destroy the callback structures, it should be + * safe, too. + */ int ipmi_destroy_user(ipmi_user_t user); /* Get the IPMI version of the BMC we are talking to. */ @@ -104,12 +120,15 @@ int ipmi_get_version(ipmi_user_t user, unsigned char *major, unsigned char *minor); -/* Set and get the slave address and LUN that we will use for our - source messages. Note that this affects the interface, not just - this user, so it will affect all users of this interface. This is - so some initialization code can come in and do the OEM-specific - things it takes to determine your address (if not the BMC) and set - it for everyone else. Note that each channel can have its own address. */ +/* + * Set and get the slave address and LUN that we will use for our + * source messages. Note that this affects the interface, not just + * this user, so it will affect all users of this interface. 
This is + * so some initialization code can come in and do the OEM-specific + * things it takes to determine your address (if not the BMC) and set + * it for everyone else. Note that each channel can have its own + * address. + */ int ipmi_set_my_address(ipmi_user_t user, unsigned int channel, unsigned char address); @@ -235,14 +254,18 @@ int ipmi_set_gets_events(ipmi_user_t user, bool val); struct ipmi_smi_watcher { struct list_head link; - /* You must set the owner to the current module, if you are in - a module (generally just set it to "THIS_MODULE"). */ + /* + * You must set the owner to the current module, if you are in + * a module (generally just set it to "THIS_MODULE"). + */ struct module *owner; - /* These two are called with read locks held for the interface - the watcher list. So you can add and remove users from the - IPMI interface, send messages, etc., but you cannot add - or remove SMI watchers or SMI interfaces. */ + /* + * These two are called with read locks held for the interface + * the watcher list. So you can add and remove users from the + * IPMI interface, send messages, etc., but you cannot add + * or remove SMI watchers or SMI interfaces. + */ void (*new_smi)(int if_num, struct device *dev); void (*smi_gone)(int if_num); }; @@ -250,8 +273,10 @@ struct ipmi_smi_watcher { int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher); int ipmi_smi_watcher_unregister(struct ipmi_smi_watcher *watcher); -/* The following are various helper functions for dealing with IPMI - addresses. */ +/* + * The following are various helper functions for dealing with IPMI + * addresses. + */ /* Return the maximum length of an IPMI address given it's type. */ unsigned int ipmi_addr_length(int addr_type); diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index af457b5a689e..9e5c3079d232 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -22,8 +22,10 @@ struct device; -/* This files describes the interface for IPMI system management interface - drivers to bind into the IPMI message handler. */ +/* + * This files describes the interface for IPMI system management interface + * drivers to bind into the IPMI message handler. + */ /* Structure for the low-level drivers. */ typedef struct ipmi_smi *ipmi_smi_t; @@ -61,10 +63,12 @@ struct ipmi_smi_msg { struct ipmi_smi_handlers { struct module *owner; - /* The low-level interface cannot start sending messages to - the upper layer until this function is called. This may - not be NULL, the lower layer must take the interface from - this call. */ + /* + * The low-level interface cannot start sending messages to + * the upper layer until this function is called. This may + * not be NULL, the lower layer must take the interface from + * this call. + */ int (*start_processing)(void *send_info, ipmi_smi_t new_intf); @@ -75,25 +79,31 @@ struct ipmi_smi_handlers { */ int (*get_smi_info)(void *send_info, struct ipmi_smi_info *data); - /* Called to enqueue an SMI message to be sent. This - operation is not allowed to fail. If an error occurs, it - should report back the error in a received message. It may - do this in the current call context, since no write locks - are held when this is run. Message are delivered one at - a time by the message handler, a new message will not be - delivered until the previous message is returned. */ + /* + * Called to enqueue an SMI message to be sent. This + * operation is not allowed to fail. If an error occurs, it + * should report back the error in a received message. 
It may + * do this in the current call context, since no write locks + * are held when this is run. Message are delivered one at + * a time by the message handler, a new message will not be + * delivered until the previous message is returned. + */ void (*sender)(void *send_info, struct ipmi_smi_msg *msg); - /* Called by the upper layer to request that we try to get - events from the BMC we are attached to. */ + /* + * Called by the upper layer to request that we try to get + * events from the BMC we are attached to. + */ void (*request_events)(void *send_info); - /* Called by the upper layer when some user requires that the - interface watch for events, received messages, watchdog - pretimeouts, or not. Used by the SMI to know if it should - watch for these. This may be NULL if the SMI does not - implement it. */ + /* + * Called by the upper layer when some user requires that the + * interface watch for events, received messages, watchdog + * pretimeouts, or not. Used by the SMI to know if it should + * watch for these. This may be NULL if the SMI does not + * implement it. + */ void (*set_need_watch)(void *send_info, bool enable); /* @@ -101,28 +111,36 @@ struct ipmi_smi_handlers { */ void (*flush_messages)(void *send_info); - /* Called when the interface should go into "run to - completion" mode. If this call sets the value to true, the - interface should make sure that all messages are flushed - out and that none are pending, and any new requests are run - to completion immediately. */ + /* + * Called when the interface should go into "run to + * completion" mode. If this call sets the value to true, the + * interface should make sure that all messages are flushed + * out and that none are pending, and any new requests are run + * to completion immediately. + */ void (*set_run_to_completion)(void *send_info, bool run_to_completion); - /* Called to poll for work to do. This is so upper layers can - poll for operations during things like crash dumps. */ + /* + * Called to poll for work to do. This is so upper layers can + * poll for operations during things like crash dumps. + */ void (*poll)(void *send_info); - /* Enable/disable firmware maintenance mode. Note that this - is *not* the modes defined, this is simply an on/off - setting. The message handler does the mode handling. Note - that this is called from interrupt context, so it cannot - block. */ + /* + * Enable/disable firmware maintenance mode. Note that this + * is *not* the modes defined, this is simply an on/off + * setting. The message handler does the mode handling. Note + * that this is called from interrupt context, so it cannot + * block. + */ void (*set_maintenance_mode)(void *send_info, bool enable); - /* Tell the handler that we are using it/not using it. The - message handler get the modules that this handler belongs - to; this function lets the SMI claim any modules that it - uses. These may be NULL if this is not required. */ + /* + * Tell the handler that we are using it/not using it. The + * message handler get the modules that this handler belongs + * to; this function lets the SMI claim any modules that it + * uses. These may be NULL if this is not required. 
+ */ int (*inc_usecount)(void *send_info); void (*dec_usecount)(void *send_info); }; @@ -143,7 +161,8 @@ struct ipmi_device_id { #define ipmi_version_major(v) ((v)->ipmi_version & 0xf) #define ipmi_version_minor(v) ((v)->ipmi_version >> 4) -/* Take a pointer to an IPMI response and extract device id information from +/* + * Take a pointer to an IPMI response and extract device id information from * it. @netfn is in the IPMI_NETFN_ format, so may need to be shifted from * a SI response. */ @@ -187,12 +206,14 @@ static inline int ipmi_demangle_device_id(uint8_t netfn, uint8_t cmd, return 0; } -/* Add a low-level interface to the IPMI driver. Note that if the - interface doesn't know its slave address, it should pass in zero. - The low-level interface should not deliver any messages to the - upper layer until the start_processing() function in the handlers - is called, and the lower layer must get the interface from that - call. */ +/* + * Add a low-level interface to the IPMI driver. Note that if the + * interface doesn't know its slave address, it should pass in zero. + * The low-level interface should not deliver any messages to the + * upper layer until the start_processing() function in the handlers + * is called, and the lower layer must get the interface from that + * call. + */ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, void *send_info, struct device *dev, @@ -223,9 +244,11 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) } #ifdef CONFIG_IPMI_PROC_INTERFACE -/* Allow the lower layer to add things to the proc filesystem - directory for this interface. Note that the entry will - automatically be dstroyed when the interface is destroyed. */ +/* + * Allow the lower layer to add things to the proc filesystem + * directory for this interface. Note that the entry will + * automatically be dstroyed when the interface is destroyed. + */ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, const struct file_operations *proc_ops, void *data); -- cgit v1.2.3 From b7780dab90e8da302bc7d0d1b39b538b822017e8 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 5 Apr 2018 16:44:12 -0500 Subject: ipmi: Add shutdown functions for users and interfaces Since things that IPMI uses can be hot-swapped, the users and interfaces really need to be able to handle this. Add the functions so the users and interfaces can implement them, the actual function will be added after everything is ready. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 8 ++++++++ include/linux/ipmi_smi.h | 6 ++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 3474f04cf9aa..d89bea1e457a 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -97,6 +97,14 @@ struct ipmi_user_hndl { * been set up to handle run to completion. */ void (*ipmi_panic_handler)(void *handler_data); + + /* + * Called when the interface has been removed. After this returns + * the user handle will be invalid. The interface may or may + * not be usable when this is called, but it will return errors + * if it is not usable. + */ + void (*shutdown)(void *handler_data); }; /* Create a new user of the IPMI layer on the given interface number. 
*/ diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 9e5c3079d232..bdcda4741c89 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -72,6 +72,12 @@ struct ipmi_smi_handlers { int (*start_processing)(void *send_info, ipmi_smi_t new_intf); + /* + * When called, the low-level interface should disable all + * processing, it should be complete shut down when it returns. + */ + void (*shutdown)(void *send_info); + /* * Get the detailed private info of the low level interface and store * it into the structure of ipmi_smi_data. For example: the -- cgit v1.2.3 From 8eb005bf6e58ca08802ac8170e5d3c8486cb2d79 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 5 Apr 2018 22:10:08 -0500 Subject: ipmi: Remove usecount function from interfaces All the users are now gone. Signed-off-by: Corey Minyard --- include/linux/ipmi_smi.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index bdcda4741c89..16662b0423bf 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -140,15 +140,6 @@ struct ipmi_smi_handlers { * block. */ void (*set_maintenance_mode)(void *send_info, bool enable); - - /* - * Tell the handler that we are using it/not using it. The - * message handler get the modules that this handler belongs - * to; this function lets the SMI claim any modules that it - * uses. These may be NULL if this is not required. - */ - int (*inc_usecount)(void *send_info); - void (*dec_usecount)(void *send_info); }; struct ipmi_device_id { -- cgit v1.2.3 From 6a0d23ed338ed7015128378e0ceec03eaa3d91e2 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 11 Apr 2018 12:41:33 -0500 Subject: ipmi: ipmi_unregister_smi() cannot fail, have it return void Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 4 +--- drivers/char/ipmi/ipmi_si_intf.c | 5 +---- drivers/char/ipmi/ipmi_ssif.c | 4 +--- include/linux/ipmi_smi.h | 2 +- 4 files changed, 4 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 7ddadab65f33..946bfcb1eeee 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -3711,7 +3711,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) } } -int ipmi_unregister_smi(struct ipmi_smi *intf) +void ipmi_unregister_smi(struct ipmi_smi *intf) { struct ipmi_smi_watcher *w; int intf_num = intf->intf_num, index; @@ -3755,8 +3755,6 @@ int ipmi_unregister_smi(struct ipmi_smi *intf) cleanup_srcu_struct(&intf->users_srcu); kref_put(&intf->refcount, intf_free); - - return 0; } EXPORT_SYMBOL(ipmi_unregister_smi); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 00a324060dcd..2222caf4bab7 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2365,16 +2365,13 @@ static void shutdown_smi(void *send_info) static void shutdown_one_si(struct smi_info *smi_info) { - int rv; ipmi_smi_t intf = smi_info->intf; if (!intf) return; smi_info->intf = NULL; - rv = ipmi_unregister_smi(intf); - if (rv) - pr_err(PFX "Unable to unregister device: errno=%d\n", rv); + ipmi_unregister_smi(intf); } static void cleanup_one_si(struct smi_info *smi_info) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 8c72f271d4b4..17cae7a41b70 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1247,9 +1247,7 @@ static int ssif_remove(struct 
i2c_client *client) */ intf = ssif_info->intf; ssif_info->intf = NULL; - rv = ipmi_unregister_smi(intf); - if (rv) - pr_err(PFX "Unable to unregister device: errno=%d\n", rv); + ipmi_unregister_smi(intf); list_for_each_entry(addr_info, &ssif_infos, link) { if (addr_info->client == client) { diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 16662b0423bf..26ba57c307f0 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -220,7 +220,7 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, * Remove a low-level interface from the IPMI driver. This will * return an error if the interface is still in use by a user. */ -int ipmi_unregister_smi(ipmi_smi_t intf); +void ipmi_unregister_smi(ipmi_smi_t intf); /* * The lower layer reports received messages through this interface. -- cgit v1.2.3 From 5ce1a7dc806efb2181e93d4a088b281c4cff4eaa Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 11 Apr 2018 13:11:54 -0500 Subject: ipmi: Get rid of ipmi_user_t and ipmi_smi_t in include files Convert over to struct ipmi_user * and struct ipmi_smi *. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 34 +++++++++++++++++----------------- include/linux/ipmi_smi.h | 12 ++++++------ 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index d89bea1e457a..41f5c086f670 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -45,7 +45,7 @@ struct ipmi_recv_msg { */ int recv_type; - ipmi_user_t user; + struct ipmi_user *user; struct ipmi_addr addr; long msgid; struct kernel_ipmi_msg msg; @@ -111,7 +111,7 @@ struct ipmi_user_hndl { int ipmi_create_user(unsigned int if_num, const struct ipmi_user_hndl *handler, void *handler_data, - ipmi_user_t *user); + struct ipmi_user **user); /* * Destroy the given user of the IPMI layer. Note that after this @@ -121,10 +121,10 @@ int ipmi_create_user(unsigned int if_num, * the users before you destroy the callback structures, it should be * safe, too. */ -int ipmi_destroy_user(ipmi_user_t user); +int ipmi_destroy_user(struct ipmi_user *user); /* Get the IPMI version of the BMC we are talking to. */ -int ipmi_get_version(ipmi_user_t user, +int ipmi_get_version(struct ipmi_user *user, unsigned char *major, unsigned char *minor); @@ -137,16 +137,16 @@ int ipmi_get_version(ipmi_user_t user, * it for everyone else. Note that each channel can have its own * address. */ -int ipmi_set_my_address(ipmi_user_t user, +int ipmi_set_my_address(struct ipmi_user *user, unsigned int channel, unsigned char address); -int ipmi_get_my_address(ipmi_user_t user, +int ipmi_get_my_address(struct ipmi_user *user, unsigned int channel, unsigned char *address); -int ipmi_set_my_LUN(ipmi_user_t user, +int ipmi_set_my_LUN(struct ipmi_user *user, unsigned int channel, unsigned char LUN); -int ipmi_get_my_LUN(ipmi_user_t user, +int ipmi_get_my_LUN(struct ipmi_user *user, unsigned int channel, unsigned char *LUN); @@ -163,7 +163,7 @@ int ipmi_get_my_LUN(ipmi_user_t user, * it makes no sense to do it here. However, this can be used if you * have unusual requirements. */ -int ipmi_request_settime(ipmi_user_t user, +int ipmi_request_settime(struct ipmi_user *user, struct ipmi_addr *addr, long msgid, struct kernel_ipmi_msg *msg, @@ -181,7 +181,7 @@ int ipmi_request_settime(ipmi_user_t user, * change as the system changes, so don't use it unless you REALLY * have to. 
*/ -int ipmi_request_supply_msgs(ipmi_user_t user, +int ipmi_request_supply_msgs(struct ipmi_user *user, struct ipmi_addr *addr, long msgid, struct kernel_ipmi_msg *msg, @@ -197,7 +197,7 @@ int ipmi_request_supply_msgs(ipmi_user_t user, * way. This is useful if you need to spin waiting for something to * happen in the IPMI driver. */ -void ipmi_poll_interface(ipmi_user_t user); +void ipmi_poll_interface(struct ipmi_user *user); /* * When commands come in to the SMS, the user can register to receive @@ -208,11 +208,11 @@ void ipmi_poll_interface(ipmi_user_t user); * error. Channels are specified as a bitfield, use IPMI_CHAN_ALL to * mean all channels. */ -int ipmi_register_for_cmd(ipmi_user_t user, +int ipmi_register_for_cmd(struct ipmi_user *user, unsigned char netfn, unsigned char cmd, unsigned int chans); -int ipmi_unregister_for_cmd(ipmi_user_t user, +int ipmi_unregister_for_cmd(struct ipmi_user *user, unsigned char netfn, unsigned char cmd, unsigned int chans); @@ -243,8 +243,8 @@ int ipmi_unregister_for_cmd(ipmi_user_t user, * * See the IPMI_MAINTENANCE_MODE_xxx defines for what the mode means. */ -int ipmi_get_maintenance_mode(ipmi_user_t user); -int ipmi_set_maintenance_mode(ipmi_user_t user, int mode); +int ipmi_get_maintenance_mode(struct ipmi_user *user); +int ipmi_set_maintenance_mode(struct ipmi_user *user, int mode); /* * When the user is created, it will not receive IPMI events by @@ -252,7 +252,7 @@ int ipmi_set_maintenance_mode(ipmi_user_t user, int mode); * The first user that sets this to TRUE will receive all events that * have been queued while no one was waiting for events. */ -int ipmi_set_gets_events(ipmi_user_t user, bool val); +int ipmi_set_gets_events(struct ipmi_user *user, bool val); /* * Called when a new SMI is registered. This will also be called on @@ -330,7 +330,7 @@ struct ipmi_smi_info { union ipmi_smi_info_union addr_info; }; -/* This is to get the private info of ipmi_smi_t */ +/* This is to get the private info of struct ipmi_smi */ extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); #endif /* __LINUX_IPMI_H */ diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 26ba57c307f0..0d438662a821 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -69,8 +69,8 @@ struct ipmi_smi_handlers { * not be NULL, the lower layer must take the interface from * this call. */ - int (*start_processing)(void *send_info, - ipmi_smi_t new_intf); + int (*start_processing)(void *send_info, + struct ipmi_smi *new_intf); /* * When called, the low-level interface should disable all @@ -220,7 +220,7 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, * Remove a low-level interface from the IPMI driver. This will * return an error if the interface is still in use by a user. */ -void ipmi_unregister_smi(ipmi_smi_t intf); +void ipmi_unregister_smi(struct ipmi_smi *intf); /* * The lower layer reports received messages through this interface. @@ -228,11 +228,11 @@ void ipmi_unregister_smi(ipmi_smi_t intf); * the lower layer gets an error sending a message, it should format * an error response in the message response. */ -void ipmi_smi_msg_received(ipmi_smi_t intf, +void ipmi_smi_msg_received(struct ipmi_smi *intf, struct ipmi_smi_msg *msg); /* The lower layer received a watchdog pre-timeout on interface. 
*/ -void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf); +void ipmi_smi_watchdog_pretimeout(struct ipmi_smi *intf); struct ipmi_smi_msg *ipmi_alloc_smi_msg(void); static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) @@ -246,7 +246,7 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) * directory for this interface. Note that the entry will * automatically be dstroyed when the interface is destroyed. */ -int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, +int ipmi_smi_add_proc_entry(struct ipmi_smi *smi, char *name, const struct file_operations *proc_ops, void *data); #endif -- cgit v1.2.3 From b32cc5b9a346319c171e3ad905e0cddda032b5eb Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Tue, 17 Apr 2018 21:42:13 -0700 Subject: bpf: adding bpf_xdp_adjust_tail helper Add a new BPF helper that allows manipulating xdp's data_end pointer so that a packet's size can be reduced. The intended use case is generating ICMP messages from XDP context, where such a message would contain the truncated original packet. Signed-off-by: Nikita V. Shirokov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 10 +++++++++- net/core/filter.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c5ec89732a8d..9a2d1a04eb24 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -755,6 +755,13 @@ union bpf_attr { * @addr: pointer to struct sockaddr to bind socket to * @addr_len: length of sockaddr structure * Return: 0 on success or negative error code + * + * int bpf_xdp_adjust_tail(xdp_md, delta) + * Adjust the xdp_md.data_end by delta. Only shrinking of packet's + * size is supported. + * @xdp_md: pointer to xdp_md + * @delta: A negative integer to be added to xdp_md.data_end + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -821,7 +828,8 @@ union bpf_attr { FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ FN(msg_pull_data), \ - FN(bind), + FN(bind), \ + FN(xdp_adjust_tail), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index a374b8560bc4..29318598fd60 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2725,6 +2725,30 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ + void *data_end = xdp->data_end + offset; + + /* only shrinking is allowed for now. */
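+ /* (Note added in editing, not in the original patch: offset must be
+  * strictly negative here, so data_end only ever moves toward data
+  * and the packet shrinks; growing the tail would need tailroom
+  * handling that this helper does not attempt. The ETH_HLEN check
+  * below keeps at least a full Ethernet header in the packet.) */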
+ if (unlikely(offset >= 0)) + return -EINVAL; + + if (unlikely(data_end < xdp->data + ETH_HLEN)) + return -EINVAL; + + xdp->data_end = data_end; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { + .func = bpf_xdp_adjust_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *meta = xdp->data_meta + offset; @@ -3074,7 +3098,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data) + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail) return true; return false; @@ -3888,6 +3913,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; + case BPF_FUNC_xdp_adjust_tail: + return &bpf_xdp_adjust_tail_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From b6240a4df0186c03e5ffff6f61570ed31a1a5172 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 26 Mar 2018 17:27:41 +0800 Subject: scsi: libsas: add transport class for ATA devices ATA devices attached via a SAS controller currently have no transport class, so no information about these devices appears in /sys/class/ata_port (or ata_link or ata_device). Add a transport class for the ATA devices attached to a SAS controller. The /sys/class directory will then show the information for these ATA devices as follows:

localhost:/sys/class # ls ata*
ata_device:
dev1.0  dev2.0

ata_link:
link1  link2

ata_port:
ata1  ata2

There is no functional change to the device scanning and I/O path. The ATA transport class is deleted when the SAS devices are destroyed. Signed-off-by: Jason Yan CC: Dan Williams CC: Tejun Heo Acked-by: Tejun Heo Signed-off-by: Martin K. 
Petersen --- drivers/ata/libata-scsi.c | 12 ++++++++++++ drivers/scsi/libsas/sas_ata.c | 5 +++++ drivers/scsi/libsas/sas_discover.c | 1 + include/linux/libata.h | 2 ++ 4 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 89a9d4a2efc8..1c9f80fbc51c 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -5051,6 +5051,18 @@ int ata_sas_port_init(struct ata_port *ap) } EXPORT_SYMBOL_GPL(ata_sas_port_init); +int ata_sas_tport_add(struct device *parent, struct ata_port *ap) +{ + return ata_tport_add(parent, ap); +} +EXPORT_SYMBOL_GPL(ata_sas_tport_add); + +void ata_sas_tport_delete(struct ata_port *ap) +{ + ata_tport_delete(ap); +} +EXPORT_SYMBOL_GPL(ata_sas_tport_delete); + /** * ata_sas_port_destroy - Destroy a SATA port allocated by ata_sas_port_alloc * @ap: SATA port to destroy diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 0cc1567eacc1..ff1d612f6fb9 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -577,6 +577,11 @@ int sas_ata_init(struct domain_device *found_dev) ata_sas_port_destroy(ap); return rc; } + rc = ata_sas_tport_add(found_dev->sata_dev.ata_host.dev, ap); + if (rc) { + ata_sas_port_destroy(ap); + return rc; + } found_dev->sata_dev.ap = ap; return 0; diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index a0fa7ef3a071..1ffca28fe6a8 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -314,6 +314,7 @@ void sas_free_device(struct kref *kref) kfree(dev->ex_dev.ex_phy); if (dev_is_sata(dev) && dev->sata_dev.ap) { + ata_sas_tport_delete(dev->sata_dev.ap); ata_sas_port_destroy(dev->sata_dev.ap); dev->sata_dev.ap = NULL; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 1795fecdea17..0619ebf4d475 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1130,6 +1130,8 @@ extern void ata_sas_async_probe(struct ata_port *ap); extern int ata_sas_sync_probe(struct ata_port *ap); extern int ata_sas_port_init(struct ata_port *); extern int ata_sas_port_start(struct ata_port *ap); +extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap); +extern void ata_sas_tport_delete(struct ata_port *ap); extern void ata_sas_port_stop(struct ata_port *ap); extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *); extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); -- cgit v1.2.3 From 63273cb40101b6f303a5493f1bdf629d4ab3746b Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 27 Mar 2018 17:48:38 -0700 Subject: scsi: vmbus: Add function to report available ring buffer to write in total ring size percentage Netvsc has a function to calculate how much ring buffer in percentage is available to write. This function is also useful for storvsc and other vmbus devices. Define a similar function in vmbus to be used by other vmbus devices. Signed-off-by: Long Li Acked-by: David S. Miller Signed-off-by: Martin K. 
Petersen --- drivers/hv/ring_buffer.c | 2 ++ include/linux/hyperv.h | 12 ++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 8699bb969e7e..3c836c099a8f 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -227,6 +227,8 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer->feature_bits.value = 1; ring_info->ring_size = page_cnt << PAGE_SHIFT; + ring_info->ring_size_div10_reciprocal = + reciprocal_value(ring_info->ring_size / 10); ring_info->ring_datasize = ring_info->ring_size - sizeof(struct hv_ring_buffer); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 192ed8fbc403..9ac954ee577e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -35,6 +35,7 @@ #include #include #include +#include #define MAX_PAGE_BUFFER_COUNT 32 #define MAX_MULTIPAGE_BUFFER_COUNT 32 /* 128K */ @@ -120,6 +121,7 @@ struct hv_ring_buffer { struct hv_ring_buffer_info { struct hv_ring_buffer *ring_buffer; u32 ring_size; /* Include the shared header */ + struct reciprocal_value ring_size_div10_reciprocal; spinlock_t ring_lock; u32 ring_datasize; /* < ring_size */ @@ -154,6 +156,16 @@ static inline u32 hv_get_bytes_to_write(const struct hv_ring_buffer_info *rbi) return write; } +static inline u32 hv_get_avail_to_write_percent( + const struct hv_ring_buffer_info *rbi) +{ + u32 avail_write = hv_get_bytes_to_write(rbi); + + return reciprocal_divide( + (avail_write << 3) + (avail_write << 1), + rbi->ring_size_div10_reciprocal); +} + /* * VMBUS version is 32 bit entity broken up into * two 16 bit quantities: major_number. minor_number. -- cgit v1.2.3 From a2dd6877b43ef14129f258910d60b2e81b32100b Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Tue, 17 Apr 2018 17:30:31 -0400 Subject: soc: ti: K2G: provide APIs to support driver probe deferral This patch provide APIs to allow client drivers to support probe deferral. On K2G SoC, devices can be probed only after the ti_sci_pm_domains driver is probed and ready. As drivers may get probed at different order, any driver that depends on knav dma and qmss drivers, for example netcp network driver, needs to defer probe until knav devices are probed and ready to service. To do this, add an API to query the device ready status from the knav dma and qmss devices. Signed-off-by: Murali Karicheri Signed-off-by: David S. 
Miller --- drivers/soc/ti/knav_dma.c | 8 ++++++++ drivers/soc/ti/knav_qmss_queue.c | 8 ++++++++ include/linux/soc/ti/knav_dma.h | 12 ++++++++++++ include/linux/soc/ti/knav_qmss.h | 1 + 4 files changed, 29 insertions(+) (limited to 'include') diff --git a/drivers/soc/ti/knav_dma.c b/drivers/soc/ti/knav_dma.c index 026182d3b27c..224d7ddeeb76 100644 --- a/drivers/soc/ti/knav_dma.c +++ b/drivers/soc/ti/knav_dma.c @@ -134,6 +134,13 @@ struct knav_dma_chan { static struct knav_dma_pool_device *kdev; +static bool device_ready; +bool knav_dma_device_ready(void) +{ + return device_ready; +} +EXPORT_SYMBOL_GPL(knav_dma_device_ready); + static bool check_config(struct knav_dma_chan *chan, struct knav_dma_cfg *cfg) { if (!memcmp(&chan->cfg, cfg, sizeof(*cfg))) @@ -773,6 +780,7 @@ static int knav_dma_probe(struct platform_device *pdev) debugfs_create_file("knav_dma", S_IFREG | S_IRUGO, NULL, NULL, &knav_dma_debug_ops); + device_ready = true; return ret; } diff --git a/drivers/soc/ti/knav_qmss_queue.c b/drivers/soc/ti/knav_qmss_queue.c index 8526c8ed3af2..419365a8d1c2 100644 --- a/drivers/soc/ti/knav_qmss_queue.c +++ b/drivers/soc/ti/knav_qmss_queue.c @@ -74,6 +74,13 @@ static DEFINE_MUTEX(knav_dev_lock); */ const char *knav_acc_firmwares[] = {"ks2_qmss_pdsp_acc48.bin"}; +static bool device_ready; +bool knav_qmss_device_ready(void) +{ + return device_ready; +} +EXPORT_SYMBOL_GPL(knav_qmss_device_ready); + /** * knav_queue_notify: qmss queue notfier call * @@ -1849,6 +1856,7 @@ static int knav_queue_probe(struct platform_device *pdev) debugfs_create_file("qmss", S_IFREG | S_IRUGO, NULL, NULL, &knav_queue_debug_ops); + device_ready = true; return 0; err: diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index 66693bc4c6ad..7127ec301537 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -167,6 +167,8 @@ struct knav_dma_desc { void *knav_dma_open_channel(struct device *dev, const char *name, struct knav_dma_cfg *config); void knav_dma_close_channel(void *channel); +int knav_dma_get_flow(void *channel); +bool knav_dma_device_ready(void); #else static inline void *knav_dma_open_channel(struct device *dev, const char *name, struct knav_dma_cfg *config) @@ -176,6 +178,16 @@ static inline void *knav_dma_open_channel(struct device *dev, const char *name, static inline void knav_dma_close_channel(void *channel) {} +static inline int knav_dma_get_flow(void *channel) +{ + return -EINVAL; +} + +static inline bool knav_dma_device_ready(void) +{ + return false; +} + #endif #endif /* __SOC_TI_KEYSTONE_NAVIGATOR_DMA_H__ */ diff --git a/include/linux/soc/ti/knav_qmss.h b/include/linux/soc/ti/knav_qmss.h index 9f0ebb3bad27..9745df6ed9d3 100644 --- a/include/linux/soc/ti/knav_qmss.h +++ b/include/linux/soc/ti/knav_qmss.h @@ -86,5 +86,6 @@ int knav_pool_desc_map(void *ph, void *desc, unsigned size, void *knav_pool_desc_unmap(void *ph, dma_addr_t dma, unsigned dma_sz); dma_addr_t knav_pool_desc_virt_to_dma(void *ph, void *virt); void *knav_pool_desc_dma_to_virt(void *ph, dma_addr_t dma); +bool knav_qmss_device_ready(void); #endif /* __SOC_TI_KNAV_QMSS_H__ */ -- cgit v1.2.3 From ce20cdf498bdfa6c6130be75cf4359dad305ebcd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 9 Apr 2018 16:40:34 +0200 Subject: netfilter: xt_NFLOG: use nf_log_packet instead of nfulnl_log_packet. The nfulnl_log_packet() is added to make sure that the NFLOG target works as only user-space logger. 
but now, nf_log_packet() can find proper log function using NF_LOG_TYPE_ULOG and NF_LOG_TYPE_LOG. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nfnetlink_log.h | 17 ----------------- net/netfilter/nfnetlink_log.c | 8 +++----- net/netfilter/xt_NFLOG.c | 15 +++++++++++---- 3 files changed, 14 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h index 612cfb63ac68..ea32a7d3cf1b 100644 --- a/include/net/netfilter/nfnetlink_log.h +++ b/include/net/netfilter/nfnetlink_log.h @@ -1,18 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _KER_NFNETLINK_LOG_H -#define _KER_NFNETLINK_LOG_H - -void -nfulnl_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li_user, - const char *prefix); - -#define NFULNL_COPY_DISABLED 0xff - -#endif /* _KER_NFNETLINK_LOG_H */ - diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 7b46aa4c478d..e5cc4d9b9ce7 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -47,6 +46,7 @@ #include "../bridge/br_private.h" #endif +#define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ @@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = { }; /* log handler for internal netfilter logging api */ -void +static void nfulnl_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, @@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net, struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; - unsigned int plen; + unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; struct nf_conn *ct = NULL; @@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net, if (!inst) return; - plen = 0; if (prefix) plen = strlen(prefix) + 1; @@ -760,7 +759,6 @@ alloc_failure: /* FIXME: statistics */ goto unlock_and_release; } -EXPORT_SYMBOL_GPL(nfulnl_log_packet); static int nfulnl_rcv_nl_event(struct notifier_block *this, diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index c7f8958cea4a..1ed0cac585c4 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -13,7 +13,6 @@ #include #include #include -#include MODULE_AUTHOR("Patrick McHardy "); MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); @@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) if (info->flags & XT_NFLOG_F_COPY_LEN) li.u.ulog.flags |= NF_LOG_F_COPY_LEN; - nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb, - xt_in(par), xt_out(par), &li, info->prefix); + nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), + xt_out(par), &li, "%s", info->prefix); + return XT_CONTINUE; } @@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return 0; + + return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); +} + +static void nflog_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } static struct xt_target nflog_tg_reg __read_mostly = { @@ -58,6 +64,7 @@ static struct 
xt_target nflog_tg_reg __read_mostly = { .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, -- cgit v1.2.3 From 291bfb928863d496e25c785e132a8fbfb32341a8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 19 Apr 2018 12:14:10 +0100 Subject: ASoC: topology: Revert recent changes while boot errors are investigated Krzysztof Kozlowski reported a NULL dereference in _instantiate_card() on Odroid XU3 and XU boards which he bisected to 45f8cb57da0d7 (ASoC: core: Allow topology to override machine driver FE DAI link config). Revert that commit for now, along with f11a5c27f928 (ASoC: core: Add name prefix for machines with topology rewrites) due to dependency issues, in order to keep things booting cleanly in -next. Reported-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- include/sound/soc.h | 12 -------- sound/soc/soc-core.c | 87 ++-------------------------------------------------- sound/soc/soc-pcm.c | 12 -------- 3 files changed, 3 insertions(+), 108 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 3676d0a8f532..ad266d7e9553 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1012,14 +1012,6 @@ struct snd_soc_platform_driver { /* platform stream compress ops */ const struct snd_compr_ops *compr_ops; - - /* this platform uses topology and ignore machine driver FEs */ - const char *ignore_machine; - const char *topology_name_prefix; - int (*be_hw_params_fixup)(struct snd_soc_pcm_runtime *rtd, - struct snd_pcm_hw_params *params); - bool use_dai_pcm_id; /* use the DAI link PCM ID as PCM device number */ - int be_pcm_base; /* base device ID for all BE PCMs */ }; struct snd_soc_dai_link_component { @@ -1126,9 +1118,6 @@ struct snd_soc_dai_link { /* pmdown_time is ignored at stop */ unsigned int ignore_pmdown_time:1; - /* Do not create a PCM for this DAI link (Backend link) */ - unsigned int ignore:1; - struct list_head list; /* DAI link list of the soc card */ struct snd_soc_dobj dobj; /* For topology */ }; @@ -1168,7 +1157,6 @@ struct snd_soc_card { const char *long_name; const char *driver_name; char dmi_longname[80]; - char topology_shortname[32]; struct device *dev; struct snd_card *snd_card; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index aa1c33b3cce0..bf7ca32ab31f 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1050,9 +1050,6 @@ static int soc_bind_dai_link(struct snd_soc_card *card, const char *platform_name; int i; - if (dai_link->ignore) - return 0; - dev_dbg(card->dev, "ASoC: binding %s\n", dai_link->name); if (soc_is_dai_link_bound(card, dai_link)) { @@ -1675,7 +1672,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, { struct snd_soc_dai_link *dai_link = rtd->dai_link; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - int i, ret, num; + int i, ret; dev_dbg(card->dev, "ASoC: probe %s dai link %d late %d\n", card->name, rtd->num, order); @@ -1721,23 +1718,9 @@ static int soc_probe_link_dais(struct snd_soc_card *card, soc_dpcm_debugfs_add(rtd); #endif - /* - * most drivers will register their PCMs using DAI link ordering but - * topology based drivers can use the DAI link id field to set PCM - * device number and then use rtd + a base offset of the BEs. 
- */ - if (rtd->platform->driver->use_dai_pcm_id) { - if (rtd->dai_link->no_pcm) - num = rtd->platform->driver->be_pcm_base + rtd->num; - else - num = rtd->dai_link->id; - } else { - num = rtd->num; - } - if (cpu_dai->driver->compress_new) { /*create compress_device"*/ - ret = cpu_dai->driver->compress_new(rtd, num); + ret = cpu_dai->driver->compress_new(rtd, rtd->num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create compress %s\n", dai_link->stream_name); @@ -1747,7 +1730,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, if (!dai_link->params) { /* create the pcm */ - ret = soc_new_pcm(rtd, num); + ret = soc_new_pcm(rtd, rtd->num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create pcm %s :%d\n", dai_link->stream_name, ret); @@ -2093,67 +2076,6 @@ int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour) EXPORT_SYMBOL_GPL(snd_soc_set_dmi_name); #endif /* CONFIG_DMI */ -static void soc_check_tplg_fes(struct snd_soc_card *card) -{ - struct snd_soc_platform *platform; - struct snd_soc_dai_link *dai_link; - int i; - - list_for_each_entry(platform, &platform_list, list) { - - /* does this platform override FEs ? */ - if (!platform->driver->ignore_machine) - continue; - - /* for this machine ? */ - if (strcmp(platform->driver->ignore_machine, - card->dev->driver->name)) - continue; - - /* machine matches, so override the rtd data */ - for (i = 0; i < card->num_links; i++) { - - dai_link = &card->dai_link[i]; - - /* ignore this FE */ - if (dai_link->dynamic) { - dai_link->ignore = true; - continue; - } - - dev_info(card->dev, "info: override FE DAI link %s\n", - card->dai_link[i].name); - - /* override platform */ - dai_link->platform_name = platform->component.name; - dai_link->cpu_dai_name = platform->component.name; - - /* convert non BE into BE */ - dai_link->no_pcm = 1; - dai_link->dpcm_playback = 1; - dai_link->dpcm_capture = 1; - - /* override any BE fixups */ - dai_link->be_hw_params_fixup = - platform->driver->be_hw_params_fixup; - - /* most BE links dont set stream name, so set it to - * dai link name if it's NULL to help bind widgets. 
- */ - if (!dai_link->stream_name) - dai_link->stream_name = dai_link->name; - } - - /* Inform userspace we are using alternate topology */ - if (platform->driver->topology_name_prefix) { - snprintf(card->topology_shortname, 32, "%s-%s", - platform->driver->topology_name_prefix, - card->name); - card->name = card->topology_shortname; - } - } -} - static int snd_soc_instantiate_card(struct snd_soc_card *card) { struct snd_soc_codec *codec; @@ -2164,9 +2086,6 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) mutex_lock(&client_mutex); mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_INIT); - /* check whether any platform is ignore machine FE and using topology */ - soc_check_tplg_fes(card); - /* bind DAIs */ for (i = 0; i < card->num_links; i++) { ret = soc_bind_dai_link(card, &card->dai_link[i]); diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 4ce489165a6d..68d9dc930096 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -909,20 +909,8 @@ int soc_dai_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) { - struct snd_soc_pcm_runtime *rtd = substream->private_data; int ret; - /* perform any topology hw_params fixups before DAI */ - if (rtd->dai_link->be_hw_params_fixup) { - ret = rtd->dai_link->be_hw_params_fixup(rtd, params); - if (ret < 0) { - dev_err(rtd->dev, - "ASoC: hw_params topology fixup failed %d\n", - ret); - return ret; - } - } - if (dai->driver->ops->hw_params) { ret = dai->driver->ops->hw_params(substream, params, dai); if (ret < 0) { -- cgit v1.2.3 From 5055c67760df8e13c68f73b8534d5792d1d36543 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:24 -0700 Subject: compat: Make compat helpers independent of CONFIG_COMPAT Many of the compat time syscalls are also repurposed as 32 bit native syscalls to provide backward compatibility while adding new y2038 safe syscalls. Enabling the helpers makes this possible. Signed-off-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index 081281ad5772..3f35f4cfb98d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -8,8 +8,6 @@ #include -#ifdef CONFIG_COMPAT - #include #include /* for HZ */ #include @@ -20,9 +18,11 @@ #include #include +#ifdef CONFIG_COMPAT #include #include #include +#endif #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER /* @@ -83,6 +83,8 @@ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* COMPAT_SYSCALL_DEFINEx */ +#ifdef CONFIG_COMPAT + #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() #endif @@ -1016,7 +1018,9 @@ static inline struct compat_timeval ns_to_compat_timeval(s64 nsec) #else /* !CONFIG_COMPAT */ #define is_compat_task() (0) +#ifndef in_compat_syscall static inline bool in_compat_syscall(void) { return false; } +#endif #endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From 2b5a9a37e97467fea67070a94529d6d07e549270 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Mar 2018 16:59:15 +0200 Subject: time: Add an asm-generic/compat.h file We have a couple of files that try to include asm/compat.h on architectures where this is available.
Those should generally use the higher-level linux/compat.h file, but that in turn fails to include asm/compat.h when CONFIG_COMPAT is disabled, unless we can provide that header on all architectures. This adds the asm/compat.h for all remaining architectures to simplify the dependencies. Architectures that are getting removed in linux-4.17 are not changed here, to avoid needless conflicts with the removal patches. Those architectures are broken by this patch, but we have already shown that they have no users. Signed-off-by: Arnd Bergmann --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/c6x/include/asm/Kbuild | 1 + arch/h8300/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/ia64/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/nds32/include/asm/Kbuild | 1 + arch/nios2/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/unicore32/include/asm/Kbuild | 1 + arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/compat.h | 3 +++ include/linux/compat.h | 3 ++- 18 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 include/asm-generic/compat.h (limited to 'include') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 9b68790013e2..0580cb8c84b2 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +generic-y += compat.h generic-y += exec.h generic-y += export.h generic-y += fb.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 4bd5d4369e05..d51bc22e8795 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += bugs.h +generic-y += compat.h generic-y += device.h generic-y += div64.h generic-y += emergency-restart.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 873e3c189279..1d66db9c9db5 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -1,3 +1,4 @@ +generic-y += compat.h generic-y += current.h generic-y += early_ioremap.h generic-y += emergency-restart.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index fd4c840de837..24037486317c 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += atomic.h generic-y += barrier.h generic-y += bugs.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 14bac06b7116..a5d0b2991f47 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += barrier.h generic-y += bugs.h generic-y += cacheflush.h generic-y += checksum.h +generic-y += compat.h generic-y += current.h generic-y += delay.h generic-y += device.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index e9743f689fb8..dd2fd9c0d292 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += barrier.h generic-y += bug.h generic-y += bugs.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 6dd867873364..557bbc8ba9f5 100644 --- a/arch/ia64/include/asm/Kbuild +++ 
b/arch/ia64/include/asm/Kbuild @@ -1,3 +1,4 @@ +generic-y += compat.h generic-y += exec.h generic-y += irq_work.h generic-y += mcs_spinlock.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 88a9d27df1ac..4d8d68c4e3dd 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += compat.h generic-y += device.h generic-y += emergency-restart.h generic-y += exec.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 3c80a5a308ed..fe6a6c6e5003 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += barrier.h generic-y += bitops.h generic-y += bug.h generic-y += bugs.h +generic-y += compat.h generic-y += device.h generic-y += div64.h generic-y += emergency-restart.h diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index 06bdf8167f5a..d1b1f89d44f6 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += checksum.h generic-y += clkdev.h generic-y += cmpxchg.h generic-y += cmpxchg-local.h +generic-y += compat.h generic-y += cputime.h generic-y += device.h generic-y += div64.h diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index d232da2cbb38..64ed3d656956 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += bitops.h generic-y += bug.h generic-y += bugs.h generic-y += cmpxchg.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index f05c722a21f8..65964d390b10 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += barrier.h generic-y += bug.h generic-y += bugs.h generic-y += checksum.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 1efcce74997b..46dd82ab2c29 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -1,3 +1,4 @@ +generic-y += compat.h generic-y += current.h generic-y += delay.h generic-y += div64.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index bb5a196c3061..b10dde6cb793 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += barrier.h generic-y += bpf_perf_event.h generic-y += bug.h +generic-y += compat.h generic-y += current.h generic-y += delay.h generic-y += device.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index 6f70c76c81fc..bfc7abe77905 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -1,5 +1,6 @@ generic-y += atomic.h generic-y += bugs.h +generic-y += compat.h generic-y += current.h generic-y += device.h generic-y += div64.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 436b20337168..e5e1e61c538c 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += bug.h +generic-y += compat.h generic-y += device.h generic-y += div64.h generic-y += dma-contiguous.h diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h new file mode 100644 index 000000000000..28819451b6d1 --- /dev/null +++ b/include/asm-generic/compat.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: 
GPL-2.0 */ + +/* This is an empty stub for 32-bit-only architectures */ diff --git a/include/linux/compat.h b/include/linux/compat.h index 3f35f4cfb98d..48d29b1339be 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -18,8 +18,9 @@ #include #include -#ifdef CONFIG_COMPAT #include + +#ifdef CONFIG_COMPAT #include #include #endif -- cgit v1.2.3 From 0d55303c51a4f35f674617e415632d492b596c26 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:25 -0700 Subject: compat: Move compat_timespec/ timeval to compat_time.h All the current architecture specific defines for these are the same. Refactor these common defines to a common header file. The new common linux/compat_time.h is also useful as it will eventually be used to hold all the defines that are needed for compat time types that support non y2038 safe types. New architectures need not have to define these new types as they will only use new y2038 safe syscalls. This file can be deleted after y2038 when we stop supporting non y2038 safe syscalls. The patch also requires an operation similar to: git grep "asm/compat\.h" | cut -d ":" -f 1 | xargs -n 1 sed -i -e "s%asm/compat.h%linux/compat.h%g" Cc: acme@kernel.org Cc: benh@kernel.crashing.org Cc: borntraeger@de.ibm.com Cc: catalin.marinas@arm.com Cc: cmetcalf@mellanox.com Cc: cohuck@redhat.com Cc: davem@davemloft.net Cc: deller@gmx.de Cc: devel@driverdev.osuosl.org Cc: gerald.schaefer@de.ibm.com Cc: gregkh@linuxfoundation.org Cc: heiko.carstens@de.ibm.com Cc: hoeppner@linux.vnet.ibm.com Cc: hpa@zytor.com Cc: jejb@parisc-linux.org Cc: jwi@linux.vnet.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: mark.rutland@arm.com Cc: mingo@redhat.com Cc: mpe@ellerman.id.au Cc: oberpar@linux.vnet.ibm.com Cc: oprofile-list@lists.sf.net Cc: paulus@samba.org Cc: peterz@infradead.org Cc: ralf@linux-mips.org Cc: rostedt@goodmis.org Cc: rric@kernel.org Cc: schwidefsky@de.ibm.com Cc: sebott@linux.vnet.ibm.com Cc: sparclinux@vger.kernel.org Cc: sth@linux.vnet.ibm.com Cc: ubraun@linux.vnet.ibm.com Cc: will.deacon@arm.com Cc: x86@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Acked-by: Steven Rostedt (VMware) Acked-by: Catalin Marinas Acked-by: James Hogan Acked-by: Helge Deller Signed-off-by: Arnd Bergmann --- arch/arm64/include/asm/compat.h | 11 ----------- arch/arm64/include/asm/stat.h | 1 + arch/arm64/kernel/hw_breakpoint.c | 1 - arch/arm64/kernel/perf_regs.c | 2 +- arch/mips/include/asm/compat.h | 11 ----------- arch/mips/kernel/signal32.c | 2 +- arch/parisc/include/asm/compat.h | 11 ----------- arch/powerpc/include/asm/compat.h | 11 ----------- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/oprofile/backtrace.c | 1 + arch/s390/hypfs/hypfs_sprp.c | 1 - arch/s390/include/asm/compat.h | 11 ----------- arch/s390/include/asm/elf.h | 4 ++-- arch/s390/kvm/priv.c | 1 - arch/s390/pci/pci_clp.c | 1 - arch/sparc/include/asm/compat.h | 11 ----------- arch/x86/events/core.c | 2 +- arch/x86/include/asm/compat.h | 11 ----------- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/kernel/sys_x86_64.c | 2 +- drivers/s390/block/dasd_ioctl.c | 1 - drivers/s390/char/fs3270.c | 1 - drivers/s390/char/sclp_ctl.c | 1 - drivers/s390/char/vmcp.c | 1 - drivers/s390/cio/chsc_sch.c | 1 - drivers/s390/net/qeth_core_main.c | 2 +- include/linux/compat.h | 1 + include/linux/compat_time.h | 19 +++++++++++++++++++ 28 files changed, 31 insertions(+), 95 
deletions(-) create mode 100644 include/linux/compat_time.h (limited to 'include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index c00c62e1a4a3..0030f79808b3 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -34,7 +34,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -66,16 +65,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { #ifdef __AARCH64EB__ short st_dev; diff --git a/arch/arm64/include/asm/stat.h b/arch/arm64/include/asm/stat.h index 15e35598ac40..eab738019707 100644 --- a/arch/arm64/include/asm/stat.h +++ b/arch/arm64/include/asm/stat.h @@ -20,6 +20,7 @@ #ifdef CONFIG_COMPAT +#include #include /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 74bb56f656ef..413dbe530da8 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -30,7 +30,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 1d091d048d04..0bbac612146e 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include #include -#include #include #include diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 9a0fa66b81ac..3e548ee99a2f 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -14,7 +14,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_suseconds_t; @@ -46,16 +45,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; s32 st_pad1[3]; diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index c4db910a8794..b5d9e1784aff 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -8,13 +8,13 @@ * Copyright (C) 1999, 2000 Silicon Graphics, Inc. * Copyright (C) 2016, Imagination Technologies Ltd. 
*/ +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index 57b8b2a2fd4e..0cdfec8857bd 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -13,7 +13,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u32 __compat_uid_t; @@ -40,16 +39,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; /* dev_t is 32 bits on parisc */ compat_ino_t st_ino; /* 32 bits */ diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 62168e1158f1..b4773c81f7d5 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -17,7 +17,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u32 __compat_uid_t; @@ -45,16 +44,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; compat_ino_t st_ino; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6bee65f3cfd3..a77528db474c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -42,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c index ecc66d5f02c9..ad054dd0d666 100644 --- a/arch/powerpc/oprofile/backtrace.c +++ b/arch/powerpc/oprofile/backtrace.c @@ -7,6 +7,7 @@ * 2 of the License, or (at your option) any later version. **/ +#include #include #include #include diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c index ae0ed8dd5f1b..5d85a039391c 100644 --- a/arch/s390/hypfs/hypfs_sprp.c +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include "hypfs.h" diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 9830fb6b076e..501aaff85304 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -53,7 +53,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -97,16 +96,6 @@ typedef struct { u32 gprs_high[NUM_GPRS]; } s390_compat_regs_high; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; u16 __pad1; diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 1a61b1b997f2..7d22a474a040 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -125,8 +125,9 @@ * ELF register definitions.. 
*/ +#include + #include -#include #include #include @@ -136,7 +137,6 @@ typedef s390_regs elf_gregset_t; typedef s390_fp_regs compat_elf_fpregset_t; typedef s390_compat_regs compat_elf_gregset_t; -#include #include /* for task_struct */ #include diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ebfa0442e569..a3bce0e84346 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include "gaccess.h" #include "kvm-s390.h" diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 93cd0f1ca12b..19b2d2a9b43d 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 1898b6223160..1910c44521e3 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -11,7 +11,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -39,16 +38,6 @@ typedef u32 compat_ulong_t; typedef u64 compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; compat_ino_t st_ino; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a6006e7bb729..7a987e6c7c35 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2391,7 +2391,7 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_IA32_EMULATION -#include +#include static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index e1c8dab86670..7cd314b71c51 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -17,7 +17,6 @@ typedef u32 compat_size_t; typedef s32 compat_ssize_t; -typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; typedef u16 __compat_uid_t; @@ -46,16 +45,6 @@ typedef u32 compat_u32; typedef u64 __attribute__((aligned(4))) compat_u64; typedef u32 compat_uptr_t; -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - struct compat_stat { compat_dev_t st_dev; u16 __pad1; diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 09ad88572746..db25aa15b705 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -49,7 +49,7 @@ int ftrace_int3_handler(struct pt_regs *regs); #if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS) #if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) -#include +#include /* * Because ia32 syscalls do not map to x86_64 syscall numbers diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a3f15ed545b5..6a78d4b36a79 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -19,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 7bdc6aaa0ba3..2016e0ed5865 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -18,7 +18,6 @@ #include #include #include 
-#include #include #include #include diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index 61822480a2a0..16a4e8528bbc 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/drivers/s390/char/sclp_ctl.c b/drivers/s390/char/sclp_ctl.c index a78cea0c3a09..248b5db3eaa8 100644 --- a/drivers/s390/char/sclp_ctl.c +++ b/drivers/s390/char/sclp_ctl.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 17e411c57576..948ce82a7725 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c index 0015729d917d..8d9f36625ba5 100644 --- a/drivers/s390/cio/chsc_sch.c +++ b/drivers/s390/cio/chsc_sch.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 04fefa5bb08d..df5d8451a51a 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -10,6 +10,7 @@ #define KMSG_COMPONENT "qeth" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include #include #include #include @@ -32,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/compat.h b/include/linux/compat.h index 48d29b1339be..cd50b00ef59a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -7,6 +7,7 @@ */ #include +#include #include #include /* for HZ */ diff --git a/include/linux/compat_time.h b/include/linux/compat_time.h new file mode 100644 index 000000000000..56a54a1e4355 --- /dev/null +++ b/include/linux/compat_time.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_COMPAT_TIME_H +#define _LINUX_COMPAT_TIME_H + +#include + +typedef s32 compat_time_t; + +struct compat_timespec { + compat_time_t tv_sec; + s32 tv_nsec; +}; + +struct compat_timeval { + compat_time_t tv_sec; + s32 tv_usec; +}; + +#endif /* _LINUX_COMPAT_TIME_H */ -- cgit v1.2.3 From 1c68adf61e5845f3525522fa9c797ed370689c85 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:26 -0700 Subject: compat: Enable compat_get/put_timespec64 always These functions are used in the repurposed compat syscalls to provide backward compatibility for using 32 bit time_t on 32 bit systems. 
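[Editor's note] A minimal sketch of the call pattern the change above enables, assuming a 32-bit native kernel that repurposes a compat syscall. The syscall name and body here are hypothetical illustrations; only compat_get_timespec64(), struct compat_timespec, and struct timespec64 come from this series.

#include <linux/compat_time.h>
#include <linux/syscalls.h>
#include <linux/time64.h>

/* Hypothetical 32-bit native syscall reusing the always-built helper:
 * compat_get_timespec64() copies a 32-bit {tv_sec, tv_nsec} pair from
 * user space and widens it into a y2038-safe struct timespec64.
 */
SYSCALL_DEFINE1(example_settime32, const struct compat_timespec __user *, utp)
{
	struct timespec64 ts;
	int err;

	err = compat_get_timespec64(&ts, utp);
	if (err)
		return err;

	/* ... operate on the 64-bit representation from here on ... */
	return 0;
}
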
Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 2 -- include/linux/compat_time.h | 4 ++++ kernel/compat.c | 52 +++++++-------------------------------------- kernel/time/time.c | 44 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index cd50b00ef59a..cfc1b6383ae0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -294,8 +294,6 @@ extern int compat_get_timespec(struct timespec *, const void __user *); extern int compat_put_timespec(const struct timespec *, void __user *); extern int compat_get_timeval(struct timeval *, const void __user *); extern int compat_put_timeval(const struct timeval *, void __user *); -extern int compat_get_timespec64(struct timespec64 *, const void __user *); -extern int compat_put_timespec64(const struct timespec64 *, void __user *); extern int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits); extern int put_compat_itimerspec64(const struct itimerspec64 *its, diff --git a/include/linux/compat_time.h b/include/linux/compat_time.h index 56a54a1e4355..31f2774f1994 100644 --- a/include/linux/compat_time.h +++ b/include/linux/compat_time.h @@ -3,6 +3,7 @@ #define _LINUX_COMPAT_TIME_H #include +#include typedef s32 compat_time_t; @@ -16,4 +17,7 @@ struct compat_timeval { s32 tv_usec; }; +extern int compat_get_timespec64(struct timespec64 *, const void __user *); +extern int compat_put_timespec64(const struct timespec64 *, void __user *); + #endif /* _LINUX_COMPAT_TIME_H */ diff --git a/kernel/compat.c b/kernel/compat.c index 6d21894806b4..51a081b46832 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -120,50 +120,6 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) -{ - struct compat_timespec ts; - int ret; - - ret = copy_from_user(&ts, cts, sizeof(ts)); - if (ret) - return -EFAULT; - - ts64->tv_sec = ts.tv_sec; - ts64->tv_nsec = ts.tv_nsec; - - return 0; -} - -static int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) -{ - struct compat_timespec ts = { - .tv_sec = ts64->tv_sec, - .tv_nsec = ts64->tv_nsec - }; - return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; -} - -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec64); - -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec64); - int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) @@ -367,6 +323,14 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } +/* Todo: Delete these extern declarations when get/put_compat_itimerspec64() + * are moved to kernel/time/time.c . 
+ */ +extern int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts); +extern int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts); + int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 3044d48ebe56..df61143a54f6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -880,6 +880,50 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); +int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + int get_itimerspec64(struct itimerspec64 *it, const struct itimerspec __user *uit) { -- cgit v1.2.3 From acf8870a62afd4f1b3c0b695aaa619df355c0851 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:30 -0700 Subject: time: Add new y2038 safe __kernel_timespec The new struct __kernel_timespec is similar to current internal kernel struct timespec64 on 64 bit architecture. The compat structure however is similar to below on little endian systems (padding and tv_nsec are switched for big endian systems): typedef s32 compat_long_t; typedef s64 compat_kernel_time64_t; struct compat_kernel_timespec { compat_kernel_time64_t tv_sec; compat_long_t tv_nsec; compat_long_t padding; }; This allows for both the native and compat representations to be the same and syscalls using this type as part of their ABI can have a single entry point to both. Note that the compat define is not included anywhere in the kernel explicitly to avoid confusion. These types will be used by the new syscalls that will be introduced in the consequent patches. Most of the new syscalls are just an update to the existing native ones with this new type. Hence, put this new type under an ifdef so that the architectures can define CONFIG_64BIT_TIME when they are ready to handle this switch. 
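[Editor's note] A standalone sketch (not part of the patch) of the layout equivalence the message above relies on. The struct names are stand-ins for __kernel_timespec and its little-endian compat form; on big-endian targets tv_nsec and padding swap places as stated.

#include <stddef.h>
#include <stdint.h>

struct knl_timespec {			/* stand-in for struct __kernel_timespec */
	int64_t   tv_sec;
	long long tv_nsec;
};

struct compat_knl_timespec {		/* stand-in for the compat form quoted above */
	int64_t tv_sec;
	int32_t tv_nsec;
	int32_t padding;
};

/* Same size, and tv_nsec starts at the same offset, so on little endian
 * the compat tv_nsec aliases the low word of the native one: a single
 * syscall entry point can serve both ABIs.
 */
_Static_assert(sizeof(struct knl_timespec) ==
	       sizeof(struct compat_knl_timespec), "sizes must match");
_Static_assert(offsetof(struct knl_timespec, tv_nsec) ==
	       offsetof(struct compat_knl_timespec, tv_nsec), "tv_nsec offset");
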
Cc: linux-arch@vger.kernel.org Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/time64.h | 10 +++++++++- include/uapi/asm-generic/posix_types.h | 1 + include/uapi/linux/time.h | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/time64.h b/include/linux/time64.h index 93d39499838e..0d96887ba4e0 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -2,12 +2,20 @@ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H -#include #include typedef __s64 time64_t; typedef __u64 timeu64_t; +/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path + * and 32-bit emulation. + */ +#ifndef CONFIG_64BIT_TIME +#define __kernel_timespec timespec +#endif + +#include + #if __BITS_PER_LONG == 64 /* this trick allows us to optimize out timespec64_to_timespec */ # define timespec64 timespec diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h index 5e6ea22bd525..f0733a26ebfc 100644 --- a/include/uapi/asm-generic/posix_types.h +++ b/include/uapi/asm-generic/posix_types.h @@ -87,6 +87,7 @@ typedef struct { typedef __kernel_long_t __kernel_off_t; typedef long long __kernel_loff_t; typedef __kernel_long_t __kernel_time_t; +typedef long long __kernel_time64_t; typedef __kernel_long_t __kernel_clock_t; typedef int __kernel_timer_t; typedef int __kernel_clockid_t; diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 16a296612ba4..94adfae599e0 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -42,6 +42,13 @@ struct itimerval { struct timeval it_value; /* current value */ }; +#ifndef __kernel_timespec +struct __kernel_timespec { + __kernel_time64_t tv_sec; /* seconds */ + long long tv_nsec; /* nanoseconds */ +}; +#endif + /* * legacy timeval structure, only embedded in structures that * traditionally used 'timeval' to pass time intervals (not absolute -- cgit v1.2.3 From ea2ce8f3514e2074a1910d8d721842d7341d5c81 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:31 -0700 Subject: time: Fix get_timespec64() for y2038 safe compat interfaces get/put_timespec64() interfaces will eventually be used for conversions between the new y2038 safe struct __kernel_timespec and struct timespec64. The new y2038 safe syscalls have a common entry for native and compat interfaces. On compat interfaces, the high order bits of nanoseconds should be zeroed out. This is because the application code or the libc does not guarantee zeroing of these. If used without zeroing, the kernel might be at risk of using timespec values incorrectly. Note that clearing of bits is dependent on CONFIG_64BIT_TIME for now. This is until COMPAT_USE_64BIT_TIME has been handled correctly. x86 will be the first architecture that will use the CONFIG_64BIT_TIME.
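[Editor's note] A small userspace-style model (hypothetical, not from the patch) of why get_timespec64() masks the nanoseconds above: a 32-bit task stores only the low word of the 64-bit tv_nsec slot, so the high word may hold garbage until the kernel clears it.

#include <stdint.h>
#include <stdio.h>

/* Models the kts.tv_nsec &= 0xFFFFFFFFUL step for a compat caller */
static int64_t sanitize_nsec(int64_t raw_nsec, int compat_caller)
{
	if (compat_caller)
		raw_nsec &= 0xFFFFFFFFULL;	/* drop uninitialized high bits */
	return raw_nsec;
}

int main(void)
{
	/* high word is stack garbage, low word holds the real value (5 ns) */
	int64_t raw = (int64_t)(0xDEADBEEF00000000ULL | 5);

	printf("%lld\n", (long long)sanitize_nsec(raw, 1));	/* prints 5 */
	return 0;
}
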
Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/time.h | 4 ++-- kernel/time/time.c | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/time.h b/include/linux/time.h index 4b62a2c0a661..aed74463592d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -10,9 +10,9 @@ extern struct timezone sys_tz; int get_timespec64(struct timespec64 *ts, - const struct timespec __user *uts); + const struct __kernel_timespec __user *uts); int put_timespec64(const struct timespec64 *ts, - struct timespec __user *uts); + struct __kernel_timespec __user *uts); int get_itimerspec64(struct itimerspec64 *it, const struct itimerspec __user *uit); int put_itimerspec64(const struct itimerspec64 *it, diff --git a/kernel/time/time.c b/kernel/time/time.c index df61143a54f6..ccd751e95fcb 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -853,9 +853,9 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, } int get_timespec64(struct timespec64 *ts, - const struct timespec __user *uts) + const struct __kernel_timespec __user *uts) { - struct timespec kts; + struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); @@ -863,6 +863,11 @@ int get_timespec64(struct timespec64 *ts, return -EFAULT; ts->tv_sec = kts.tv_sec; + + /* Zero out the padding for 32 bit systems or in compat mode */ + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())) + kts.tv_nsec &= 0xFFFFFFFFUL; + ts->tv_nsec = kts.tv_nsec; return 0; @@ -870,12 +875,13 @@ int get_timespec64(struct timespec64 *ts, EXPORT_SYMBOL_GPL(get_timespec64); int put_timespec64(const struct timespec64 *ts, - struct timespec __user *uts) + struct __kernel_timespec __user *uts) { - struct timespec kts = { + struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); -- cgit v1.2.3 From 6d5b84132459c644cf4ee8de090382bad44b8ebd Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:32 -0700 Subject: time: Change types to new y2038 safe __kernel_* types Change over clock_settime, clock_gettime and clock_getres syscalls to use __kernel_timespec times. This will enable changing over of these syscalls to use new y2038 safe syscalls when the architectures define the CONFIG_64BIT_TIME. 
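[Editor's note] From the userspace side, the type change above is ABI-transparent on 64-bit targets, where struct timespec already has the same layout as struct __kernel_timespec; a minimal caller for illustration (assuming a glibc-style environment):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;	/* matches __kernel_timespec layout on 64-bit */

	if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
		return 1;
	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
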
Cc: linux-api@vger.kernel.org Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 6 +++--- kernel/time/posix-stubs.c | 6 +++--- kernel/time/posix-timers.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 70fcda1a9049..40bb40d1741b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -567,11 +567,11 @@ asmlinkage long sys_timer_settime(timer_t timer_id, int flags, struct itimerspec __user *old_setting); asmlinkage long sys_timer_delete(timer_t timer_id); asmlinkage long sys_clock_settime(clockid_t which_clock, - const struct timespec __user *tp); + const struct __kernel_timespec __user *tp); asmlinkage long sys_clock_gettime(clockid_t which_clock, - struct timespec __user *tp); + struct __kernel_timespec __user *tp); asmlinkage long sys_clock_getres(clockid_t which_clock, - struct timespec __user *tp); + struct __kernel_timespec __user *tp); asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, const struct timespec __user *rqtp, struct timespec __user *rmtp); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b605b0a40ba9..4e76021cea7f 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -59,7 +59,7 @@ SYS_NI(alarm); */ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { struct timespec64 new_tp; @@ -92,7 +92,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) return 0; } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -106,7 +106,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return 0; } -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 147d79e2cef5..93b2c38ad0f3 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1041,7 +1041,7 @@ void exit_itimers(struct signal_struct *sig) } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp; @@ -1056,7 +1056,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp; @@ -1097,7 +1097,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp; -- cgit v1.2.3 From 01909974b41036a6a8d3907c66cc7b41c9a73da9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:33 -0700 Subject: time: Change nanosleep to safe __kernel_* types Change over clock_nanosleep syscalls to use y2038 safe __kernel_timespec times. 
This will enable changing over of these syscalls to use new y2038 safe syscalls when the architectures define the CONFIG_64BIT_TIME. Note that the nanosleep syscall is deprecated and there is no plan for making it y2038 safe. The syscall still works as before on 64 bit machines; on 32 bit machines it keeps working correctly until y2038 through the existing compat syscall version. There is no new syscall for supporting 64 bit time_t on 32 bit architectures. Cc: linux-api@vger.kernel.org Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/restart_block.h | 7 ++----- include/linux/syscalls.h | 7 ++++--- kernel/time/hrtimer.c | 8 ++++++-- kernel/time/posix-stubs.c | 4 ++-- kernel/time/posix-timers.c | 4 ++-- 5 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index bcfdb918cd81..5d83d0c1d06c 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -7,6 +7,7 @@ #include #include +#include struct timespec; struct compat_timespec; @@ -15,9 +16,7 @@ struct pollfd; enum timespec_type { TT_NONE = 0, TT_NATIVE = 1, -#ifdef CONFIG_COMPAT TT_COMPAT = 2, -#endif }; /* @@ -40,10 +39,8 @@ struct restart_block { clockid_t clockid; enum timespec_type type; union { - struct timespec __user *rmtp; -#ifdef CONFIG_COMPAT + struct __kernel_timespec __user *rmtp; struct compat_timespec __user *compat_rmtp; -#endif }; u64 expires; } nanosleep; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 40bb40d1741b..c9a2a2601852 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -536,7 +536,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); /* kernel/hrtimer.c */ -asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); +asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); /* kernel/itimer.c */ asmlinkage long sys_getitimer(int which, struct itimerval __user *value); @@ -573,8 +574,8 @@ asmlinkage long sys_clock_gettime(clockid_t which_clock, asmlinkage long sys_clock_getres(clockid_t which_clock, struct __kernel_timespec __user *tp); asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, - const struct timespec __user *rqtp, - struct timespec __user *rmtp); + const struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); /* kernel/printk.c */ asmlinkage long sys_syslog(int type, char __user *buf, int len); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f183257ff0c6..d7051b3993b5 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1747,8 +1747,10 @@ out: return ret; } -SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, - struct timespec __user *, rmtp) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) + +SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; @@ -1763,6 +1765,8 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } +#endif + #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 4e76021cea7f..474e4ca2e28f 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -126,8 +126,8 @@
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_time } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 t; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 93b2c38ad0f3..c21f4c4f8660 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1212,8 +1212,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; -- cgit v1.2.3 From e21db6f69a95b846ff04e31fe0a86004cbd000d7 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:48 -0700 Subject: tcp: track total bytes delivered with ECN CE marks Introduce a new delivered_ce stat in the tcp socket to estimate the number of packets being marked with CE bits. The estimation is done via ACKs with the ECE bit. Depending on the actual receiver behavior, the estimation could have biases. Since the TCP sender can't really see the CE bit in the data path, it is technically counting packets marked delivered with the "ECE / ECN-Echo" flag set. With RFC3168 ECN, because the ECE bit is sticky, this count can drastically overestimate the number of CE-marked data packets. With DCTCP-style ECN this should be reasonably precise unless there is loss in the ACK path, in which case it's not precise. With the AccECN proposal this can be made still more precise, even in the case of some degree of ACK loss. However, this is the sender's best estimate of CE information. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8f4c54986f97..20585d5c4e1c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,6 +281,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl.
rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 438fbca96cd3..5a5ce6da4792 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2559,6 +2559,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; + tp->delivered_ce = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 01cce28f90ca..b3bff9c20606 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3503,6 +3503,8 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) u32 delivered; delivered = tp->delivered - prior_delivered; + if (flag & FLAG_ECE) + tp->delivered_ce += delivered; return delivered; } -- cgit v1.2.3 From feb5f2ec646483fb66f9ad7218b1aad2a93a2a5c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:49 -0700 Subject: tcp: export packets delivery info Export data delivered and delivered with CE marks to 1) SNMP TCPDelivered and TCPDeliveredCE 2) getsockopt(TCP_INFO) 3) Timestamping API SOF_TIMESTAMPING_OPT_STATS Note that for SCM_TSTAMP_ACK, the delivery info in SOF_TIMESTAMPING_OPT_STATS is reported before the info was fully updated on the ACK. These stats help applications monitor TCP delivery and ECN status at the per-host, per-connection, and even per-message level. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ include/uapi/linux/tcp.h | 5 +++++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp.c | 7 ++++++- net/ipv4/tcp_input.c | 6 +++++- 5 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 33a70ece462f..d02e859301ff 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -276,6 +276,8 @@ enum LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ + LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ + LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 560374c978f9..379b08700a54 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -224,6 +224,9 @@ struct tcp_info { __u64 tcpi_busy_time; /* Time (usec) busy sending data */ __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + __u32 tcpi_delivered; + __u32 tcpi_delivered_ce; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -244,6 +247,8 @@ enum { TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ + TCP_NLA_DELIVERED, /* Data pkts delivered incl.
out-of-order */ + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a058de677e94..261b71d0ccc5 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -296,6 +296,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), + SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5a5ce6da4792..4022073b0aee 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3167,6 +3167,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; + info->tcpi_delivered = tp->delivered; + info->tcpi_delivered_ce = tp->delivered_ce; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3180,7 +3182,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 5 * nla_total_size(sizeof(u32)) + + 7 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3211,9 +3213,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); + nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b3bff9c20606..0396fb919b5d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3499,12 +3499,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) /* Returns the number of packets newly acked or sacked by the current ACK */ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) { + const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 delivered; delivered = tp->delivered - prior_delivered; - if (flag & FLAG_ECE) + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); + if (flag & FLAG_ECE) { tp->delivered_ce += delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + } return delivered; } -- cgit v1.2.3 From af81f9e75e546de43f4f0bf532d880ddc16a4dc8 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:08 +0100 Subject: netfilter: nf_flow_table: use IP_CT_DIR_* values for FLOW_OFFLOAD_DIR_* Simplifies further code cleanups Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 833752dd0c58..09ba67598991 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -6,6 +6,7 @@ #include #include #include +#include #include struct nf_flowtable; @@ -27,11 +28,10 @@ struct nf_flowtable { }; enum flow_offload_tuple_dir { - FLOW_OFFLOAD_DIR_ORIGINAL, - FLOW_OFFLOAD_DIR_REPLY, - 
__FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, + FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, + FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX }; -#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) struct flow_offload_tuple { union { -- cgit v1.2.3 From 88078d98d1bb085d72af8437707279e203524fa5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Apr 2018 11:43:15 -0700 Subject: net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends After working on IP defragmentation lately, I found that some large packets defeat the CHECKSUM_COMPLETE optimization because the NIC adds zero padding on the last (small) fragment. While removing the padding with pskb_trim_rcsum(), we set skb->ip_summed to CHECKSUM_NONE, forcing a full csum validation, even if all prior fragments had CHECKSUM_COMPLETE set. We can instead compute the checksum of the part we are trimming, usually smaller than the part we keep. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ++--- net/core/skbuff.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9065477ed255..d274059529eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3131,6 +3131,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) return skb->data; } +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim @@ -3144,9 +3145,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (likely(len >= skb->len)) return 0; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - return __pskb_trim(skb, len); + return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 345b51837ca8..ff49e352deea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1839,6 +1839,20 @@ done: } EXPORT_SYMBOL(___pskb_trim); +/* Note : use pskb_trim_rcsum() instead of calling this directly + */ +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + int delta = skb->len - len; + + skb->csum = csum_sub(skb->csum, + skb_checksum(skb, len, delta, 0)); + } + return __pskb_trim(skb, len); +} +EXPORT_SYMBOL(pskb_trim_rcsum_slow); + /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate -- cgit v1.2.3 From 93c2fb253d177a0b8f4f93592441f88c9b7d6245 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:38:59 -0700 Subject: net/ipv6: Rename fib6_info struct elements Change the prefix for fib6_info struct elements from rt6i_ to fib6_. rt6i_pcpu and rt6i_exception_bucket are left as is given that they point to rt6_info entries. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 56 ++--- include/net/ip6_fib.h | 38 +-- include/net/ip6_route.h | 26 +- net/ipv6/addrconf.c | 24 +- net/ipv6/anycast.c | 8 +- net/ipv6/ip6_fib.c | 170 ++++++------- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 272 ++++++++++----------- 8 files changed, 298 insertions(+), 298 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index b85b15851841..8e4edb634b11 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4705,17 +4705,17 @@ static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt) * are trapped to the CPU, so no need to program specific routes * for them. */ - if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LINKLOCAL) + if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_LINKLOCAL) return true; /* Multicast routes aren't supported, so ignore them. Neighbour * Discovery packets are specifically trapped. */ - if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) + if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_MULTICAST) return true; /* Cloned routes are irrelevant in the forwarding path. */ - if (rt->rt6i_flags & RTF_CACHE) + if (rt->fib6_flags & RTF_CACHE) return true; return false; @@ -4759,7 +4759,7 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) { /* RTF_CACHE routes are ignored */ - return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; + return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; } static struct fib6_info * @@ -4784,16 +4784,16 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, /* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same * virtual router. */ - if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id) continue; - if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (rt->rt6i_metric < nrt->rt6i_metric) + if (rt->fib6_metric < nrt->fib6_metric) continue; - if (rt->rt6i_metric == nrt->rt6i_metric && + if (rt->fib6_metric == nrt->fib6_metric && mlxsw_sp_fib6_rt_can_mp(rt)) return fib6_entry; - if (rt->rt6i_metric > nrt->rt6i_metric) + if (rt->fib6_metric > nrt->fib6_metric) break; } @@ -4899,7 +4899,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, const struct fib6_info *rt) { - return rt->rt6i_flags & RTF_GATEWAY || + return rt->fib6_flags & RTF_GATEWAY || mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL); } @@ -5092,9 +5092,9 @@ static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp, * local, which will cause them to be trapped with a lower * priority than packets that need to be locally received. 
*/ - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; - else if (rt->rt6i_flags & RTF_REJECT) + else if (rt->fib6_flags & RTF_REJECT) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; else if (mlxsw_sp_rt6_is_gateway(mlxsw_sp, rt)) fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE; @@ -5175,18 +5175,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); - if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id) continue; - if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id) + if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (replace && rt->rt6i_metric == nrt->rt6i_metric) { + if (replace && rt->fib6_metric == nrt->fib6_metric) { if (mlxsw_sp_fib6_rt_can_mp(rt) == mlxsw_sp_fib6_rt_can_mp(nrt)) return fib6_entry; if (mlxsw_sp_fib6_rt_can_mp(nrt)) fallback = fallback ?: fib6_entry; } - if (rt->rt6i_metric > nrt->rt6i_metric) + if (rt->fib6_metric > nrt->fib6_metric) return fallback ?: fib6_entry; } @@ -5215,7 +5215,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry, list_for_each_entry(last, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last); - if (nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id) + if (nrt->fib6_table->tb6_id > rt->fib6_table->tb6_id) break; fib6_entry = last; } @@ -5275,22 +5275,22 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib *fib; struct mlxsw_sp_vr *vr; - vr = mlxsw_sp_vr_find(mlxsw_sp, rt->rt6i_table->tb6_id); + vr = mlxsw_sp_vr_find(mlxsw_sp, rt->fib6_table->tb6_id); if (!vr) return NULL; fib = mlxsw_sp_vr_fib(vr, MLXSW_SP_L3_PROTO_IPV6); - fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->rt6i_dst.addr, - sizeof(rt->rt6i_dst.addr), - rt->rt6i_dst.plen); + fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->fib6_dst.addr, + sizeof(rt->fib6_dst.addr), + rt->fib6_dst.plen); if (!fib_node) return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry); - if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id && - rt->rt6i_metric == iter_rt->rt6i_metric && + if (rt->fib6_table->tb6_id == iter_rt->fib6_table->tb6_id && + rt->fib6_metric == iter_rt->fib6_metric && mlxsw_sp_fib6_entry_rt_find(fib6_entry, rt)) return fib6_entry; } @@ -5325,16 +5325,16 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, if (mlxsw_sp->router->aborted) return 0; - if (rt->rt6i_src.plen) + if (rt->fib6_src.plen) return -EINVAL; if (mlxsw_sp_fib6_rt_should_ignore(rt)) return 0; - fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->rt6i_table->tb6_id, - &rt->rt6i_dst.addr, - sizeof(rt->rt6i_dst.addr), - rt->rt6i_dst.plen, + fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->fib6_table->tb6_id, + &rt->fib6_dst.addr, + sizeof(rt->fib6_dst.addr), + rt->fib6_dst.plen, MLXSW_SP_L3_PROTO_IPV6); if (IS_ERR(fib_node)) return PTR_ERR(fib_node); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a36116b92100..994dd67207fb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -134,34 +134,34 @@ struct fib6_nh { }; struct fib6_info { - struct fib6_table *rt6i_table; + struct fib6_table *fib6_table; struct fib6_info __rcu *rt6_next; - struct fib6_node __rcu *rt6i_node; + struct fib6_node __rcu 
*fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ - struct list_head rt6i_siblings; - unsigned int rt6i_nsiblings; + struct list_head fib6_siblings; + unsigned int fib6_nsiblings; - atomic_t rt6i_ref; - struct inet6_dev *rt6i_idev; + atomic_t fib6_ref; + struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] - struct rt6key rt6i_dst; - u32 rt6i_flags; - struct rt6key rt6i_src; - struct rt6key rt6i_prefsrc; + struct rt6key fib6_dst; + u32 fib6_flags; + struct rt6key fib6_src; + struct rt6key fib6_prefsrc; struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - u32 rt6i_metric; - u8 rt6i_protocol; + u32 fib6_metric; + u8 fib6_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, @@ -206,7 +206,7 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) static inline void fib6_clean_expires(struct fib6_info *f6i) { - f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } @@ -214,12 +214,12 @@ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; - f6i->rt6i_flags |= RTF_EXPIRES; + f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { - if (f6i->rt6i_flags & RTF_EXPIRES) + if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } @@ -250,14 +250,14 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = fn->fn_sernum; @@ -295,12 +295,12 @@ void fib6_info_destroy(struct fib6_info *f6i); static inline void fib6_info_hold(struct fib6_info *f6i) { - atomic_inc(&f6i->rt6i_ref); + atomic_inc(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) + if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) fib6_info_destroy(f6i); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d5fb1e4ae7ac..b4b85109984f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,9 +66,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; } @@ -102,23 +102,23 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct fib6_info *rt); -int ip6_del_rt(struct net *net, struct fib6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i); -void rt6_flush_exceptions(struct fib6_info 
*rt); -void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *f6i); +void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL; + struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; + if (f6i && f6i->fib6_prefsrc.plen) + *saddr = f6i->fib6_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, daddr, prefs, saddr); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct fib6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *f6i, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct fib6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *f6i); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); @@ -274,7 +274,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->rt6i_idev == b->rt6i_idev && + a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a294e86a9b25..f38ea829c28b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1178,19 +1178,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) { - struct fib6_info *rt; + struct fib6_info *f6i; - rt = addrconf_get_prefix_route(&ifp->addr, + f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, 0, RTF_GATEWAY | RTF_DEFAULT); - if (rt) { + if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), rt); + ip6_del_rt(dev_net(ifp->idev->dev), f6i); else { - if (!(rt->rt6i_flags & RTF_EXPIRES)) - fib6_set_expires(rt, expires); - fib6_info_release(rt); + if (!(f6i->fib6_flags & RTF_EXPIRES)) + fib6_set_expires(f6i, expires); + fib6_info_release(f6i); } } } @@ -2370,9 +2370,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; - if ((rt->rt6i_flags & flags) != flags) + if ((rt->fib6_flags & flags) != flags) continue; - if ((rt->rt6i_flags & noflags) != 0) + if ((rt->fib6_flags & noflags) != 0) continue; fib6_info_hold(rt); break; @@ -3341,11 +3341,11 @@ static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - /* !rt6i_node means the host route was removed from the + /* !fib6_node means the host route was removed from the * FIB, for example, if 'lo' device is 
taken down. In that * case regenerate the host route. */ - if (!ifp->rt || !ifp->rt->rt6i_node) { + if (!ifp->rt || !ifp->rt->fib6_node) { struct fib6_info *rt, *prev; rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, @@ -5612,7 +5612,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!rcu_access_pointer(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->fib6_node)) ip6_ins_rt(net, ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0e35657501a7..eed3bf63bd05 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -218,10 +218,10 @@ static void aca_put(struct ifacaddr6 *ac) } } -static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, +static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { - struct inet6_dev *idev = rt->rt6i_idev; + struct inet6_dev *idev = f6i->fib6_idev; struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); @@ -231,8 +231,8 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, aca->aca_addr = *addr; in6_dev_hold(idev); aca->aca_idev = idev; - fib6_info_hold(rt); - aca->aca_rt = rt; + fib6_info_hold(f6i); + aca->aca_rt = f6i; aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2ab49b7cac22..353f0b3e7b0d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -105,12 +105,12 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct net *net, struct fib6_info *rt) +void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; - fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + fn = rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); } @@ -159,10 +159,10 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) return NULL; } - INIT_LIST_HEAD(&f6i->rt6i_siblings); + INIT_LIST_HEAD(&f6i->fib6_siblings); f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - atomic_inc(&f6i->rt6i_ref); + atomic_inc(&f6i->fib6_ref); return f6i; } @@ -172,7 +172,7 @@ void fib6_info_destroy(struct fib6_info *f6i) struct rt6_exception_bucket *bucket; struct dst_metrics *m; - WARN_ON(f6i->rt6i_node); + WARN_ON(f6i->fib6_node); bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); if (bucket) { @@ -197,8 +197,8 @@ void fib6_info_destroy(struct fib6_info *f6i) } } - if (f6i->rt6i_idev) - in6_dev_put(f6i->rt6i_idev); + if (f6i->fib6_idev) + in6_dev_put(f6i->fib6_idev); if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); @@ -401,7 +401,7 @@ static int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; - rt->rt6i_table->fib_seq++; + rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } @@ -483,10 +483,10 @@ static int fib6_dump_node(struct fib6_walker *w) * last sibling of this route (no need to dump the * sibling routes again) */ - if (rt->rt6i_nsiblings) - rt = list_last_entry(&rt->rt6i_siblings, + if (rt->fib6_nsiblings) + rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, - rt6i_siblings); + fib6_siblings); } w->leaf = NULL; return 0; @@ -810,7 +810,7 @@ insert_above: RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; atomic_inc(&rcu_dereference_protected(in->leaf, - lockdep_is_held(&table->tb6_lock))->rt6i_ref); + 
lockdep_is_held(&table->tb6_lock))->fib6_ref); /* update parent pointer */ if (dir) @@ -865,9 +865,9 @@ insert_above: static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { - struct fib6_table *table = rt->rt6i_table; + struct fib6_table *table = rt->fib6_table; - if (atomic_read(&rt->rt6i_ref) != 1) { + if (atomic_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes @@ -880,7 +880,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); - atomic_inc(&new_leaf->rt6i_ref); + atomic_inc(&new_leaf->fib6_ref); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); @@ -919,7 +919,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; @@ -939,12 +939,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, for (iter = leaf; iter; iter = rcu_dereference_protected(iter->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock))) { + lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ - if (iter->rt6i_metric == rt->rt6i_metric) { + if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ @@ -964,11 +964,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, } if (rt6_duplicate_nexthop(iter, rt)) { - if (rt->rt6i_nsiblings) - rt->rt6i_nsiblings = 0; - if (!(iter->rt6i_flags & RTF_EXPIRES)) + if (rt->fib6_nsiblings) + rt->fib6_nsiblings = 0; + if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->rt6i_flags & RTF_EXPIRES)) + if (!(rt->fib6_flags & RTF_EXPIRES)) fib6_clean_expires(iter); else fib6_set_expires(iter, rt->expires); @@ -988,10 +988,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) - rt->rt6i_nsiblings++; + rt->fib6_nsiblings++; } - if (iter->rt6i_metric > rt->rt6i_metric) + if (iter->fib6_metric > rt->fib6_metric) break; next_iter: @@ -1002,7 +1002,7 @@ next_iter: /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } @@ -1011,34 +1011,34 @@ next_iter: fn->rr_ptr = NULL; /* Link this route to others same route. 
*/ - if (rt->rt6i_nsiblings) { - unsigned int rt6i_nsiblings; + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; while (sibling) { - if (sibling->rt6i_metric == rt->rt6i_metric && + if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->rt6i_siblings, - &sibling->rt6i_siblings); + list_add_tail(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ - rt6i_nsiblings = 0; + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &rt->rt6i_siblings, rt6i_siblings) { - sibling->rt6i_nsiblings++; - BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); - rt6i_nsiblings++; + &rt->fib6_siblings, fib6_siblings) { + sibling->fib6_nsiblings++; + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } @@ -1059,8 +1059,8 @@ add: return err; rcu_assign_pointer(rt->rt6_next, iter); - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(rt->rt6i_node, fn); + atomic_inc(&rt->fib6_ref); + rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); @@ -1087,8 +1087,8 @@ add: if (err) return err; - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(rt->rt6i_node, fn); + atomic_inc(&rt->fib6_ref); + rcu_assign_pointer(rt->fib6_node, fn); rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) @@ -1097,8 +1097,8 @@ add: info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } - nsiblings = iter->rt6i_nsiblings; - iter->rt6i_node = NULL; + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; @@ -1108,13 +1108,13 @@ add: /* Replacing an ECMP route, remove all siblings */ ins = &rt->rt6_next; iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { - if (iter->rt6i_metric > rt->rt6i_metric) + if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->rt6_next; - iter->rt6i_node = NULL; + iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; @@ -1125,7 +1125,7 @@ add: ins = &iter->rt6_next; } iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1137,7 +1137,7 @@ add: static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & RTF_EXPIRES)) + (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -1152,15 +1152,15 @@ void fib6_force_start_gc(struct net *net) static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { - struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, - 
lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in rt6_get_cookie_safe() */ smp_wmb(); while (fn) { fn->fn_sernum = sernum; fn = rcu_dereference_protected(fn->parent, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } } @@ -1179,7 +1179,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { - struct fib6_table *table = rt->rt6i_table; + struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1196,8 +1196,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, - &rt->rt6i_dst.addr, rt->rt6i_dst.plen, - offsetof(struct fib6_info, rt6i_dst), allow_create, + &rt->fib6_dst.addr, rt->fib6_dst.plen, + offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); @@ -1208,7 +1208,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, pn = fn; #ifdef CONFIG_IPV6_SUBTREES - if (rt->rt6i_src.plen) { + if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { @@ -1229,7 +1229,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref); + atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; @@ -1237,8 +1237,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, - &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct fib6_info, rt6i_src), + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1256,8 +1256,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), - &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct fib6_info, rt6i_src), + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1272,7 +1272,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { - atomic_inc(&rt->rt6i_ref); + atomic_inc(&rt->fib6_ref); rcu_assign_pointer(fn->leaf, rt); } } @@ -1421,12 +1421,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad struct fib6_node *fn; struct lookup_args args[] = { { - .offset = offsetof(struct fib6_info, rt6i_dst), + .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { - .offset = offsetof(struct fib6_info, rt6i_src), + .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif @@ -1511,7 +1511,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct fib6_info, rt6i_dst), + offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES @@ -1522,7 +1522,7 @@ struct fib6_node 
*fib6_locate(struct fib6_node *root, if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, - offsetof(struct fib6_info, rt6i_src), + offsetof(struct fib6_info, fib6_src), exact_match); } } @@ -1706,7 +1706,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Unlink it */ *rtp = rt->rt6_next; - rt->rt6i_node = NULL; + rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1718,14 +1718,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fn->rr_ptr = NULL; /* Remove this entry from other siblings */ - if (rt->rt6i_nsiblings) { + if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, rt6i_siblings) - sibling->rt6i_nsiblings--; - rt->rt6i_nsiblings = 0; - list_del_init(&rt->rt6i_siblings); + &rt->fib6_siblings, fib6_siblings) + sibling->fib6_nsiblings--; + rt->fib6_nsiblings = 0; + list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } @@ -1765,9 +1765,9 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { - struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); - struct fib6_table *table = rt->rt6i_table; + struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + struct fib6_table *table = rt->fib6_table; struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; @@ -1951,17 +1951,17 @@ static int fib6_clean_node(struct fib6_walker *w) #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, - rcu_access_pointer(rt->rt6i_node), + rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { - if (WARN_ON(!rt->rt6i_nsiblings)) + if (WARN_ON(!rt->fib6_nsiblings)) continue; - rt = list_last_entry(&rt->rt6i_siblings, - struct fib6_info, rt6i_siblings); + rt = list_last_entry(&rt->fib6_siblings, + struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); @@ -2045,7 +2045,7 @@ static int fib6_age(struct fib6_info *rt, void *arg) * Routes are expired even if they are in use. */ - if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) { + if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; @@ -2243,22 +2243,22 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) struct ipv6_route_iter *iter = seq->private; const struct net_device *dev; - seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES - seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->rt6i_flags & RTF_GATEWAY) + if (rt->fib6_flags & RTF_GATEWAY) seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw); else seq_puts(seq, "00000000000000000000000000000000"); dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0, - rt->rt6i_flags, dev ? dev->name : ""); + rt->fib6_metric, atomic_read(&rt->fib6_ref), 0, + rt->fib6_flags, dev ? 
dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 102645298692..9ac5366064e3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1318,7 +1318,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } neigh->flags |= NTF_ROUTER; } else if (rt) { - rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); } if (rt) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f9c363327d62..e23ab0784e65 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -284,10 +284,10 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { }; static const struct fib6_info fib6_null_entry_template = { - .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32)0, - .rt6i_ref = ATOMIC_INIT(1), + .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), + .fib6_protocol = RTPROT_KERNEL, + .fib6_metric = ~(u32)0, + .fib6_ref = ATOMIC_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; @@ -429,8 +429,8 @@ static struct fib6_info *rt6_multipath_select(const struct net *net, if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) return match; - list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, - rt6i_siblings) { + list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, + fib6_siblings) { int nh_upper_bound; nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); @@ -472,12 +472,12 @@ static inline struct fib6_info *rt6_device_match(struct net *net, if (dev->ifindex == oif) return sprt; if (dev->flags & IFF_LOOPBACK) { - if (!sprt->rt6i_idev || - sprt->rt6i_idev->dev->ifindex != oif) { + if (!sprt->fib6_idev || + sprt->fib6_idev->dev->ifindex != oif) { if (flags & RT6_LOOKUP_F_IFACE) continue; if (local && - local->rt6i_idev->dev->ifindex == oif) + local->fib6_idev->dev->ifindex == oif) continue; } local = sprt; @@ -534,7 +534,7 @@ static void rt6_probe(struct fib6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. 
*/ - if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) + if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) return; nh_gw = &rt->fib6_nh.nh_gw; @@ -550,7 +550,7 @@ static void rt6_probe(struct fib6_info *rt) if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, neigh->updated + - rt->rt6i_idev->cnf.rtr_probe_interval)) { + rt->fib6_idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); @@ -587,7 +587,7 @@ static inline int rt6_check_dev(struct fib6_info *rt, int oif) if (!oif || dev->ifindex == oif) return 2; if ((dev->flags & IFF_LOOPBACK) && - rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif) + rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif) return 1; return 0; } @@ -597,8 +597,8 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; - if (rt->rt6i_flags & RTF_NONEXTHOP || - !(rt->rt6i_flags & RTF_GATEWAY)) + if (rt->fib6_flags & RTF_NONEXTHOP || + !(rt->fib6_flags & RTF_GATEWAY)) return RT6_NUD_SUCCEED; rcu_read_lock_bh(); @@ -632,7 +632,7 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict) if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF - m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; #endif if (strict & RT6_LOOKUP_F_REACHABLE) { int n = rt6_check_neigh(rt); @@ -648,7 +648,7 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, { int m; bool match_do_rr = false; - struct inet6_dev *idev = rt->rt6i_idev; + struct inet6_dev *idev = rt->fib6_idev; if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; @@ -694,7 +694,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { - if (rt->rt6i_metric != metric) { + if (rt->fib6_metric != metric) { cont = rt; break; } @@ -704,7 +704,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn, for (rt = leaf; rt && rt != rr_head; rt = rcu_dereference(rt->rt6_next)) { - if (rt->rt6i_metric != metric) { + if (rt->fib6_metric != metric) { cont = rt; break; } @@ -741,30 +741,30 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) 
*/ - key_plen = rt0->rt6i_dst.plen; + key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES - if (rt0->rt6i_src.plen) - key_plen = rt0->rt6i_src.plen; + if (rt0->fib6_src.plen) + key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) return net->ipv6.fib6_null_entry; - match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, + match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, &do_rr); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ - if (!next || next->rt6i_metric != rt0->rt6i_metric) + if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { - spin_lock_bh(&leaf->rt6i_table->tb6_lock); + spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ - if (next->rt6i_node) + if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); - spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } @@ -773,7 +773,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) { - return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); + return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -837,8 +837,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) - rt->rt6i_flags = RTF_ROUTEINFO | - (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->fib6_flags = RTF_ROUTEINFO | + (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { if (!addrconf_finite_timeout(lifetime)) @@ -861,13 +861,13 @@ static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh.nh_dev; - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { + if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->rt6i_dst.addr)) + !rt6_need_strict(&rt->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; @@ -939,7 +939,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.flags |= fib6_info_dst_flags(ort); - if (ort->rt6i_flags & RTF_REJECT) { + if (ort->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; } @@ -949,7 +949,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) if (ort->fib6_type == RTN_LOCAL) { rt->dst.input = ip6_input; - } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { + } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; } else { rt->dst.input = ip6_forward; @@ -979,17 +979,17 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { ip6_rt_init_dst(rt, ort); - rt->rt6i_dst = ort->rt6i_dst; - rt->rt6i_idev = ort->rt6i_idev; + rt->rt6i_dst = ort->fib6_dst; + rt->rt6i_idev = ort->fib6_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->rt6i_gateway = ort->fib6_nh.nh_gw; - rt->rt6i_flags = ort->rt6i_flags; + rt->rt6i_flags = ort->fib6_flags; rt6_set_from(rt, ort); #ifdef CONFIG_IPV6_SUBTREES - rt->rt6i_src = ort->rt6i_src; + rt->rt6i_src = ort->fib6_src; #endif - rt->rt6i_prefsrc = ort->rt6i_prefsrc; + 
rt->rt6i_prefsrc = ort->fib6_prefsrc; rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); } @@ -1064,7 +1064,7 @@ restart: } else { f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0) + if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) f6i = rt6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb, flags); } @@ -1142,7 +1142,7 @@ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, int err; struct fib6_table *table; - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); @@ -1182,8 +1182,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, rt->rt6i_dst.plen = 128; if (!rt6_is_gw_or_nonexthop(ort)) { - if (ort->rt6i_dst.plen != 128 && - ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + if (ort->fib6_dst.plen != 128 && + ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -1375,7 +1375,7 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; - mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6; + mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6; mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); @@ -1416,14 +1416,14 @@ static int rt6_insert_exception(struct rt6_info *nrt, * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (ort->rt6i_src.plen) + if (ort->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif /* Update rt6i_prefsrc as it could be changed * in rt6_remove_prefsrc() */ - nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + nrt->rt6i_prefsrc = ort->fib6_prefsrc; /* rt6_mtu_change() might lower mtu on ort. * Only insert this exception route if its mtu * is less than ort's mtu value. @@ -1457,9 +1457,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { - spin_lock_bh(&ort->rt6i_table->tb6_lock); + spin_lock_bh(&ort->fib6_table->tb6_lock); fib6_update_sernum(net, ort); - spin_unlock_bh(&ort->rt6i_table->tb6_lock); + spin_unlock_bh(&ort->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1514,7 +1514,7 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (rt->rt6i_src.plen) + if (rt->fib6_src.plen) src_key = saddr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); @@ -1551,7 +1551,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->rt6i_src.plen) + if (from->fib6_src.plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1592,7 +1592,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. 
*/ - if (from->rt6i_src.plen) + if (from->fib6_src.plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, @@ -1810,7 +1810,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: f6i = rt6_select(net, fn, oif, strict); - if (f6i->rt6i_nsiblings) + if (f6i->fib6_nsiblings) f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -1842,7 +1842,7 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(f6i->rt6i_flags & RTF_GATEWAY))) { + !(f6i->fib6_flags & RTF_GATEWAY))) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different @@ -2206,7 +2206,7 @@ static void ip6_link_failure(struct sk_buff *skb) struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->from->rt6i_node); + fn = rcu_dereference(rt->from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) fn->fn_sernum = -1; rcu_read_unlock(); @@ -2372,9 +2372,9 @@ restart: continue; if (fib6_check_expired(rt)) continue; - if (rt->rt6i_flags & RTF_REJECT) + if (rt->fib6_flags & RTF_REJECT) break; - if (!(rt->rt6i_flags & RTF_GATEWAY)) + if (!(rt->fib6_flags & RTF_GATEWAY)) continue; if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) continue; @@ -2400,7 +2400,7 @@ restart: if (!rt) rt = net->ipv6.fib6_null_entry; - else if (rt->rt6i_flags & RTF_REJECT) { + else if (rt->fib6_flags & RTF_REJECT) { ret = net->ipv6.ip6_null_entry; goto out; } @@ -2907,7 +2907,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->rt6i_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; addr_type = ipv6_addr_type(&cfg->fc_dst); @@ -2922,17 +2922,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); } - ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); - rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) + ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); + rt->fib6_dst.plen = cfg->fc_dst_len; + if (rt->fib6_dst.plen == 128) rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); - rt->rt6i_src.plen = cfg->fc_src_len; + ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); + rt->fib6_src.plen = cfg->fc_src_len; #endif - rt->rt6i_metric = cfg->fc_metric; + rt->fib6_metric = cfg->fc_metric; rt->fib6_nh.nh_weight = 1; rt->fib6_type = cfg->fc_type; @@ -2958,7 +2958,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } } - rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; goto install_route; } @@ -2992,21 +2992,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, err = -EINVAL; goto out; } - rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; - rt->rt6i_prefsrc.plen = 128; + rt->fib6_prefsrc.addr = cfg->fc_prefsrc; + rt->fib6_prefsrc.plen = 128; } else - rt->rt6i_prefsrc.plen = 0; + rt->fib6_prefsrc.plen = 0; - rt->rt6i_flags = cfg->fc_flags; + rt->fib6_flags = cfg->fc_flags; install_route: - if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && + if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 
!netif_carrier_ok(dev)) rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->fib6_nh.nh_dev = dev; - rt->rt6i_idev = idev; - rt->rt6i_table = table; + rt->fib6_idev = idev; + rt->fib6_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); @@ -3048,7 +3048,7 @@ static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) goto out; } - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); @@ -3075,10 +3075,10 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.fib6_null_entry) goto out_put; - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); - if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { + if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; /* prefer to send a single notification with all hops */ @@ -3096,8 +3096,8 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) } list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, - rt6i_siblings) { + &rt->fib6_siblings, + fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; @@ -3176,9 +3176,9 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) + if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; fib6_info_hold(rt); rcu_read_unlock(); @@ -3336,7 +3336,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net, for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.nh_dev->ifindex != ifindex) continue; - if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) + if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; @@ -3396,7 +3396,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->fib6_nh.nh_dev && - ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && + ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } @@ -3445,8 +3445,8 @@ static void __rt6_purge_dflt_routers(struct net *net, restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { + if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && + (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) { fib6_info_hold(rt); rcu_read_unlock(); ip6_del_rt(net, rt); @@ -3607,26 +3607,26 @@ struct fib6_info *addrconf_dst_alloc(struct net *net, rt->dst_nocount = true; in6_dev_hold(idev); - rt->rt6i_idev = idev; + rt->fib6_idev = idev; rt->dst_host = true; - rt->rt6i_protocol = RTPROT_KERNEL; - rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; + rt->fib6_protocol = RTPROT_KERNEL; + rt->fib6_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { rt->fib6_type = RTN_ANYCAST; - rt->rt6i_flags |= RTF_ANYCAST; + rt->fib6_flags |= RTF_ANYCAST; } else { rt->fib6_type = RTN_LOCAL; - rt->rt6i_flags |= RTF_LOCAL; + rt->fib6_flags |= RTF_LOCAL; } 
rt->fib6_nh.nh_gw = *addr; dev_hold(dev); rt->fib6_nh.nh_dev = dev; - rt->rt6i_dst.addr = *addr; - rt->rt6i_dst.plen = 128; + rt->fib6_dst.addr = *addr; + rt->fib6_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; - rt->rt6i_table = fib6_get_table(net, tb_id); + rt->fib6_table = fib6_get_table(net, tb_id); return rt; } @@ -3646,10 +3646,10 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && - ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ - rt->rt6i_prefsrc.plen = 0; + rt->fib6_prefsrc.plen = 0; /* need to update cache as well */ rt6_exceptions_remove_prefsrc(rt); spin_unlock_bh(&rt6_exception_lock); @@ -3675,7 +3675,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { return -1; } @@ -3707,16 +3707,16 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) struct fib6_info *iter; struct fib6_node *fn; - fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { - if (iter->rt6i_metric == rt->rt6i_metric && + if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; @@ -3726,7 +3726,7 @@ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && - rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + rt->fib6_idev->cnf.ignore_routes_with_linkdown)) return true; return false; @@ -3740,7 +3740,7 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) if (!rt6_is_dead(rt)) total += rt->fib6_nh.nh_weight; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) total += iter->fib6_nh.nh_weight; } @@ -3767,7 +3767,7 @@ static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) rt6_upper_bound_set(rt, &weight, total); - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } @@ -3780,7 +3780,7 @@ void rt6_multipath_rebalance(struct fib6_info *rt) * then there is no need to rebalance upon the removal of every * sibling route. 
*/ - if (!rt->rt6i_nsiblings || rt->should_flush) + if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to @@ -3831,7 +3831,7 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, if (rt->fib6_nh.nh_dev == dev) return true; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh.nh_dev == dev) return true; @@ -3843,7 +3843,7 @@ static void rt6_multipath_flush(struct fib6_info *rt) struct fib6_info *iter; rt->should_flush = 1; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } @@ -3856,7 +3856,7 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, if (rt->fib6_nh.nh_dev == down_dev || rt->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh.nh_dev == down_dev || iter->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; @@ -3872,7 +3872,7 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, if (rt->fib6_nh.nh_dev == dev) rt->fib6_nh.nh_flags |= nh_flags; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh.nh_dev == dev) iter->fib6_nh.nh_flags |= nh_flags; } @@ -3893,13 +3893,13 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) case NETDEV_DOWN: if (rt->should_flush) return -1; - if (!rt->rt6i_nsiblings) + if (!rt->fib6_nsiblings) return rt->fib6_nh.nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; count = rt6_multipath_dead_count(rt, dev); - if (rt->rt6i_nsiblings + 1 == count) { + if (rt->fib6_nsiblings + 1 == count) { rt6_multipath_flush(rt); return -1; } @@ -3911,7 +3911,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) return -2; case NETDEV_CHANGE: if (rt->fib6_nh.nh_dev != dev || - rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); @@ -4188,10 +4188,10 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, * nexthop. 
Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ - if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { - rt = list_first_entry(&rt_last->rt6i_siblings, + if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { + rt = list_first_entry(&rt_last->fib6_siblings, struct fib6_info, - rt6i_siblings); + fib6_siblings); } if (rt) @@ -4410,13 +4410,13 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) { int nexthop_len = 0; - if (rt->rt6i_nsiblings) { + if (rt->fib6_nsiblings) { nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); - nexthop_len *= rt->rt6i_nsiblings; + nexthop_len *= rt->fib6_nsiblings; } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4444,11 +4444,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; - if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + if (rt->fib6_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; } - if (rt->rt6i_flags & RTF_GATEWAY) { + if (rt->fib6_flags & RTF_GATEWAY) { if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) goto nla_put_failure; } @@ -4518,11 +4518,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; - rtm->rtm_dst_len = rt->rt6i_dst.plen; - rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_dst_len = rt->fib6_dst.plen; + rtm->rtm_src_len = rt->fib6_src.plen; rtm->rtm_tos = 0; - if (rt->rt6i_table) - table = rt->rt6i_table->tb6_id; + if (rt->fib6_table) + table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table; @@ -4532,9 +4532,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - rtm->rtm_protocol = rt->rt6i_protocol; + rtm->rtm_protocol = rt->fib6_protocol; - if (rt->rt6i_flags & RTF_CACHE) + if (rt->fib6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { @@ -4542,7 +4542,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) + if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { @@ -4550,12 +4550,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && - nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) + nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE - if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { + if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) @@ -4573,9 +4573,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } - if (rt->rt6i_prefsrc.plen) { + if (rt->fib6_prefsrc.plen) { struct in6_addr saddr_buf; - saddr_buf = rt->rt6i_prefsrc.addr; + saddr_buf = rt->fib6_prefsrc.addr; if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -4584,13 +4584,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; - if 
(nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) + if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) goto nla_put_failure; /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ - if (rt->rt6i_nsiblings) { + if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; struct nlattr *mp; @@ -4602,7 +4602,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, rt6i_siblings) { + &rt->fib6_siblings, fib6_siblings) { if (rt6_add_nexthop(skb, sibling) < 0) goto nla_put_failure; } @@ -4613,7 +4613,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } - if (rt->rt6i_flags & RTF_EXPIRES) { + if (rt->fib6_flags & RTF_EXPIRES) { expires = dst ? dst->expires : rt->expires; expires -= jiffies; } @@ -4621,7 +4621,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; - if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) goto nla_put_failure; @@ -4646,7 +4646,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg) /* user wants prefix routes only */ if (rtm->rtm_flags & RTM_F_PREFIX && - !(rt->rt6i_flags & RTF_PREFIX_RT)) { + !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ return 1; } @@ -4820,7 +4820,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; - net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev); + net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4834,7 +4834,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5157,7 +5157,7 @@ void __init ip6_route_init_special_entries(void) * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; - init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 360a9887c8c01a715b2b4b131f7c7462f7cce576 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:00 -0700 Subject: net/ipv6: Rename addrconf_dst_alloc addrconf_dst_alloc now returns a fib6_info. Update the name and its users to reflect the change. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 2 +- net/ipv6/addrconf.c | 28 ++++++++++++++-------------- net/ipv6/anycast.c | 14 +++++++------- net/ipv6/route.c | 44 ++++++++++++++++++++++---------------------- 4 files changed, 44 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index b4b85109984f..31e24f821ef6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -136,7 +136,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, +struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f38ea829c28b..e6dd886f8f1f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -994,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct fib6_info *rt = NULL; + struct fib6_info *f6i = NULL; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1036,16 +1036,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; + f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags); + if (IS_ERR(f6i)) { + err = PTR_ERR(f6i); + f6i = NULL; goto out; } if (net->ipv6.devconf_all->disable_policy || idev->cnf.disable_policy) - rt->dst_nopolicy = true; + f6i->dst_nopolicy = true; neigh_parms_data_state_setall(idev->nd_parms); @@ -1067,7 +1067,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->cstamp = ifa->tstamp = jiffies; ifa->tokenized = false; - ifa->rt = rt; + ifa->rt = f6i; ifa->idev = idev; in6_dev_hold(idev); @@ -1101,7 +1101,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: if (unlikely(err < 0)) { - fib6_info_release(rt); + fib6_info_release(f6i); if (ifa) { if (ifa->idev) @@ -3346,17 +3346,17 @@ static int fixup_permanent_addr(struct net *net, * case regenerate the host route. 
*/ if (!ifp->rt || !ifp->rt->fib6_node) { - struct fib6_info *rt, *prev; + struct fib6_info *f6i, *prev; - rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, - GFP_ATOMIC); - if (IS_ERR(rt)) - return PTR_ERR(rt); + f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, + GFP_ATOMIC); + if (IS_ERR(f6i)) + return PTR_ERR(f6i); /* ifp->rt can be accessed outside of rtnl */ spin_lock(&ifp->lock); prev = ifp->rt; - ifp->rt = rt; + ifp->rt = f6i; spin_unlock(&ifp->lock); fib6_info_release(prev); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index eed3bf63bd05..5c3e74d05018 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -247,7 +247,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct fib6_info *rt; + struct fib6_info *f6i; struct net *net; int err; @@ -268,14 +268,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); + f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC); + if (IS_ERR(f6i)) { + err = PTR_ERR(f6i); goto out; } - aca = aca_alloc(rt, addr); + aca = aca_alloc(f6i, addr); if (!aca) { - fib6_info_release(rt); + fib6_info_release(f6i); err = -ENOMEM; goto out; } @@ -289,7 +289,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); - ip6_ins_rt(net, rt); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e23ab0784e65..e44f82848143 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3591,44 +3591,44 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. */ -struct fib6_info *addrconf_dst_alloc(struct net *net, - struct inet6_dev *idev, - const struct in6_addr *addr, - bool anycast, gfp_t gfp_flags) +struct fib6_info *addrconf_f6i_alloc(struct net *net, + struct inet6_dev *idev, + const struct in6_addr *addr, + bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; - struct fib6_info *rt; + struct fib6_info *f6i; - rt = fib6_info_alloc(gfp_flags); - if (!rt) + f6i = fib6_info_alloc(gfp_flags); + if (!f6i) return ERR_PTR(-ENOMEM); - rt->dst_nocount = true; + f6i->dst_nocount = true; in6_dev_hold(idev); - rt->fib6_idev = idev; + f6i->fib6_idev = idev; - rt->dst_host = true; - rt->fib6_protocol = RTPROT_KERNEL; - rt->fib6_flags = RTF_UP | RTF_NONEXTHOP; + f6i->dst_host = true; + f6i->fib6_protocol = RTPROT_KERNEL; + f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { - rt->fib6_type = RTN_ANYCAST; - rt->fib6_flags |= RTF_ANYCAST; + f6i->fib6_type = RTN_ANYCAST; + f6i->fib6_flags |= RTF_ANYCAST; } else { - rt->fib6_type = RTN_LOCAL; - rt->fib6_flags |= RTF_LOCAL; + f6i->fib6_type = RTN_LOCAL; + f6i->fib6_flags |= RTF_LOCAL; } - rt->fib6_nh.nh_gw = *addr; + f6i->fib6_nh.nh_gw = *addr; dev_hold(dev); - rt->fib6_nh.nh_dev = dev; - rt->fib6_dst.addr = *addr; - rt->fib6_dst.plen = 128; + f6i->fib6_nh.nh_dev = dev; + f6i->fib6_dst.addr = *addr; + f6i->fib6_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; - rt->fib6_table = fib6_get_table(net, tb_id); + f6i->fib6_table = fib6_get_table(net, tb_id); - return rt; + return f6i; } /* remove deleted ip from prefsrc entries */ -- cgit v1.2.3 From 9ee8cbb2fd4a7d6f483a20c4b8e82d8b1cf685fa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:01 -0700 Subject: net/ipv6: Remove aca_idev aca_idev has only 1 user - inet6_fill_ifacaddr - and it only wants the device index which can be extracted from the fib6_info nexthop. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 - include/net/ip6_fib.h | 5 +++++ net/ipv6/addrconf.c | 3 ++- net/ipv6/anycast.c | 4 ---- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d6089b2e64fe..db389253dc2a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -143,7 +143,6 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; - struct inet6_dev *aca_idev; struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 994dd67207fb..bd11c990c353 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -410,6 +410,11 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); +static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_dev; +} + void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e6dd886f8f1f..6c42c5d5fafa 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4817,9 +4817,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, u32 portid, u32 seq, int event, unsigned int flags) { + struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); + int ifindex = dev ? 
dev->ifindex : 1; struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; - int ifindex = ifaca->aca_idev->dev->ifindex; if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 5c3e74d05018..0250d199e527 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -212,7 +212,6 @@ static void aca_get(struct ifacaddr6 *aca) static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { - in6_dev_put(ac->aca_idev); fib6_info_release(ac->aca_rt); kfree(ac); } @@ -221,7 +220,6 @@ static void aca_put(struct ifacaddr6 *ac) static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { - struct inet6_dev *idev = f6i->fib6_idev; struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); @@ -229,8 +227,6 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, return NULL; aca->aca_addr = *addr; - in6_dev_hold(idev); - aca->aca_idev = idev; fib6_info_hold(f6i); aca->aca_rt = f6i; aca->aca_users = 1; -- cgit v1.2.3 From 6fe749414446d86cfaae1c84bfdd556fe4fa8fca Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:03 -0700 Subject: net/ipv6: Change ip6_route_get_saddr to get dev from route Prior to 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") host routes and anycast routes were installed with the device set to loopback (or VRF device once that feature was added). In the older code dst.dev was set to loopback (needed for packet tx) and rt6i_idev was used to denote the actual interface. Commit 4832c30d5458 changed the code to have dst.dev pointing to the real device with the switch to lo or vrf device done on dst clones. As a consequence of this change ip6_route_get_saddr can just pass the nexthop device to ipv6_dev_get_saddr. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 31e24f821ef6..c0620330035c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -114,14 +114,15 @@ static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (f6i && f6i->fib6_prefsrc.plen) + if (f6i && f6i->fib6_prefsrc.plen) { *saddr = f6i->fib6_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); + } else { + struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + + err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); + } return err; } -- cgit v1.2.3 From 647d4c1363a85bec63ecf929d4ab4aae78b2a960 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:04 -0700 Subject: net/ipv6: Remove compare of fib6_idev from rt6_duplicate_nexthop After 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") the comparison of idev does not add value since it correlates to the nexthop device which is already compared. Remove the idev comparison. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index c0620330035c..8df4ff798b04 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -275,7 +275,6 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } -- cgit v1.2.3 From dcd1f572954f9d66d7b4a65df894ed5b4c467368 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:05 -0700 Subject: net/ipv6: Remove fib6_idev fib6_idev can be obtained from __in6_dev_get on the nexthop device rather than caching it in the fib6_info. Remove it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - net/ipv6/ip6_fib.c | 2 -- net/ipv6/route.c | 66 ++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index bd11c990c353..642211174692 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -147,7 +147,6 @@ struct fib6_info { unsigned int fib6_nsiblings; atomic_t fib6_ref; - struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 353f0b3e7b0d..f936e91a8c65 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -197,8 +197,6 @@ void fib6_info_destroy(struct fib6_info *f6i) } } - if (f6i->fib6_idev) - in6_dev_put(f6i->fib6_idev); if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8cf4f0623229..08cae7323776 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -525,15 +525,17 @@ static void rt6_probe(struct fib6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { + struct inet6_dev *idev; + if (neigh->nud_state & NUD_VALID) goto out; + idev = __in6_dev_get(dev); work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, - neigh->updated + - rt->fib6_idev->cnf.rtr_probe_interval)) { + neigh->updated + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); @@ -622,18 +624,32 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict) return m; } +/* called with rcu_read_lock held */ +static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) +{ + const struct net_device *dev = fib6_info_nh_dev(f6i); + bool rc = false; + + if (dev) { + const struct inet6_dev *idev = __in6_dev_get(dev); + + rc = !!idev->cnf.ignore_routes_with_linkdown; + } + + return rc; +} + static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, int *mpri, struct fib6_info *match, bool *do_rr) { int m; bool match_do_rr = false; - struct inet6_dev *idev = rt->fib6_idev; if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; - if (idev->cnf.ignore_routes_with_linkdown && + if (fib6_ignore_linkdown(rt) && rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -957,12 +973,12 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) static void
ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { + struct net_device *dev = fib6_info_nh_dev(ort); + ip6_rt_init_dst(rt, ort); rt->rt6i_dst = ort->fib6_dst; - rt->rt6i_idev = ort->fib6_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); + rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->fib6_flags; rt6_set_from(rt, ort); @@ -1355,7 +1371,18 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; - mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6; + if (rt->fib6_pmtu) { + mtu = rt->fib6_pmtu; + } else { + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + } + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); @@ -2985,11 +3012,13 @@ install_route: rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->fib6_nh.nh_dev = dev; - rt->fib6_idev = idev; rt->fib6_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); + if (idev) + in6_dev_put(idev); + return rt; out: if (dev) @@ -3425,8 +3454,11 @@ static void __rt6_purge_dflt_routers(struct net *net, restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; + if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) { + (!idev || idev->cnf.accept_ra != 2)) { fib6_info_hold(rt); rcu_read_unlock(); ip6_del_rt(net, rt); @@ -3585,10 +3617,6 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, return ERR_PTR(-ENOMEM); f6i->dst_nocount = true; - - in6_dev_hold(idev); - f6i->fib6_idev = idev; - f6i->dst_host = true; f6i->fib6_protocol = RTPROT_KERNEL; f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; @@ -3706,7 +3734,7 @@ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && - rt->fib6_idev->cnf.ignore_routes_with_linkdown)) + fib6_ignore_linkdown(rt))) return true; return false; @@ -4424,8 +4452,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; - if (rt->fib6_idev->cnf.ignore_routes_with_linkdown) + + rcu_read_lock(); + if (fib6_ignore_linkdown(rt)) *flags |= RTNH_F_DEAD; + rcu_read_unlock(); } if (rt->fib6_flags & RTF_GATEWAY) { @@ -4800,7 +4831,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; - net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4814,7 +4844,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. 
*/ - in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5137,7 +5166,6 @@ void __init ip6_route_init_special_entries(void) * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; - init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 69b693f0aefa0ed521e8bd02260523b5ae446ad7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:55:57 -0700 Subject: bpf: btf: Introduce BPF Type Format (BTF) This patch introduces BPF Type Format (BTF). BTF is the metadata format which describes the data types of BPF programs/maps. Hence, it focuses primarily on the C programming language, which modern BPF mainly targets. The first use case is to provide a generic pretty-print capability for a BPF map. BTF has its roots in CTF (Compact C-Type Format). To simplify the handling of BTF data, BTF removes the differences between small and big type/struct-member: BTF consistently uses u32 instead of supporting both "one u16" and "two u32 (+padding)" in describing type and struct-member. It also raises the limit on the number of types (and functions) from 0x7fff to 0x7fffffff. Due to the above changes, the format is not compatible with CTF. Hence, BTF starts with a new BTF_MAGIC and version number. This patch does the first verification pass over the BTF. The first pass checks: 1. metadata size (e.g. it does not go beyond the total BTF size) 2. name_offset is valid 3. each BTF_KIND (e.g. int, enum, struct, ...) does its own check of its metadata. Some other checks, like checking that a struct's member refers to a valid type, can only be done in the second pass. The second verification pass will be implemented in the next patch.
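To make the layout described above concrete, here is a minimal, hypothetical userspace sketch (not part of this patch): it builds the smallest blob this first verification pass would accept, a single BTF_KIND_INT describing a signed 32-bit "int" plus its string section. The struct definitions mirror the uapi header added below (the union in the real 'struct btf_type' is flattened to a plain u32 since INT only uses "size"); the buffer size and names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* userspace mirrors of the uapi structures added by this patch */
struct btf_header {
	uint16_t magic;
	uint8_t  version;
	uint8_t  flags;
	uint32_t parent_label;
	uint32_t parent_name;
	uint32_t label_off;
	uint32_t object_off;
	uint32_t func_off;
	uint32_t type_off;
	uint32_t str_off;
	uint32_t str_len;
};

struct btf_type {
	uint32_t name;
	uint32_t info;	/* bits 24-28: kind, bits 0-15: vlen */
	uint32_t size;	/* BTF_KIND_INT uses the "size" member */
};

#define BTF_MAGIC	0xeB9F
#define BTF_VERSION	1
#define BTF_KIND_INT	1
#define BTF_INT_SIGNED	0x1

int main(void)
{
	static const char strs[] = "\0int";	/* first byte must be '\0' */
	struct btf_type t = {
		.name = 1,			/* offset of "int" in strs */
		.info = BTF_KIND_INT << 24,	/* kind INT, vlen 0 */
		.size = 4,			/* the type is 4 bytes wide */
	};
	/* BTF_KIND_INT carries one extra u32: encoding/offset/bits */
	uint32_t int_data = (BTF_INT_SIGNED << 24) | 32;
	struct btf_header hdr = {
		.magic	  = BTF_MAGIC,
		.version  = BTF_VERSION,
		.type_off = 0,		/* relative to the end of the header */
		.str_off  = sizeof(t) + sizeof(int_data),
		.str_len  = sizeof(strs),
	};
	unsigned char blob[64];
	size_t off = 0;

	memcpy(blob + off, &hdr, sizeof(hdr));			off += sizeof(hdr);
	memcpy(blob + off, &t, sizeof(t));			off += sizeof(t);
	memcpy(blob + off, &int_data, sizeof(int_data));	off += sizeof(int_data);
	memcpy(blob + off, strs, sizeof(strs));			off += sizeof(strs);

	/* 57 bytes: 36 (header) + 12 (btf_type) + 4 (int_data) + 5 (strings) */
	printf("minimal BTF blob: %zu bytes\n", off);
	return 0;
}

Such a blob satisfies every check in the first pass below: the magic, version and flags match, type_off is 4-byte aligned and precedes str_off, the string section starts and ends with '\0', and the INT's nr_bits (32) rounds up to no more than its declared size.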
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 130 +++++++ kernel/bpf/Makefile | 1 + kernel/bpf/btf.c | 915 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1046 insertions(+) create mode 100644 include/uapi/linux/btf.h create mode 100644 kernel/bpf/btf.c (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h new file mode 100644 index 000000000000..74a30b1090df --- /dev/null +++ b/include/uapi/linux/btf.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_MAGIC_SWAP 0x9FeB +#define BTF_VERSION 1 +#define BTF_FLAGS_COMPR 0x01 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + + __u32 parent_label; + __u32 parent_name; + + /* All offsets are in bytes relative to the end of this header */ + __u32 label_off; /* offset of label section */ + __u32 object_off; /* offset of data object section*/ + __u32 func_off; /* offset of function section */ + __u32 type_off; /* offset of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x7fffffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x7fffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +/* The type id is referring to a parent BTF */ +#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) +#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) + +/* String is in the ELF string section */ +#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) +#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) + +struct btf_type { + __u32 name; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bits 31: root + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) + +#define BTF_KIND_UNKN 0 /* Unknown */ +#define BTF_KIND_INT 1 /* Integer */ +#define BTF_KIND_PTR 2 /* Pointer */ +#define BTF_KIND_ARRAY 3 /* Array */ +#define BTF_KIND_STRUCT 4 /* Struct */ +#define BTF_KIND_UNION 5 /* Union */ +#define BTF_KIND_ENUM 6 /* Enumeration */ +#define BTF_KIND_FWD 7 /* Forward */ +#define BTF_KIND_TYPEDEF 8 /* Typedef */ +#define BTF_KIND_VOLATILE 9 /* Volatile */ +#define BTF_KIND_CONST 10 /* Const */ +#define BTF_KIND_RESTRICT 11 /* Restrict */ +#define BTF_KIND_MAX 11 +#define NR_BTF_KINDS 12 + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. 
+ */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED 0x1 +#define BTF_INT_CHAR 0x2 +#define BTF_INT_BOOL 0x4 +#define BTF_INT_VARARGS 0x8 + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum { + __u32 name; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name; + __u32 type; + __u32 offset; /* offset in bits */ +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a713fd23ec88..35c485fa9ea3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,6 +4,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c new file mode 100644 index 000000000000..26e9ed7cea5f --- /dev/null +++ b/kernel/bpf/btf.c @@ -0,0 +1,915 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* BTF (BPF Type Format) is the metadata format which describes + * the data types of BPF programs/maps. Hence, it focuses primarily + * on the C programming language, which modern BPF mainly targets. + * + * ELF Section: + * ~~~~~~~~~~~ + * The BTF data is stored under the ".BTF" ELF section + * + * struct btf_type: + * ~~~~~~~~~~~~~~~ + * Each 'struct btf_type' object describes a C data type. + * Depending on the type it is describing, a 'struct btf_type' + * object may be followed by more data, e.g. to describe an array, + * 'struct btf_type' is followed by 'struct btf_array'. + * + * 'struct btf_type' and any extra data following it are + * 4 bytes aligned. + * + * Type section: + * ~~~~~~~~~~~~~ + * The BTF type section contains a list of 'struct btf_type' objects. + * Each one describes a C type. Recall from the above section + * that a 'struct btf_type' object could be immediately followed by extra + * data in order to describe some particular C types. + * + * type_id: + * ~~~~~~~ + * Each btf_type object is identified by a type_id, which is + * implied by the location of the btf_type object in + * the BTF type section. The first one has type_id 1, the second + * one has type_id 2, ...etc. Hence, an earlier btf_type has + * a smaller type_id. + * + * A btf_type object may refer to another btf_type object by using + * type_id (i.e. the "type" in the "struct btf_type"). + * + * NOTE that we cannot assume any reference-order. + * A btf_type object can refer to an earlier btf_type object + * but it can also refer to a later btf_type object.
+ * + * For example, to describe "const void *", a btf_type + * object describing "const" may refer to another btf_type + * object describing "void *". This type-reference is done + * by specifying type_id: + * + * [1] CONST (anon) type_id=2 + * [2] PTR (anon) type_id=0 + * + * The above is the btf_verifier debug log: + * - Each line starting with "[?]" is a btf_type object + * - [?] is the type_id of the btf_type object. + * - CONST/PTR is the BTF_KIND_XXX + * - "(anon)" is the name of the type. It just + * happens that CONST and PTR have no name. + * - type_id=XXX is the 'u32 type' in btf_type + * + * NOTE: "void" has type_id 0 + * + * String section: + * ~~~~~~~~~~~~~~ + * The BTF string section contains the names used by the type section. + * Each string is referred to by an "offset" from the beginning of the + * string section. + * + * Each string is '\0' terminated. + * + * The first character in the string section must be '\0', + * which is used to mean 'anonymous'. Some btf_type may not + * have a name. + */ + +/* BTF verification: + * + * To verify BTF data, two passes are needed. + * + * Pass #1 + * ~~~~~~~ + * The first pass collects all btf_type objects into + * an array: "btf->types". + * + * Depending on the C type that a btf_type is describing, + * a btf_type may be followed by extra data. We don't know + * how many btf_type objects there are, and more importantly we don't + * know where each btf_type is located in the type section. + * + * Without knowing the location of each type_id, most verifications + * cannot be done, e.g. an earlier btf_type may refer to a later + * btf_type (recall the "const void *" above), so we cannot + * check this type-reference in the first pass. + * + * The first pass still does some verification (e.g. + * checking that the name is a valid offset into the string section). + */ + +#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) +#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) +#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) +#define BITS_ROUNDUP_BYTES(bits) \ + (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) + +/* 16MB for 64k structs, each with 16 members, and + * a few MB of space for the string section. + * The hard limit is S32_MAX. + */ +#define BTF_MAX_SIZE (16 * 1024 * 1024) +/* 64k. We can raise it later. The hard limit is S32_MAX.
*/ +#define BTF_MAX_NR_TYPES 65535 + +#define for_each_member(i, struct_type, member) \ + for (i = 0, member = btf_type_member(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +struct btf { + union { + struct btf_header *hdr; + void *data; + }; + struct btf_type **types; + const char *strings; + void *nohdr_data; + u32 nr_types; + u32 types_size; + u32 data_size; +}; + +struct btf_verifier_env { + struct btf *btf; + struct bpf_verifier_log log; + u32 log_type_id; +}; + +static const char * const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", +}; + +struct btf_kind_operations { + s32 (*check_meta)(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left); + void (*log_details)(struct btf_verifier_env *env, + const struct btf_type *t); +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; +static struct btf_type btf_void; + +static const char *btf_int_encoding_str(u8 encoding) +{ + if (encoding == 0) + return "(none)"; + else if (encoding == BTF_INT_SIGNED) + return "SIGNED"; + else if (encoding == BTF_INT_CHAR) + return "CHAR"; + else if (encoding == BTF_INT_BOOL) + return "BOOL"; + else if (encoding == BTF_INT_VARARGS) + return "VARARGS"; + else + return "UNKN"; +} + +static u16 btf_type_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const struct btf_array *btf_type_array(const struct btf_type *t) +{ + return (const struct btf_array *)(t + 1); +} + +static const struct btf_member *btf_type_member(const struct btf_type *t) +{ + return (const struct btf_member *)(t + 1); +} + +static const struct btf_enum *btf_type_enum(const struct btf_type *t) +{ + return (const struct btf_enum *)(t + 1); +} + +static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) +{ + return kind_ops[BTF_INFO_KIND(t->info)]; +} + +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +{ + return !BTF_STR_TBL_ELF_ID(offset) && + BTF_STR_OFFSET(offset) < btf->hdr->str_len; +} + +static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (!BTF_STR_OFFSET(offset)) + return "(anon)"; + else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len) + return &btf->strings[BTF_STR_OFFSET(offset)]; + else + return "(invalid-name-offset)"; +} + +__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, + const struct btf_type *t, + bool log_details, + const char *fmt, ...) 
+{ + struct bpf_verifier_log *log = &env->log; + u8 kind = BTF_INFO_KIND(t->info); + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "[%u] %s %s%s", + env->log_type_id, + btf_kind_str[kind], + btf_name_by_offset(btf, t->name), + log_details ? " " : ""); + + if (log_details) + btf_type_ops(t)->log_details(env, t); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +#define btf_verifier_log_type(env, t, ...) \ + __btf_verifier_log_type((env), (t), true, __VA_ARGS__) +#define btf_verifier_log_basic(env, t, ...) \ + __btf_verifier_log_type((env), (t), false, __VA_ARGS__) + +__printf(4, 5) +static void btf_verifier_log_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + btf_name_by_offset(btf, member->name), + member->type, member->offset); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +static void btf_verifier_log_hdr(struct btf_verifier_env *env) +{ + struct bpf_verifier_log *log = &env->log; + const struct btf *btf = env->btf; + const struct btf_header *hdr; + + if (!bpf_verifier_log_needed(log)) + return; + + hdr = btf->hdr; + __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); + __btf_verifier_log(log, "version: %u\n", hdr->version); + __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); + __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label); + __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name); + __btf_verifier_log(log, "label_off: %u\n", hdr->label_off); + __btf_verifier_log(log, "object_off: %u\n", hdr->object_off); + __btf_verifier_log(log, "func_off: %u\n", hdr->func_off); + __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); + __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); + __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size); +} + +static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) +{ + struct btf *btf = env->btf; + + /* < 2 because +1 for btf_void which is always in btf->types[0]. + * btf_void is not accounted in btf->nr_types because btf_void + * does not come from the BTF file. 
+ */ + if (btf->types_size - btf->nr_types < 2) { + /* Expand 'types' array */ + + struct btf_type **new_types; + u32 expand_by, new_size; + + if (btf->types_size == BTF_MAX_NR_TYPES) { + btf_verifier_log(env, "Exceeded max num of types"); + return -E2BIG; + } + + expand_by = max_t(u32, btf->types_size >> 2, 16); + new_size = min_t(u32, BTF_MAX_NR_TYPES, + btf->types_size + expand_by); + + new_types = kvzalloc(new_size * sizeof(*new_types), + GFP_KERNEL | __GFP_NOWARN); + if (!new_types) + return -ENOMEM; + + if (btf->nr_types == 0) + new_types[0] = &btf_void; + else + memcpy(new_types, btf->types, + sizeof(*btf->types) * (btf->nr_types + 1)); + + kvfree(btf->types); + btf->types = new_types; + btf->types_size = new_size; + } + + btf->types[++(btf->nr_types)] = t; + + return 0; +} + +static void btf_free(struct btf *btf) +{ + kvfree(btf->types); + kvfree(btf->data); + kfree(btf); +} + +static void btf_verifier_env_free(struct btf_verifier_env *env) +{ + kfree(env); +} + +static s32 btf_int_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 int_data, nr_bits, meta_needed = sizeof(int_data); + u16 encoding; + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); + + if (nr_bits > BITS_PER_U64) { + btf_verifier_log_type(env, t, "nr_bits exceeds %zu", + BITS_PER_U64); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { + btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); + return -EINVAL; + } + + encoding = BTF_INT_ENCODING(int_data); + if (encoding && + encoding != BTF_INT_SIGNED && + encoding != BTF_INT_CHAR && + encoding != BTF_INT_BOOL && + encoding != BTF_INT_VARARGS) { + btf_verifier_log_type(env, t, "Unsupported encoding"); + return -ENOTSUPP; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_int_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + int int_data = btf_type_int(t); + + btf_verifier_log(env, + "size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, BTF_INT_OFFSET(int_data), + BTF_INT_BITS(int_data), + btf_int_encoding_str(BTF_INT_ENCODING(int_data))); +} + +static const struct btf_kind_operations int_ops = { + .check_meta = btf_int_check_meta, + .log_details = btf_int_log, +}; + +static int btf_ref_type_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (BTF_TYPE_PARENT(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static void btf_ref_type_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "type_id=%u", t->type); +} + +static struct btf_kind_operations modifier_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static struct btf_kind_operations ptr_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static struct btf_kind_operations fwd_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static s32 btf_array_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, 
+ u32 meta_left) +{ + const struct btf_array *array = btf_type_array(t); + u32 meta_needed = sizeof(*array); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + /* We are a little forgiving on array->index_type since + * the kernel is not using it. + */ + /* Array elem cannot be in type void, + * so !array->type is not allowed. + */ + if (!array->type || BTF_TYPE_PARENT(array->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_array_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_array *array = btf_type_array(t); + + btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", + array->type, array->index_type, array->nelems); +} + +static struct btf_kind_operations array_ops = { + .check_meta = btf_array_check_meta, + .log_details = btf_array_log, +}; + +static s32 btf_struct_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; + const struct btf_member *member; + struct btf *btf = env->btf; + u32 struct_size = t->size; + u32 meta_needed; + u16 i; + + meta_needed = btf_type_vlen(t) * sizeof(*member); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_member(i, t, member) { + if (!btf_name_offset_valid(btf, member->name)) { + btf_verifier_log_member(env, t, member, + "Invalid member name_offset:%u", + member->name); + return -EINVAL; + } + + /* A member cannot be in type void */ + if (!member->type || BTF_TYPE_PARENT(member->type)) { + btf_verifier_log_member(env, t, member, + "Invalid type_id"); + return -EINVAL; + } + + if (is_union && member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + btf_verifier_log_member(env, t, member, + "Member bits_offset exceeds its struct size"); + return -EINVAL; + } + + btf_verifier_log_member(env, t, member, NULL); + } + + return meta_needed; +} + +static void btf_struct_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static struct btf_kind_operations struct_ops = { + .check_meta = btf_struct_check_meta, + .log_details = btf_struct_log, +}; + +static s32 btf_enum_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum *enums = btf_type_enum(t); + struct btf *btf = env->btf; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size != sizeof(int)) { + btf_verifier_log_type(env, t, "Expected size:%zu", + sizeof(int)); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name); + return -EINVAL; + } + +
btf_verifier_log(env, "\t%s val=%d\n", + btf_name_by_offset(btf, enums[i].name), + enums[i].val); + } + + return meta_needed; +} + +static void btf_enum_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static struct btf_kind_operations enum_ops = { + .check_meta = btf_enum_check_meta, + .log_details = btf_enum_log, +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { + [BTF_KIND_INT] = &int_ops, + [BTF_KIND_PTR] = &ptr_ops, + [BTF_KIND_ARRAY] = &array_ops, + [BTF_KIND_STRUCT] = &struct_ops, + [BTF_KIND_UNION] = &struct_ops, + [BTF_KIND_ENUM] = &enum_ops, + [BTF_KIND_FWD] = &fwd_ops, + [BTF_KIND_TYPEDEF] = &modifier_ops, + [BTF_KIND_VOLATILE] = &modifier_ops, + [BTF_KIND_CONST] = &modifier_ops, + [BTF_KIND_RESTRICT] = &modifier_ops, +}; + +static s32 btf_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 saved_meta_left = meta_left; + s32 var_meta_size; + + if (meta_left < sizeof(*t)) { + btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", + env->log_type_id, meta_left, sizeof(*t)); + return -EINVAL; + } + meta_left -= sizeof(*t); + + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || + BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { + btf_verifier_log(env, "[%u] Invalid kind:%u", + env->log_type_id, BTF_INFO_KIND(t->info)); + return -EINVAL; + } + + if (!btf_name_offset_valid(env->btf, t->name)) { + btf_verifier_log(env, "[%u] Invalid name_offset:%u", + env->log_type_id, t->name); + return -EINVAL; + } + + var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); + if (var_meta_size < 0) + return var_meta_size; + + meta_left -= var_meta_size; + + return saved_meta_left - meta_left; +} + +static int btf_check_all_metas(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + struct btf_header *hdr; + void *cur, *end; + + hdr = btf->hdr; + cur = btf->nohdr_data + hdr->type_off; + end = btf->nohdr_data + hdr->str_off; + + env->log_type_id = 1; + while (cur < end) { + struct btf_type *t = cur; + s32 meta_size; + + meta_size = btf_check_meta(env, t, end - cur); + if (meta_size < 0) + return meta_size; + + btf_add_type(env, t); + cur += meta_size; + env->log_type_id++; + } + + return 0; +} + +static int btf_parse_type_sec(struct btf_verifier_env *env) +{ + return btf_check_all_metas(env); +} + +static int btf_parse_str_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + const char *start, *end; + + hdr = btf->hdr; + start = btf->nohdr_data + hdr->str_off; + end = start + hdr->str_len; + + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || + start[0] || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + + btf->strings = start; + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + u32 meta_left; + + if (btf->data_size < sizeof(*hdr)) { + btf_verifier_log(env, "btf_header not found"); + return -EINVAL; + } + + btf_verifier_log_hdr(env); + + hdr = btf->hdr; + if (hdr->magic != BTF_MAGIC) { + btf_verifier_log(env, "Invalid magic"); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + btf_verifier_log(env, "Unsupported version"); + return -ENOTSUPP; + } + + if (hdr->flags) { + btf_verifier_log(env, "Unsupported flags"); + return -ENOTSUPP; + } + + meta_left = btf->data_size - sizeof(*hdr); + if (!meta_left) { + btf_verifier_log(env, "No 
data"); + return -EINVAL; + } + + if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off || + /* Type section must align to 4 bytes */ + hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Invalid type_off"); + return -EINVAL; + } + + if (meta_left < hdr->str_off || + meta_left - hdr->str_off < hdr->str_len) { + btf_verifier_log(env, "Invalid str_off or str_len"); + return -EINVAL; + } + + btf->nohdr_data = btf->hdr + 1; + + return 0; +} + +static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, + u32 log_level, char __user *log_ubuf, u32 log_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL; + u8 *data; + int err; + + if (btf_data_size > BTF_MAX_SIZE) + return ERR_PTR(-E2BIG); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + if (log_level || log_ubuf || log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log->level = log_level; + log->ubuf = log_ubuf; + log->len_total = log_size; + + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) { + err = -EINVAL; + goto errout; + } + } + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + + data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); + if (!data) { + err = -ENOMEM; + goto errout; + } + + btf->data = data; + btf->data_size = btf_data_size; + + if (copy_from_user(data, btf_data, btf_data_size)) { + err = -EFAULT; + goto errout; + } + + env->btf = btf; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_parse_type_sec(env); + if (err) + goto errout; + + if (!err && log->level && bpf_verifier_log_full(log)) { + err = -ENOSPC; + goto errout; + } + + if (!err) { + btf_verifier_env_free(env); + return btf; + } + +errout: + btf_verifier_env_free(env); + if (btf) + btf_free(btf); + return ERR_PTR(err); +} -- cgit v1.2.3 From eb3f595dab40b61011c5f123507a7db2df6f0e65 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:55:58 -0700 Subject: bpf: btf: Validate type reference After collecting all btf_type in the first pass in an earlier patch, the second pass (in this patch) can validate the reference types (e.g. the referring type does exist and it does not refer to itself). While checking the reference type, it also gathers other information (e.g. the size of an array). This info will be useful in checking the struct's members in a later patch. They will also be useful in doing pretty print later. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 37 +++ kernel/bpf/btf.c | 666 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 702 insertions(+), 1 deletion(-) create mode 100644 include/linux/btf.h (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h new file mode 100644 index 000000000000..f14b60368753 --- /dev/null +++ b/include/linux/btf.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#ifndef _LINUX_BTF_H +#define _LINUX_BTF_H 1 + +#include + +struct btf; +struct btf_type; + +/* Figure out the size of a type_id. If type_id is a modifier + * (e.g. const), it will be resolved to find out the type with size. 
+ * + * For example: + * In describing "const void *", type_id is "const" and "const" + * refers to "void *". The return type will be "void *". + * + * If type_id is a simple "int", then return type will be "int". + * + * @btf: struct btf object + * @type_id: Find out the size of type_id. The type_id of the return + * type is set to *type_id. + * @ret_size: It can be NULL. If not NULL, the size of the return + * type is set to *ret_size. + * Return: The btf_type (resolved to another type with size info if needed). + * NULL is returned if type_id itself does not have size info + * (e.g. void) or it cannot be resolved to another type that + * has size info. + * *type_id and *ret_size will not be changed in the + * NULL return case. + */ +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, + u32 *ret_size); + +#endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 26e9ed7cea5f..18bf266ceeda 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -105,6 +105,50 @@ * * In the first pass, it still does some verifications (e.g. * checking the name is a valid offset to the string section). + * + * Pass #2 + * ~~~~~~~ + * The main focus is to resolve a btf_type that is referring + * to another type. + * + * We have to ensure the referring type: + * 1) does exist in the BTF (i.e. in btf->types[]) + * 2) does not cause a loop: + * struct A { + * struct B b; + * }; + * + * struct B { + * struct A a; + * }; + * + * btf_type_needs_resolve() decides if a btf_type needs + * to be resolved. + * + * The needs_resolve type implements the "resolve()" ops which + * essentially does a DFS and detects backedge. + * + * During resolve (or DFS), different C types have different + * "RESOLVED" conditions. + * + * When resolving a BTF_KIND_STRUCT, we need to resolve all its + * members because a member is always referring to another + * type. A struct's member can be treated as "RESOLVED" if + * it is referring to a BTF_KIND_PTR. Otherwise, the + * following valid C struct would be rejected: + * + * struct A { + * int m; + * struct A *a; + * }; + * + * When resolving a BTF_KIND_PTR, it needs to keep resolving if + * it is referring to another BTF_KIND_PTR. 
Otherwise, we cannot + * detect a pointer loop, e.g.: + * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + + * ^ | + * +-----------------------------------------+ + * */ #define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) @@ -127,12 +171,19 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_member_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_member(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + struct btf { union { struct btf_header *hdr; void *data; }; struct btf_type **types; + u32 *resolved_ids; + u32 *resolved_sizes; const char *strings; void *nohdr_data; u32 nr_types; @@ -140,10 +191,42 @@ struct btf { u32 data_size; }; +enum verifier_phase { + CHECK_META, + CHECK_TYPE, +}; + +struct resolve_vertex { + const struct btf_type *t; + u32 type_id; + u16 next_member; +}; + +enum visit_state { + NOT_VISITED, + VISITED, + RESOLVED, +}; + +enum resolve_mode { + RESOLVE_TBD, /* To Be Determined */ + RESOLVE_PTR, /* Resolving for Pointer */ + RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union + * or array + */ +}; + +#define MAX_RESOLVE_DEPTH 32 + struct btf_verifier_env { struct btf *btf; + u8 *visit_states; + struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; struct bpf_verifier_log log; u32 log_type_id; + u32 top_stack; + enum verifier_phase phase; + enum resolve_mode resolve_mode; }; static const char * const btf_kind_str[NR_BTF_KINDS] = { @@ -165,6 +248,8 @@ struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left); + int (*resolve)(struct btf_verifier_env *env, + const struct resolve_vertex *v); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); }; @@ -172,6 +257,101 @@ struct btf_kind_operations { static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; +static bool btf_type_is_modifier(const struct btf_type *t) +{ + /* Some of them is not strictly a C modifier + * but they are grouped into the same bucket + * for BTF concern: + * A type (t) that refers to another + * type through t->type AND its size cannot + * be determined without following the t->type. + * + * ptr does not fall into this bucket + * because its size is always sizeof(void *). + */ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return true; + } + + return false; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + /* void => no type and size info. + * Hence, FWD is also treated as void. + */ + return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + +static bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +/* What types need to be resolved? + * + * btf_type_is_modifier() is an obvious one. 
+ * + * btf_type_is_struct() because its member refers to + * another type (through member->type). + + * btf_type_is_array() because its element (array->type) + * refers to another type. Array can be thought of a + * special case of struct while array just has the same + * member-type repeated by array->nelems of times. + */ +static bool btf_type_needs_resolve(const struct btf_type *t) +{ + return btf_type_is_modifier(t) || + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t); +} + +/* t->size can be used */ +static bool btf_type_has_size(const struct btf_type *t) +{ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + return true; + } + + return false; +} + static const char *btf_int_encoding_str(u8 encoding) { if (encoding == 0) @@ -234,6 +414,14 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } +static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +{ + if (type_id > btf->nr_types) + return NULL; + + return btf->types[type_id]; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -308,6 +496,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + /* The CHECK_META phase already did a btf dump. + * + * If member is logged again, it must hit an error in + * parsing this member. It is useful to print out which + * struct this member belongs to. + */ + if (env->phase != CHECK_META) + btf_verifier_log_type(env, struct_type, NULL); + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", btf_name_by_offset(btf, member->name), member->type, member->offset); @@ -393,15 +590,183 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) static void btf_free(struct btf *btf) { kvfree(btf->types); + kvfree(btf->resolved_sizes); + kvfree(btf->resolved_ids); kvfree(btf->data); kfree(btf); } +static int env_resolve_init(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 nr_types = btf->nr_types; + u32 *resolved_sizes = NULL; + u32 *resolved_ids = NULL; + u8 *visit_states = NULL; + + /* +1 for btf_void */ + resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_sizes) + goto nomem; + + resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_ids) + goto nomem; + + visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states), + GFP_KERNEL | __GFP_NOWARN); + if (!visit_states) + goto nomem; + + btf->resolved_sizes = resolved_sizes; + btf->resolved_ids = resolved_ids; + env->visit_states = visit_states; + + return 0; + +nomem: + kvfree(resolved_sizes); + kvfree(resolved_ids); + kvfree(visit_states); + return -ENOMEM; +} + static void btf_verifier_env_free(struct btf_verifier_env *env) { + kvfree(env->visit_states); kfree(env); } +static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, + const struct btf_type *next_type) +{ + switch (env->resolve_mode) { + case RESOLVE_TBD: + /* int, enum or void is a sink */ + return !btf_type_needs_resolve(next_type); + case RESOLVE_PTR: + /* int, enum, void, struct or array is a sink for ptr */ + return !btf_type_is_modifier(next_type) && + !btf_type_is_ptr(next_type); + case RESOLVE_STRUCT_OR_ARRAY: + /* int, enum, void or ptr is a sink for struct and array */ + return !btf_type_is_modifier(next_type) && + 
!btf_type_is_array(next_type) && + !btf_type_is_struct(next_type); + default: + BUG_ON(1); + } +} + +static bool env_type_is_resolved(const struct btf_verifier_env *env, + u32 type_id) +{ + return env->visit_states[type_id] == RESOLVED; +} + +static int env_stack_push(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + struct resolve_vertex *v; + + if (env->top_stack == MAX_RESOLVE_DEPTH) + return -E2BIG; + + if (env->visit_states[type_id] != NOT_VISITED) + return -EEXIST; + + env->visit_states[type_id] = VISITED; + + v = &env->stack[env->top_stack++]; + v->t = t; + v->type_id = type_id; + v->next_member = 0; + + if (env->resolve_mode == RESOLVE_TBD) { + if (btf_type_is_ptr(t)) + env->resolve_mode = RESOLVE_PTR; + else if (btf_type_is_struct(t) || btf_type_is_array(t)) + env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; + } + + return 0; +} + +static void env_stack_set_next_member(struct btf_verifier_env *env, + u16 next_member) +{ + env->stack[env->top_stack - 1].next_member = next_member; +} + +static void env_stack_pop_resolved(struct btf_verifier_env *env, + u32 resolved_type_id, + u32 resolved_size) +{ + u32 type_id = env->stack[--(env->top_stack)].type_id; + struct btf *btf = env->btf; + + btf->resolved_sizes[type_id] = resolved_size; + btf->resolved_ids[type_id] = resolved_type_id; + env->visit_states[type_id] = RESOLVED; +} + +static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) +{ + return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; +} + +/* The input param "type_id" must point to a needs_resolve type */ +static const struct btf_type *btf_type_id_resolve(const struct btf *btf, + u32 *type_id) +{ + *type_id = btf->resolved_ids[*type_id]; + return btf_type_by_id(btf, *type_id); +} + +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, u32 *ret_size) +{ + const struct btf_type *size_type; + u32 size_type_id = *type_id; + u32 size = 0; + + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void_or_null(size_type)) + return NULL; + + if (btf_type_has_size(size_type)) { + size = size_type->size; + } else if (btf_type_is_array(size_type)) { + size = btf->resolved_sizes[size_type_id]; + } else if (btf_type_is_ptr(size_type)) { + size = sizeof(void *); + } else { + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + return NULL; + + size = btf->resolved_sizes[size_type_id]; + size_type_id = btf->resolved_ids[size_type_id]; + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void(size_type)) + return NULL; + } + + *type_id = size_type_id; + if (ret_size) + *ret_size = size; + + return size_type; +} + +static int btf_df_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + btf_verifier_log_basic(env, v->t, "Unsupported resolve"); + return -EINVAL; +} + static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -464,6 +829,7 @@ static void btf_int_log(struct btf_verifier_env *env, static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, + .resolve = btf_df_resolve, .log_details = btf_int_log, }; @@ -486,6 +852,104 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return 0; } +static int btf_modifier_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + const struct btf_type *next_type; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size = 0; + + next_type = btf_type_by_id(btf, 
next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "typedef void new_void", "const void"...etc */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* Figure out the resolved next_type_id with size.
+	 * They will be stored in the current modifier's
+	 * resolved_ids and resolved_sizes so that they can
+	 * save us some type-following when we use them later
+	 * (e.g. in pretty print).
+	 */
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+	return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+			   const struct resolve_vertex *v)
+{
+	const struct btf_type *next_type;
+	const struct btf_type *t = v->t;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "void *" */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+	 * the modifier may have stopped resolving when it was resolved
+	 * to a ptr (last-resolved-ptr).
+	 *
+	 * We now need to continue from the last-resolved-ptr to
+	 * ensure the last-resolved-ptr does not refer back to
+	 * the current ptr (t).
+ */ + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && + !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + +resolved: + env_stack_pop_resolved(env, next_type_id, 0); + + return 0; +} + static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -494,16 +958,19 @@ static void btf_ref_type_log(struct btf_verifier_env *env, static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_modifier_resolve, .log_details = btf_ref_type_log, }; static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_ptr_resolve, .log_details = btf_ref_type_log, }; static struct btf_kind_operations fwd_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_df_resolve, .log_details = btf_ref_type_log, }; @@ -542,6 +1009,61 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return meta_needed; } +static int btf_array_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_array *array = btf_type_array(v->t); + const struct btf_type *elem_type; + u32 elem_type_id = array->type; + struct btf *btf = env->btf; + u32 elem_size; + + elem_type = btf_type_by_id(btf, elem_type_id); + if (btf_type_is_void_or_null(elem_type)) { + btf_verifier_log_type(env, v->t, + "Invalid elem"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, elem_type) && + !env_type_is_resolved(env, elem_type_id)) + return env_stack_push(env, elem_type, elem_type_id); + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + if (!elem_type) { + btf_verifier_log_type(env, v->t, "Invalid elem"); + return -EINVAL; + } + + if (btf_type_is_int(elem_type)) { + int int_type_data = btf_type_int(elem_type); + u16 nr_bits = BTF_INT_BITS(int_type_data); + u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + + /* Put more restriction on array of int. The int cannot + * be a bit field and it must be either u8/u16/u32/u64. 
+ */ + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_type_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + btf_verifier_log_type(env, v->t, + "Invalid array of int"); + return -EINVAL; + } + } + + if (array->nelems && elem_size > U32_MAX / array->nelems) { + btf_verifier_log_type(env, v->t, + "Array size overflows U32_MAX"); + return -EINVAL; + } + + env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); + + return 0; +} + static void btf_array_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -553,6 +1075,7 @@ static void btf_array_log(struct btf_verifier_env *env, static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, + .resolve = btf_array_resolve, .log_details = btf_array_log, }; @@ -610,6 +1133,50 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return meta_needed; } +static int btf_struct_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_member *member; + u16 i; + + /* Before continue resolving the next_member, + * ensure the last member is indeed resolved to a + * type with size info. + */ + if (v->next_member) { + const struct btf_member *last_member; + u16 last_member_type_id; + + last_member = btf_type_member(v->t) + v->next_member - 1; + last_member_type_id = last_member->type; + if (WARN_ON_ONCE(!env_type_is_resolved(env, + last_member_type_id))) + return -EINVAL; + } + + for_each_member_from(i, v->next_member, v->t, member) { + u32 member_type_id = member->type; + const struct btf_type *member_type = btf_type_by_id(env->btf, + member_type_id); + + if (btf_type_is_void_or_null(member_type)) { + btf_verifier_log_member(env, v->t, member, + "Invalid member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, member_type) && + !env_type_is_resolved(env, member_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, member_type, member_type_id); + } + } + + env_stack_pop_resolved(env, 0, 0); + + return 0; +} + static void btf_struct_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -618,6 +1185,7 @@ static void btf_struct_log(struct btf_verifier_env *env, static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, + .resolve = btf_struct_resolve, .log_details = btf_struct_log, }; @@ -671,6 +1239,7 @@ static void btf_enum_log(struct btf_verifier_env *env, static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, + .resolve = btf_df_resolve, .log_details = btf_enum_log, }; @@ -751,9 +1320,104 @@ static int btf_check_all_metas(struct btf_verifier_env *env) return 0; } +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + else if (err == -EEXIST) + btf_verifier_log_type(env, t, "Loop detected"); + + return err; +} + +static bool btf_resolve_valid(struct btf_verifier_env *env, + const struct btf_type *t, + u32 type_id) +{ + struct btf *btf = env->btf; + + if (!env_type_is_resolved(env, type_id)) + return false; + + if (btf_type_is_struct(t)) + return 
!btf->resolved_ids[type_id] && + !btf->resolved_sizes[type_id]; + + if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + t = btf_type_id_resolve(btf, &type_id); + return t && !btf_type_is_modifier(t); + } + + if (btf_type_is_array(t)) { + const struct btf_array *array = btf_type_array(t); + const struct btf_type *elem_type; + u32 elem_type_id = array->type; + u32 elem_size; + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + return elem_type && !btf_type_is_modifier(elem_type) && + (array->nelems * elem_size == + btf->resolved_sizes[type_id]); + } + + return false; +} + +static int btf_check_all_types(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 type_id; + int err; + + err = env_resolve_init(env); + if (err) + return err; + + env->phase++; + for (type_id = 1; type_id <= btf->nr_types; type_id++) { + const struct btf_type *t = btf_type_by_id(btf, type_id); + + env->log_type_id = type_id; + if (btf_type_needs_resolve(t) && + !env_type_is_resolved(env, type_id)) { + err = btf_resolve(env, t, type_id); + if (err) + return err; + } + + if (btf_type_needs_resolve(t) && + !btf_resolve_valid(env, t, type_id)) { + btf_verifier_log_type(env, t, "Invalid resolve state"); + return -EINVAL; + } + } + + return 0; +} + static int btf_parse_type_sec(struct btf_verifier_env *env) { - return btf_check_all_metas(env); + int err; + + err = btf_check_all_metas(env); + if (err) + return err; + + return btf_check_all_types(env); } static int btf_parse_str_sec(struct btf_verifier_env *env) -- cgit v1.2.3 From b00b8daec828dd59af7d1f7a42acd6e5867f80c6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:00 -0700 Subject: bpf: btf: Add pretty print capability for data with BTF type info This patch adds pretty print capability for data with BTF type info. The current usage is to allow pretty print for a BPF map. The next few patches will allow a read() on a pinned map with BTF type info for its key and value. This patch uses the seq_printf() infra. 
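As an illustration (this example is not part of the patch; the struct is hypothetical), given BTF that describes a value type such as:

	struct value {
		int a;
		int b[2];
		const void *p;
	};

btf_type_seq_show() recurses through the BTF kinds and would render an instance of it roughly as {1,[2,3],<hashed-ptr>}: struct members are joined by "," ("|" for a union), array elements are wrapped in "[]", ints are printed in decimal (bitfields and oddly sized ints in hex), and pointer values are printed with the hashing "%p" format.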
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 2 + kernel/bpf/btf.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index f14b60368753..d8bdab0280ba 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -33,5 +33,7 @@ struct btf_type; const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m); #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4e31249f6c61..10ee41589da2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -256,6 +257,9 @@ struct btf_kind_operations { const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); + void (*seq_show)(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; @@ -781,6 +785,13 @@ static int btf_df_resolve(struct btf_verifier_env *env, return -EINVAL; } +static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m) +{ + seq_printf(m, "", BTF_INFO_KIND(t->info)); +} + static int btf_int_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, @@ -879,11 +890,96 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u16 nr_bits = BTF_INT_BITS(int_data); + u16 total_bits_offset; + u16 nr_copy_bytes; + u16 nr_copy_bits; + u8 nr_upper_bits; + union { + u64 u64_num; + u8 u8_nums[8]; + } print_num; + + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + nr_copy_bits = nr_bits + bits_offset; + nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); + + print_num.u64_num = 0; + memcpy(&print_num.u64_num, data, nr_copy_bytes); + + /* Ditch the higher order bits */ + nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); + if (nr_upper_bits) { + /* We need to mask out some bits of the upper byte. 
*/ + u8 mask = (1 << nr_upper_bits) - 1; + + print_num.u8_nums[nr_copy_bytes - 1] &= mask; + } + + print_num.u64_num >>= bits_offset; + + seq_printf(m, "0x%llx", print_num.u64_num); +} + +static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 encoding = BTF_INT_ENCODING(int_data); + bool sign = encoding & BTF_INT_SIGNED; + u32 nr_bits = BTF_INT_BITS(int_data); + + if (bits_offset || BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(nr_bits)) { + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + return; + } + + switch (nr_bits) { + case 64: + if (sign) + seq_printf(m, "%lld", *(s64 *)data); + else + seq_printf(m, "%llu", *(u64 *)data); + break; + case 32: + if (sign) + seq_printf(m, "%d", *(s32 *)data); + else + seq_printf(m, "%u", *(u32 *)data); + break; + case 16: + if (sign) + seq_printf(m, "%d", *(s16 *)data); + else + seq_printf(m, "%u", *(u16 *)data); + break; + case 8: + if (sign) + seq_printf(m, "%d", *(s8 *)data); + else + seq_printf(m, "%u", *(u8 *)data); + break; + default: + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + } +} + static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, .log_details = btf_int_log, + .seq_show = btf_int_seq_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, @@ -1054,6 +1150,24 @@ resolved: return 0; } +static void btf_modifier_seq_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + +static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + /* It is a hashed value */ + seq_printf(m, "%p", *(void **)data); +} + static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -1065,6 +1179,7 @@ static struct btf_kind_operations modifier_ops = { .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_modifier_seq_show, }; static struct btf_kind_operations ptr_ops = { @@ -1072,6 +1187,7 @@ static struct btf_kind_operations ptr_ops = { .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_ptr_seq_show, }; static struct btf_kind_operations fwd_ops = { @@ -1079,6 +1195,7 @@ static struct btf_kind_operations fwd_ops = { .resolve = btf_df_resolve, .check_member = btf_df_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, }; static int btf_array_check_member(struct btf_verifier_env *env, @@ -1209,11 +1326,36 @@ static void btf_array_log(struct btf_verifier_env *env, array->type, array->index_type, array->nelems); } +static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_array *array = btf_type_array(t); + const struct btf_kind_operations *elem_ops; + const struct btf_type *elem_type; + u32 i, elem_size, elem_type_id; + + elem_type_id = array->type; + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_ops = btf_type_ops(elem_type); + seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { + if (i) + seq_puts(m, ","); + 
+ elem_ops->seq_show(btf, elem_type, elem_type_id, data, + bits_offset, m); + data += elem_size; + } + seq_puts(m, "]"); +} + static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, .log_details = btf_array_log, + .seq_show = btf_array_seq_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, @@ -1361,11 +1503,39 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; + const struct btf_member *member; + u32 i; + + seq_puts(m, "{"); + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + u32 member_offset = member->offset; + u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + const struct btf_kind_operations *ops; + + if (i) + seq_puts(m, seq); + + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } + seq_puts(m, "}"); +} + static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, .log_details = btf_struct_log, + .seq_show = btf_struct_seq_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, @@ -1441,11 +1611,31 @@ static void btf_enum_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_enum *enums = btf_type_enum(t); + u32 i, nr_enums = btf_type_vlen(t); + int v = *(int *)data; + + for (i = 0; i < nr_enums; i++) { + if (v == enums[i].val) { + seq_printf(m, "%s", + btf_name_by_offset(btf, enums[i].name)); + return; + } + } + + seq_printf(m, "%d", v); +} + static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .log_details = btf_enum_log, + .seq_show = btf_enum_seq_show, }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { @@ -1782,3 +1972,11 @@ errout: btf_free(btf); return ERR_PTR(err); } + +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); +} -- cgit v1.2.3 From f56a653c1fd13a197076dec4461c656fd2adec73 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:01 -0700 Subject: bpf: btf: Add BPF_BTF_LOAD command This patch adds a BPF_BTF_LOAD command which 1) loads and verifies the BTF (implemented in earlier patches) 2) returns a BTF fd to userspace. In the next patch, the BTF fd can be specified during BPF_MAP_CREATE. It currently limits to CAP_SYS_ADMIN. 
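A minimal userspace sketch of the new command (illustrative only, not part of the patch; btf_data/btf_size are assumed to hold a valid BTF blob, and ptr_to_u64() is the usual cast-to-__u64 helper):

	union bpf_attr attr = {};
	char log[4096];
	int btf_fd;

	attr.btf = ptr_to_u64(btf_data);
	attr.btf_size = btf_size;
	attr.btf_log_buf = ptr_to_u64(log);
	attr.btf_log_size = sizeof(log);
	attr.btf_log_level = 1;

	btf_fd = syscall(__NR_bpf, BPF_BTF_LOAD, &attr, sizeof(attr));

On success, btf_fd refers to the loaded and verified BTF; on failure, the verifier's messages (if a log buffer was supplied) can be found in log.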
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 4 +++ include/uapi/linux/bpf.h | 9 +++++++ kernel/bpf/btf.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 ++++++++++++ 4 files changed, 97 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index d8bdab0280ba..a7c7072535ea 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -8,7 +8,11 @@ struct btf; struct btf_type; +union bpf_attr; +void btf_put(struct btf *btf); +int btf_new_fd(const union bpf_attr *attr); +struct btf *btf_get_by_fd(int fd); /* Figure out the size of a type_id. If type_id is a modifier * (e.g. const), it will be resolved to find out the type with size. * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9a2d1a04eb24..795bcd577750 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_cmd { BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, }; enum bpf_map_type { @@ -363,6 +364,14 @@ union bpf_attr { __u64 name; __u32 prog_fd; } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 10ee41589da2..2322340694cf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -190,6 +192,7 @@ struct btf { u32 nr_types; u32 types_size; u32 data_size; + refcount_t refcnt; }; enum verifier_phase { @@ -604,6 +607,17 @@ static void btf_free(struct btf *btf) kfree(btf); } +static void btf_get(struct btf *btf) +{ + refcount_inc(&btf->refcnt); +} + +void btf_put(struct btf *btf) +{ + if (btf && refcount_dec_and_test(&btf->refcnt)) + btf_free(btf); +} + static int env_resolve_init(struct btf_verifier_env *env) { struct btf *btf = env->btf; @@ -1963,6 +1977,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, if (!err) { btf_verifier_env_free(env); + btf_get(btf); return btf; } @@ -1980,3 +1995,55 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); } + +static int btf_release(struct inode *inode, struct file *filp) +{ + btf_put(filp->private_data); + return 0; +} + +static const struct file_operations btf_fops = { + .release = btf_release, +}; + +int btf_new_fd(const union bpf_attr *attr) +{ + struct btf *btf; + int fd; + + btf = btf_parse(u64_to_user_ptr(attr->btf), + attr->btf_size, attr->btf_log_level, + u64_to_user_ptr(attr->btf_log_buf), + attr->btf_log_size); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + fd = anon_inode_getfd("btf", &btf_fops, btf, + O_RDONLY | O_CLOEXEC); + if (fd < 0) + btf_put(btf); + + return fd; +} + +struct btf *btf_get_by_fd(int fd) +{ + struct btf *btf; + struct fd f; + + f = fdget(fd); + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &btf_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + btf = f.file->private_data; + btf_get(btf); + fdput(f); + + return btf; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ca46df19c9a..cd8ebadc66eb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include @@ -2023,6 +2024,19 @@ static 
int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_LOAD))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_new_fd(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2103,6 +2117,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
+	case BPF_BTF_LOAD:
+		err = bpf_btf_load(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
--
cgit v1.2.3


From 60197cfb6e11ffc03aa0ed23765b2f7e70b2e2d4 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:02 -0700
Subject: bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd

This patch adds BPF_OBJ_GET_INFO_BY_FD support to BTF fd. The original BTF data, which was used to create the BTF fd during the earlier BPF_BTF_LOAD call, will be returned. The userspace is expected to allocate a buffer for info.info and to set its size in info.info_len before calling BPF_OBJ_GET_INFO_BY_FD. The original BTF data is copied to the userspace buffer (info.info); only up to the user-specified info.info_len bytes will be copied. The original BTF data size is set in info.info_len. The userspace needs to check whether it is bigger than its allocated buffer size. If it is, the userspace should realloc with the kernel-returned info.info_len and call BPF_OBJ_GET_INFO_BY_FD again.

Signed-off-by: Martin KaFai Lau
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/btf.h  |  5 +++++
 kernel/bpf/btf.c     | 17 ++++++++++++++++-
 kernel/bpf/syscall.c |  2 ++
 3 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a7c7072535ea..a966dc6d61ee 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -10,9 +10,14 @@ struct btf;
 struct btf_type;
 union bpf_attr;
+extern const struct file_operations btf_fops;
+
 void btf_put(struct btf *btf);
 int btf_new_fd(const union bpf_attr *attr);
 struct btf *btf_get_by_fd(int fd);
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 /* Figure out the size of a type_id. If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
 *
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2322340694cf..eb56ac760547 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2002,7 +2002,7 @@ static int btf_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static const struct file_operations btf_fops = {
+const struct file_operations btf_fops = {
 	.release = btf_release,
 };
 
@@ -2047,3 +2047,18 @@ struct btf *btf_get_by_fd(int fd)
 
 	return btf;
 }
+
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	void __user *udata = u64_to_user_ptr(attr->info.info);
+	u32 copy_len = min_t(u32, btf->data_size,
+			     attr->info.info_len);
+
+	if (copy_to_user(udata, btf->data, copy_len) ||
+	    put_user(btf->data_size, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd8ebadc66eb..0a4924a0a8da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2017,6 +2017,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	else if (f.file->f_op == &bpf_map_fops)
 		err = bpf_map_get_info_by_fd(f.file->private_data,
 					     attr, uattr);
+	else if (f.file->f_op == &btf_fops)
+		err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;
--
cgit v1.2.3


From a26ca7c982cb576749cbdd01e8ecde4bf010d60a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:03 -0700
Subject: bpf: btf: Add pretty print support to the basic arraymap

This patch adds pretty print support to the basic arraymap. Support for other bpf maps can be added later.

This patch adds new attrs to the BPF_MAP_CREATE command to allow specifying the btf_fd, btf_key_id and btf_value_id. BPF_MAP_CREATE can then associate the BTF with the map if the map being created supports BTF.

A BTF-supported map needs to implement two new map ops, map_seq_show_elem() and map_check_btf(). This patch implements these new map ops for the basic arraymap.

It also adds file_operations, bpffs_map_fops, to the pinned map such that the pinned map can be opened and read. After that, the user has an intuitive way to do "cat bpffs/pathto/a-pinned-map" instead of getting an error.

bpffs_map_fops should not be extended further to support other operations. Other operations (e.g. write, key lookup...) should be realized by userspace tools (e.g. bpftool) through BPF_OBJ_GET_INFO_BY_FD, the map's lookup/update interface, etc. Follow-up patches will allow the userspace to obtain the BTF from a map-fd.

Here is a sample output when reading a pinned arraymap with the following map's value:

	struct map_value {
		int count_a;
		int count_b;
	};

cat /sys/fs/bpf/pinned_array_map:

0: {1,2}
1: {3,4}
2: {5,6}
...
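A userspace sketch of how such a map could be created (illustrative only, not part of the patch; btf_fd is assumed to come from an earlier BPF_BTF_LOAD, and key_id/value_id are the BTF type_ids of "int" and "struct map_value" within that blob):

	union bpf_attr attr = {};
	int map_fd;

	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(int);
	attr.value_size = sizeof(struct map_value);
	attr.max_entries = 3;
	attr.btf_fd = btf_fd;
	attr.btf_key_id = key_id;
	attr.btf_value_id = value_id;

	map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

map_create() then takes a reference on the BTF and stores it in the map, and a subsequent pin of map_fd under bpffs picks up the readable bpffs_map_fops.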
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 20 +++++- include/uapi/linux/bpf.h | 3 + kernel/bpf/arraymap.c | 50 +++++++++++++++ kernel/bpf/inode.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 32 +++++++++- 5 files changed, 254 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 95a7abd0ee92..ee5275e7d4df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,8 @@ struct perf_event; struct bpf_prog; struct bpf_map; struct sock; +struct seq_file; +struct btf; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -43,10 +45,14 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); + void (*map_seq_show_elem)(struct bpf_map *map, void *key, + struct seq_file *m); + int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, + u32 key_type_id, u32 value_type_id); }; struct bpf_map { - /* 1st cacheline with read-mostly members of which some + /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). */ const struct bpf_map_ops *ops ____cacheline_aligned; @@ -62,10 +68,13 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + u32 btf_key_id; + u32 btf_value_id; + struct btf *btf; bool unpriv_array; - /* 7 bytes hole */ + /* 55 bytes hole */ - /* 2nd cacheline with misc members to avoid false sharing + /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ struct user_struct *user ____cacheline_aligned; @@ -100,6 +109,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_support_seq_show(const struct bpf_map *map) +{ + return map->ops->map_seq_show_elem && map->ops->map_check_btf; +} + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 795bcd577750..c8383a289f7b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -280,6 +280,9 @@ union bpf_attr { */ char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_id; /* BTF type_id of the key */ + __u32 btf_value_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 14750e7c5ee4..02a189339381 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,11 +11,13 @@ * General Public License for more details. 
*/ #include +#include #include #include #include #include #include +#include #include "map_in_map.h" @@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } +static void array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = array_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + seq_printf(m, "%u: ", *(u32 *)key); + btf_type_seq_show(map->btf, map->btf_value_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + u32 int_data; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + /* bpf array can only take a u32 key. This check makes + * sure that the btf matches the attr used during map_create. + */ + if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || + BTF_INT_OFFSET(int_data)) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size > map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_seq_show_elem = array_map_seq_show_elem, + .map_check_btf = array_map_check_btf, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bf6da59ae0d0..a41343009ccc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } +struct map_iter { + void *key; + bool done; +}; + +static struct map_iter *map_iter(struct seq_file *m) +{ + return m->private; +} + +static struct bpf_map *seq_file_to_map(struct seq_file *m) +{ + return file_inode(m->file)->i_private; +} + +static void map_iter_free(struct map_iter *iter) +{ + if (iter) { + kfree(iter->key); + kfree(iter); + } +} + +static struct map_iter *map_iter_alloc(struct bpf_map *map) +{ + struct map_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); + if (!iter) + goto error; + + iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); + if (!iter->key) + goto error; + + return iter; + +error: + map_iter_free(iter); + return NULL; +} + +static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (map_iter(m)->done) + return NULL; + + if (unlikely(v == SEQ_START_TOKEN)) + goto done; + + if (map->ops->map_get_next_key(map, key, key)) { + map_iter(m)->done = true; + return NULL; + } + +done: + ++(*pos); + return key; +} + +static void *map_seq_start(struct seq_file *m, loff_t *pos) +{ + if (map_iter(m)->done) + return NULL; + + return *pos ? 
map_iter(m)->key : SEQ_START_TOKEN; +} + +static void map_seq_stop(struct seq_file *m, void *v) +{ +} + +static int map_seq_show(struct seq_file *m, void *v) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (unlikely(v == SEQ_START_TOKEN)) { + seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); + seq_puts(m, "# WARNING!! The output format will change\n"); + } else { + map->ops->map_seq_show_elem(map, key, m); + } + + return 0; +} + +static const struct seq_operations bpffs_map_seq_ops = { + .start = map_seq_start, + .next = map_seq_next, + .show = map_seq_show, + .stop = map_seq_stop, +}; + +static int bpffs_map_open(struct inode *inode, struct file *file) +{ + struct bpf_map *map = inode->i_private; + struct map_iter *iter; + struct seq_file *m; + int err; + + iter = map_iter_alloc(map); + if (!iter) + return -ENOMEM; + + err = seq_open(file, &bpffs_map_seq_ops); + if (err) { + map_iter_free(iter); + return err; + } + + m = file->private_data; + m->private = iter; + + return 0; +} + +static int bpffs_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + map_iter_free(map_iter(m)); + + return seq_release(inode, file); +} + +/* bpffs_map_fops should only implement the basic + * read operation for a BPF map. The purpose is to + * provide a simple user intuitive way to do + * "cat bpffs/pathto/a-pinned-map". + * + * Other operations (e.g. write, lookup...) should be realized by + * the userspace tools (e.g. bpftool) through the + * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update + * interface. + */ +static const struct file_operations bpffs_map_fops = { + .open = bpffs_map_open, + .read = seq_read, + .release = bpffs_map_release, +}; + static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, - const struct inode_operations *iops) + const struct inode_operations *iops, + const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); @@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, return PTR_ERR(inode); inode->i_op = iops; + inode->i_fop = fops; inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); @@ -167,12 +314,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); + struct bpf_map *map = arg; + + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, + map->btf ? 
&bpffs_map_fops : NULL); } static struct dentry * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0a4924a0a8da..fe23dc5a3ec4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -251,6 +252,7 @@ static void bpf_map_free_deferred(struct work_struct *work) bpf_map_uncharge_memlock(map); security_bpf_map_free(map); + btf_put(map->btf); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -416,7 +418,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_ifindex +#define BPF_MAP_CREATE_LAST_FIELD btf_value_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -450,6 +452,33 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); + if (bpf_map_support_seq_show(map) && + (attr->btf_key_id || attr->btf_value_id)) { + struct btf *btf; + + if (!attr->btf_key_id || !attr->btf_value_id) { + err = -EINVAL; + goto free_map_nouncharge; + } + + btf = btf_get_by_fd(attr->btf_fd); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); + goto free_map_nouncharge; + } + + err = map->ops->map_check_btf(map, btf, attr->btf_key_id, + attr->btf_value_id); + if (err) { + btf_put(btf); + goto free_map_nouncharge; + } + + map->btf = btf; + map->btf_key_id = attr->btf_key_id; + map->btf_value_id = attr->btf_value_id; + } + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; @@ -482,6 +511,7 @@ free_map: free_map_sec: security_bpf_map_free(map); free_map_nouncharge: + btf_put(map->btf); map->ops->map_free(map); return err; } -- cgit v1.2.3 From 9e4d60938a2b2aae3c2c213c923d9eb8d0a87ba2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:50 +0200 Subject: net: phy: mdio-gpio: Remove reset function The platform data can contain a function to call to reset the bit banging interface. It is not used, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 1 - include/linux/platform_data/mdio-gpio.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 369f5f35d6fd..570b87b82abc 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -141,7 +141,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, goto out; bitbang->ctrl.ops = &mdio_gpio_ops; - bitbang->ctrl.reset = pdata->reset; mdc = pdata->mdc; bitbang->mdc = gpio_to_desc(mdc); if (pdata->mdc_active_low) diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 11f00cdabe3d..6e8f01a570f2 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -26,8 +26,6 @@ struct mdio_gpio_platform_data { u32 phy_mask; u32 phy_ignore_ta_mask; int irqs[PHY_MAX_ADDR]; - /* reset callback */ - int (*reset)(struct mii_bus *bus); }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From a3283e2576736463e07ddc684efb2e587a3bbda2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:51 +0200 Subject: net: phy: mdio-bitbang: Remove reset support The mdio-gpio driver was the only user of the interface reset option. Since it no longer uses it, remove it from the bit banging code. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio-bitbang.c | 9 --------- include/linux/mdio-bitbang.h | 2 -- 2 files changed, 11 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-bitbang.c b/drivers/net/phy/mdio-bitbang.c index 403b085f0a89..15352f987bdf 100644 --- a/drivers/net/phy/mdio-bitbang.c +++ b/drivers/net/phy/mdio-bitbang.c @@ -205,14 +205,6 @@ static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) return 0; } -static int mdiobb_reset(struct mii_bus *bus) -{ - struct mdiobb_ctrl *ctrl = bus->priv; - if (ctrl->reset) - ctrl->reset(bus); - return 0; -} - struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl) { struct mii_bus *bus; @@ -225,7 +217,6 @@ struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl) bus->read = mdiobb_read; bus->write = mdiobb_write; - bus->reset = mdiobb_reset; bus->priv = ctrl; return bus; diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h index a8ac9cfa014c..5d71e8a8500f 100644 --- a/include/linux/mdio-bitbang.h +++ b/include/linux/mdio-bitbang.h @@ -33,8 +33,6 @@ struct mdiobb_ops { struct mdiobb_ctrl { const struct mdiobb_ops *ops; - /* reset callback */ - int (*reset)(struct mii_bus *bus); }; /* The returned bus is not yet registered with the phy layer. */ -- cgit v1.2.3 From c1b3eb04682f80af74572aa25601dbb555c6bc6e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:52 +0200 Subject: net: phy: mdio-gpio: remove support for ignoring turn around This is not needed any more by devices using platform data, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 1 - include/linux/platform_data/mdio-gpio.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 570b87b82abc..28ad9ca7b9e7 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -163,7 +163,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, new_bus->name = "GPIO Bitbanged MDIO"; new_bus->phy_mask = pdata->phy_mask; - new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask; memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq)); new_bus->parent = dev; diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 6e8f01a570f2..b8f914a30126 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -24,7 +24,6 @@ struct mdio_gpio_platform_data { bool mdo_active_low; u32 phy_mask; - u32 phy_ignore_ta_mask; int irqs[PHY_MAX_ADDR]; }; -- cgit v1.2.3 From 185a16b60a239f9db3fe8b8ce931a2fd61330853 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:53 +0200 Subject: net: phy: mdio-gpio: remove support for phy mask This is not needed any more by devices using platform data, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio-gpio.c | 4 ---- include/linux/platform_data/mdio-gpio.h | 1 - 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 28ad9ca7b9e7..676ba0dd04be 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -162,13 +162,9 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, new_bus->name = "GPIO Bitbanged MDIO"; - new_bus->phy_mask = pdata->phy_mask; memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq)); new_bus->parent = dev; - if (new_bus->phy_mask == ~0) - goto out_free_bus; - for (i = 0; i < PHY_MAX_ADDR; i++) if (!new_bus->irq[i]) new_bus->irq[i] = PHY_POLL; diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index b8f914a30126..7d55dfef56dc 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -23,7 +23,6 @@ struct mdio_gpio_platform_data { bool mdio_active_low; bool mdo_active_low; - u32 phy_mask; int irqs[PHY_MAX_ADDR]; }; -- cgit v1.2.3 From 68abb4f25d7eaf4d5f40108aa748621ddab68209 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:54 +0200 Subject: net: phy: mdio-gpio: Remove support for IRQs in platform data No current devices use IRQs in platform data, so remove support for it. The MDIO core will also initialise the new bus such that all addresses are polled, so remove the unneeded re-initialisation. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 7 ------- include/linux/platform_data/mdio-gpio.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 676ba0dd04be..0f8c748c8edd 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -130,7 +130,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, { struct mii_bus *new_bus; struct mdio_gpio_info *bitbang; - int i; int mdc, mdio, mdo; unsigned long mdc_flags = GPIOF_OUT_INIT_LOW; unsigned long mdio_flags = GPIOF_DIR_IN; @@ -161,14 +160,8 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, goto out; new_bus->name = "GPIO Bitbanged MDIO"; - - memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq)); new_bus->parent = dev; - for (i = 0; i < PHY_MAX_ADDR; i++) - if (!new_bus->irq[i]) - new_bus->irq[i] = PHY_POLL; - if (bus_id != -1) snprintf(new_bus->id, MII_BUS_ID_SIZE, "gpio-%x", bus_id); else diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 7d55dfef56dc..af3be0c4ff9b 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -22,8 +22,6 @@ struct mdio_gpio_platform_data { bool mdc_active_low; bool mdio_active_low; bool mdo_active_low; - - int irqs[PHY_MAX_ADDR]; }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From c82fc4814a93f36c9b536b5af8dd617e4ee1380f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:55 +0200 Subject: net: phy: mdio-gpio: Swap to using gpio descriptors This simplifies the code, removing the need to handle active low flags, etc. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio-gpio.c | 73 ++++++++------------------------- include/linux/platform_data/mdio-gpio.h | 10 ++--- 2 files changed, 20 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 0f8c748c8edd..b999f32374e2 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -35,35 +35,25 @@ struct mdio_gpio_info { struct gpio_desc *mdc, *mdio, *mdo; }; -static void *mdio_gpio_of_get_data(struct platform_device *pdev) +static void *mdio_gpio_of_get_data(struct device *dev) { - struct device_node *np = pdev->dev.of_node; struct mdio_gpio_platform_data *pdata; - enum of_gpio_flags flags; - int ret; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return NULL; - ret = of_get_gpio_flags(np, 0, &flags); - if (ret < 0) - return NULL; - - pdata->mdc = ret; - pdata->mdc_active_low = flags & OF_GPIO_ACTIVE_LOW; + pdata->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW); + if (IS_ERR(pdata->mdc)) + return ERR_CAST(pdata->mdc); - ret = of_get_gpio_flags(np, 1, &flags); - if (ret < 0) - return NULL; - pdata->mdio = ret; - pdata->mdio_active_low = flags & OF_GPIO_ACTIVE_LOW; + pdata->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN); + if (IS_ERR(pdata->mdio)) + return ERR_CAST(pdata->mdio); - ret = of_get_gpio_flags(np, 2, &flags); - if (ret > 0) { - pdata->mdo = ret; - pdata->mdo_active_low = flags & OF_GPIO_ACTIVE_LOW; - } + pdata->mdo = devm_gpiod_get_index_optional(dev, NULL, 2, GPIOD_OUT_LOW); + if (IS_ERR(pdata->mdo)) + return ERR_CAST(pdata->mdo); return pdata; } @@ -130,34 +120,19 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, { struct mii_bus *new_bus; struct mdio_gpio_info *bitbang; - int mdc, mdio, mdo; - unsigned long mdc_flags = GPIOF_OUT_INIT_LOW; - unsigned long mdio_flags = GPIOF_DIR_IN; - unsigned long mdo_flags = GPIOF_OUT_INIT_HIGH; bitbang = devm_kzalloc(dev, sizeof(*bitbang), GFP_KERNEL); if (!bitbang) - goto out; + return NULL; bitbang->ctrl.ops = &mdio_gpio_ops; - mdc = pdata->mdc; - bitbang->mdc = gpio_to_desc(mdc); - if (pdata->mdc_active_low) - mdc_flags = GPIOF_OUT_INIT_HIGH | GPIOF_ACTIVE_LOW; - mdio = pdata->mdio; - bitbang->mdio = gpio_to_desc(mdio); - if (pdata->mdio_active_low) - mdio_flags |= GPIOF_ACTIVE_LOW; - mdo = pdata->mdo; - if (mdo) { - bitbang->mdo = gpio_to_desc(mdo); - if (pdata->mdo_active_low) - mdo_flags = GPIOF_OUT_INIT_LOW | GPIOF_ACTIVE_LOW; - } + bitbang->mdc = pdata->mdc; + bitbang->mdio = pdata->mdio; + bitbang->mdo = pdata->mdo; new_bus = alloc_mdio_bitbang(&bitbang->ctrl); if (!new_bus) - goto out; + return NULL; new_bus->name = "GPIO Bitbanged MDIO"; new_bus->parent = dev; @@ -167,23 +142,9 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, else strncpy(new_bus->id, "gpio", MII_BUS_ID_SIZE); - if (devm_gpio_request_one(dev, mdc, mdc_flags, "mdc")) - goto out_free_bus; - - if (devm_gpio_request_one(dev, mdio, mdio_flags, "mdio")) - goto out_free_bus; - - if (mdo && devm_gpio_request_one(dev, mdo, mdo_flags, "mdo")) - goto out_free_bus; - dev_set_drvdata(dev, new_bus); return new_bus; - -out_free_bus: - free_mdio_bitbang(new_bus); -out: - return NULL; } static void mdio_gpio_bus_deinit(struct device *dev) @@ -208,7 +169,7 @@ static int mdio_gpio_probe(struct platform_device *pdev) int ret, bus_id; if (pdev->dev.of_node) { - pdata = mdio_gpio_of_get_data(pdev); + pdata = mdio_gpio_of_get_data(&pdev->dev); bus_id = 
of_alias_get_id(pdev->dev.of_node, "mdio-gpio"); if (bus_id < 0) { dev_warn(&pdev->dev, "failed to get alias id\n"); diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index af3be0c4ff9b..bd91fa98a3aa 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -15,13 +15,9 @@ struct mdio_gpio_platform_data { /* GPIO numbers for bus pins */ - unsigned int mdc; - unsigned int mdio; - unsigned int mdo; - - bool mdc_active_low; - bool mdio_active_low; - bool mdo_active_low; + struct gpio_desc *mdc; + struct gpio_desc *mdio; + struct gpio_desc *mdo; }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From fb78a95e22123da57cb79b98bdc62eb33965ca8a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:58 +0200 Subject: net: phy: mdio-gpio: Add #defines for the GPIO indices The GPIOs are described in device tree using a list, without names. Add defines to indicate what each index in the list means. These defines should also be used by platform devices passing GPIOs via a GPIO lookup table. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 10 +++++++--- include/linux/mdio-gpio.h | 9 +++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 include/linux/mdio-gpio.h (limited to 'include') diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 74b982835ac1..281c905ef9fd 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -38,15 +40,17 @@ struct mdio_gpio_info { static int mdio_gpio_get_data(struct device *dev, struct mdio_gpio_info *bitbang) { - bitbang->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW); + bitbang->mdc = devm_gpiod_get_index(dev, NULL, MDIO_GPIO_MDC, + GPIOD_OUT_LOW); if (IS_ERR(bitbang->mdc)) return PTR_ERR(bitbang->mdc); - bitbang->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN); + bitbang->mdio = devm_gpiod_get_index(dev, NULL, MDIO_GPIO_MDIO, + GPIOD_IN); if (IS_ERR(bitbang->mdio)) return PTR_ERR(bitbang->mdio); - bitbang->mdo = devm_gpiod_get_index_optional(dev, NULL, 2, + bitbang->mdo = devm_gpiod_get_index_optional(dev, NULL, MDIO_GPIO_MDO, GPIOD_OUT_LOW); return PTR_ERR_OR_ZERO(bitbang->mdo); } diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h new file mode 100644 index 000000000000..cea443a672cb --- /dev/null +++ b/include/linux/mdio-gpio.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MDIO_GPIO_H +#define __LINUX_MDIO_GPIO_H + +#define MDIO_GPIO_MDC 0 +#define MDIO_GPIO_MDIO 1 +#define MDIO_GPIO_MDO 2 + +#endif -- cgit v1.2.3 From 0207dd1173fe31c153ffd439c4bb33d1341829b1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:59 +0200 Subject: net: phy: mdio-gpio: Remove redundant platform data header The platform data header file is now unused. Remove it, but add an extra include which it brought in. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 - drivers/net/phy/mdio-gpio.c | 2 +- include/linux/platform_data/mdio-gpio.h | 23 ----------------------- 3 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index b60179d948bb..a7321687cae9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5321,7 +5321,6 @@ F: include/linux/*mdio*.h F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h -F: include/linux/platform_data/mdio-gpio.h F: include/linux/platform_data/mdio-bcm-unimac.h F: include/trace/events/mdio.h F: include/uapi/linux/mdio.h diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 281c905ef9fd..b501221819e1 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h deleted file mode 100644 index bd91fa98a3aa..000000000000 --- a/include/linux/platform_data/mdio-gpio.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * MDIO-GPIO bus platform data structures - * - * Copyright (C) 2008, Paulius Zaleckas - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ - -#ifndef __LINUX_MDIO_GPIO_H -#define __LINUX_MDIO_GPIO_H - -#include - -struct mdio_gpio_platform_data { - /* GPIO numbers for bus pins */ - struct gpio_desc *mdc; - struct gpio_desc *mdio; - struct gpio_desc *mdo; -}; - -#endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From 27cced20192d25ae528db8fe694c95c7656f3d56 Mon Sep 17 00:00:00 2001 From: Michael Karcher Date: Thu, 19 Apr 2018 14:05:22 +1200 Subject: net-next: ax88796: Add block_input/output hooks to ax_plat_data Add platform specific hooks for block transfer reads/writes of packet buffer data, superseding the default provided ax_block_input/output. Currently used for m68k Amiga XSurf100. Signed-off-by: Michael Karcher Signed-off-by: Michael Schmitz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/8390/ax88796.c | 10 ++++++++-- include/net/ax88796.h | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index d3f30f1c40b2..939a572a19ff 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -758,8 +758,14 @@ static int ax_init_dev(struct net_device *dev) #endif ei_local->reset_8390 = &ax_reset_8390; - ei_local->block_input = &ax_block_input; - ei_local->block_output = &ax_block_output; + if (ax->plat->block_input) + ei_local->block_input = ax->plat->block_input; + else + ei_local->block_input = &ax_block_input; + if (ax->plat->block_output) + ei_local->block_output = ax->plat->block_output; + else + ei_local->block_output = &ax_block_output; ei_local->get_8390_hdr = &ax_get_8390_hdr; ei_local->priv = 0; diff --git a/include/net/ax88796.h b/include/net/ax88796.h index b9a3beca0ce4..363b0ca5f7e8 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -12,6 +12,9 @@ #ifndef __NET_AX88796_PLAT_H #define __NET_AX88796_PLAT_H +struct sk_buff; +struct net_device; + #define AXFLG_HAS_EEPROM (1<<0) #define AXFLG_MAC_FROMDEV (1<<1) /* device already has MAC */ #define AXFLG_HAS_93CX6 (1<<2) /* use eeprom_93cx6 driver */ @@ -26,6 +29,12 @@ struct ax_plat_data { u32 *reg_offsets; /* register offsets */ u8 *mac_addr; /* MAC addr (only used when AXFLG_MAC_FROMPLATFORM is used */ + + /* uses default ax88796 buffer if set to NULL */ + void (*block_output)(struct net_device *dev, int count, + const unsigned char *buf, int star_page); + void (*block_input)(struct net_device *dev, int count, + struct sk_buff *skb, int ring_offset); }; #endif /* __NET_AX88796_PLAT_H */ -- cgit v1.2.3 From cec4c1c54a643608c262bd9bb72cf9bbec64f44a Mon Sep 17 00:00:00 2001 From: Michael Karcher Date: Thu, 19 Apr 2018 14:05:23 +1200 Subject: net-next: ax88796: add interrupt status callback to platform data To be able to tell the ax88796 driver whether it is sensible to enter the 8390 interrupt handler, an "is this interrupt caused by the 88796" callback has been added to the ax_plat_data structure (with NULL being compatible to the previous behaviour). Signed-off-by: Michael Karcher Signed-off-by: Michael Schmitz Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/ax88796.c | 23 +++++++++++++++++++++-- include/net/ax88796.h | 5 +++++ 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index 939a572a19ff..d283ed092519 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -163,6 +163,21 @@ static void ax_reset_8390(struct net_device *dev) ei_outb(ENISR_RESET, addr + EN0_ISR); /* Ack intr. */ } +/* Wrapper for __ei_interrupt for platforms that have a platform-specific + * way to find out whether the interrupt request might be caused by + * the ax88796 chip. 
+ */ +static irqreturn_t ax_ei_interrupt_filtered(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct ax_device *ax = to_ax_dev(dev); + struct platform_device *pdev = to_platform_device(dev->dev.parent); + + if (!ax->plat->check_irq(pdev)) + return IRQ_NONE; + + return ax_ei_interrupt(irq, dev_id); +} static void ax_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr, int ring_page) @@ -482,8 +497,12 @@ static int ax_open(struct net_device *dev) if (ret) goto failed_mii; - ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags, - dev->name, dev); + if (ax->plat->check_irq) + ret = request_irq(dev->irq, ax_ei_interrupt_filtered, + ax->irqflags, dev->name, dev); + else + ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags, + dev->name, dev); if (ret) goto failed_request_irq; diff --git a/include/net/ax88796.h b/include/net/ax88796.h index 363b0ca5f7e8..84b3785d0e66 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -14,6 +14,7 @@ struct sk_buff; struct net_device; +struct platform_device; #define AXFLG_HAS_EEPROM (1<<0) #define AXFLG_MAC_FROMDEV (1<<1) /* device already has MAC */ @@ -35,6 +36,10 @@ struct ax_plat_data { const unsigned char *buf, int star_page); void (*block_input)(struct net_device *dev, int count, struct sk_buff *skb, int ring_offset); + /* returns nonzero if a pending interrupt request might be caused by + * the ax88796. Handles all interrupts if set to NULL + */ + int (*check_irq)(struct platform_device *pdev); }; #endif /* __NET_AX88796_PLAT_H */ -- cgit v1.2.3 From a597043304a13defc646bb1f16514e4903b36c3c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 10 Apr 2018 15:06:05 +0200 Subject: clk: Remove clk_init_cb typedef Since commit c08ee14cc6634457 ("clk: ti: change clock init to use generic of_clk_init"), there is only a single (private) user left of the (public) clk_init_cb typedef. Hence expand its single user in the core clock code, and remove the typedef. Signed-off-by: Geert Uytterhoeven Signed-off-by: Michael Turquette Link: lkml.kernel.org/r/1523365565-17124-1-git-send-email-geert+renesas@glider.be --- drivers/clk/clk.c | 2 +- include/linux/clk-provider.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index ea67ac81c6f9..972f1ea4b63f 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3906,7 +3906,7 @@ int of_clk_parent_fill(struct device_node *np, const char **parents, EXPORT_SYMBOL_GPL(of_clk_parent_fill); struct clock_provider { - of_clk_init_cb_t clk_init_cb; + void (*clk_init_cb)(struct device_node *); struct device_node *np; struct list_head node; }; diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 210a890008f9..410a8627b8c0 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -802,8 +802,6 @@ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate); struct of_device_id; -typedef void (*of_clk_init_cb_t)(struct device_node *); - struct clk_onecell_data { struct clk **clks; unsigned int clk_num; -- cgit v1.2.3 From 669ca0303ac93adba0e046d414165250861efdb7 Mon Sep 17 00:00:00 2001 From: ryang Date: Thu, 19 Apr 2018 12:18:50 -0400 Subject: regulator: tps6586x: Add support for TPS658624 This version exists in the Samsung Galaxy Tab 10.1 which is based on the Nvidia Tegra 2 board. The TPS658624 has the same SM2 voltage table as TPS658623. 
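For reference, the table selection in find_regulator_info() then reduces to this sketch (condensed from the hunk below; both version IDs share one table):

	switch (version) {
	case TPS658623:
	case TPS658624:
		table = tps658623_regulator;
		num = ARRAY_SIZE(tps658623_regulator);
		break;
	}
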
Signed-off-by: ryang Acked-by: Lee Jones Signed-off-by: Mark Brown --- drivers/regulator/tps6586x-regulator.c | 1 + include/linux/mfd/tps6586x.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/drivers/regulator/tps6586x-regulator.c b/drivers/regulator/tps6586x-regulator.c index 9e9d22038017..ba3dae7b2d2d 100644 --- a/drivers/regulator/tps6586x-regulator.c +++ b/drivers/regulator/tps6586x-regulator.c @@ -342,6 +342,7 @@ static struct tps6586x_regulator *find_regulator_info(int id, int version) switch (version) { case TPS658623: + case TPS658624: table = tps658623_regulator; num = ARRAY_SIZE(tps658623_regulator); break; diff --git a/include/linux/mfd/tps6586x.h b/include/linux/mfd/tps6586x.h index 2fe68e481230..b19c2801a30e 100644 --- a/include/linux/mfd/tps6586x.h +++ b/include/linux/mfd/tps6586x.h @@ -18,6 +18,7 @@ #define TPS658621A 0x15 #define TPS658621CD 0x2c #define TPS658623 0x1b +#define TPS658624 0x0a #define TPS658640 0x01 #define TPS658640v2 0x02 #define TPS658643 0x03 -- cgit v1.2.3 From 02f3703934a42417021405ef336fe45add13c3d1 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 18 Apr 2018 08:54:18 -0700 Subject: regulator: Don't return or expect -errno from of_map_mode() In of_get_regulation_constraints() we were taking the result of of_map_mode() (an unsigned int) and assigning it to an int. We were then checking whether this value was -EINVAL. Some implementers of of_map_mode() were returning -EINVAL (even though the return type of their function needed to be unsigned int) because they needed to signal an error back to of_get_regulation_constraints(). In general in the regulator framework the mode is always referred to as an unsigned int. While we could fix this to be a signed int (the highest value we store in there right now is 0x8), it's actually pretty clean to just define the regulator mode 0x0 (the lack of any bits set) as an invalid mode. Let's do that. 
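For illustration, a driver callback following the new convention might look like this minimal sketch (foo_map_mode() and the FOO_* values are hypothetical; the cpcap and twl4030 hunks below are the real conversions):

	static unsigned int foo_map_mode(unsigned int mode)
	{
		switch (mode) {
		case FOO_MODE_FAST:
			return REGULATOR_MODE_FAST;
		case FOO_MODE_LOW_POWER:
			return REGULATOR_MODE_STANDBY;
		default:
			/* return type is unsigned, so no -EINVAL here */
			return REGULATOR_MODE_INVALID;
		}
	}
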
Fixes: 5e5e3a42c653 ("regulator: of: Add support for parsing initial and suspend modes") Suggested-by: Javier Martinez Canillas Signed-off-by: Douglas Anderson Reviewed-by: Javier Martinez Canillas Signed-off-by: Mark Brown --- drivers/regulator/cpcap-regulator.c | 2 +- drivers/regulator/of_regulator.c | 13 +++++++------ drivers/regulator/twl-regulator.c | 2 +- include/linux/regulator/consumer.h | 1 + 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/regulator/cpcap-regulator.c b/drivers/regulator/cpcap-regulator.c index f541b80f1b54..bd910fe123d9 100644 --- a/drivers/regulator/cpcap-regulator.c +++ b/drivers/regulator/cpcap-regulator.c @@ -222,7 +222,7 @@ static unsigned int cpcap_map_mode(unsigned int mode) case CPCAP_BIT_AUDIO_LOW_PWR: return REGULATOR_MODE_STANDBY; default: - return -EINVAL; + return REGULATOR_MODE_INVALID; } } diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index f47264fa1940..0d3f73eacb99 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -31,6 +31,7 @@ static void of_get_regulation_constraints(struct device_node *np, struct regulation_constraints *constraints = &(*init_data)->constraints; struct regulator_state *suspend_state; struct device_node *suspend_np; + unsigned int mode; int ret, i; u32 pval; @@ -124,11 +125,11 @@ static void of_get_regulation_constraints(struct device_node *np, if (!of_property_read_u32(np, "regulator-initial-mode", &pval)) { if (desc && desc->of_map_mode) { - ret = desc->of_map_mode(pval); - if (ret == -EINVAL) + mode = desc->of_map_mode(pval); + if (mode == REGULATOR_MODE_INVALID) pr_err("%s: invalid mode %u\n", np->name, pval); else - constraints->initial_mode = ret; + constraints->initial_mode = mode; } else { pr_warn("%s: mapping for mode %d not defined\n", np->name, pval); @@ -163,12 +164,12 @@ static void of_get_regulation_constraints(struct device_node *np, if (!of_property_read_u32(suspend_np, "regulator-mode", &pval)) { if (desc && desc->of_map_mode) { - ret = desc->of_map_mode(pval); - if (ret == -EINVAL) + mode = desc->of_map_mode(pval); + if (mode == REGULATOR_MODE_INVALID) pr_err("%s: invalid mode %u\n", np->name, pval); else - suspend_state->mode = ret; + suspend_state->mode = mode; } else { pr_warn("%s: mapping for mode %d not defined\n", np->name, pval); diff --git a/drivers/regulator/twl-regulator.c b/drivers/regulator/twl-regulator.c index a4456db5849d..884c7505ed91 100644 --- a/drivers/regulator/twl-regulator.c +++ b/drivers/regulator/twl-regulator.c @@ -274,7 +274,7 @@ static inline unsigned int twl4030reg_map_mode(unsigned int mode) case RES_STATE_SLEEP: return REGULATOR_MODE_STANDBY; default: - return -EINVAL; + return REGULATOR_MODE_INVALID; } } diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index df176d7c2b87..25602afd4844 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -80,6 +80,7 @@ struct regmap; * These modes can be OR'ed together to make up a mask of valid register modes. */ +#define REGULATOR_MODE_INVALID 0x0 #define REGULATOR_MODE_FAST 0x1 #define REGULATOR_MODE_NORMAL 0x2 #define REGULATOR_MODE_IDLE 0x4 -- cgit v1.2.3 From 95d1544eb643847e05df06c3de252609593c9073 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 23 Mar 2018 16:59:52 -0400 Subject: media: rc: add ioctl to get the current timeout Since the kernel now modifies the timeout, make it possible to retrieve the current value. 
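For illustration, user space can query the value along these lines (minimal sketch; assumes <linux/lirc.h>, <sys/ioctl.h> and <stdio.h> are included and fd is an open lirc chardev):

	__u32 timeout;

	/* reported in microseconds; fails with ENOTTY if no timeout is set */
	if (ioctl(fd, LIRC_GET_REC_TIMEOUT, &timeout) == 0)
		printf("IR inactivity timeout: %u us\n", timeout);
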
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/rc/lirc-func.rst | 1 + Documentation/media/uapi/rc/lirc-set-rec-timeout.rst | 14 +++++++++----- drivers/media/rc/lirc_dev.c | 7 +++++++ include/uapi/linux/lirc.h | 6 ++++++ 4 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/media/uapi/rc/lirc-func.rst b/Documentation/media/uapi/rc/lirc-func.rst index ddb4620de294..9656423a3f28 100644 --- a/Documentation/media/uapi/rc/lirc-func.rst +++ b/Documentation/media/uapi/rc/lirc-func.rst @@ -17,6 +17,7 @@ LIRC Function Reference lirc-get-rec-resolution lirc-set-send-duty-cycle lirc-get-timeout + lirc-get-rec-timeout lirc-set-rec-timeout lirc-set-rec-carrier lirc-set-rec-carrier-range diff --git a/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst b/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst index b3e16bbdbc90..a833a6a4c25a 100644 --- a/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst +++ b/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst @@ -1,19 +1,23 @@ .. -*- coding: utf-8; mode: rst -*- .. _lirc_set_rec_timeout: +.. _lirc_get_rec_timeout: -************************** -ioctl LIRC_SET_REC_TIMEOUT -************************** +*************************************************** +ioctl LIRC_GET_REC_TIMEOUT and LIRC_SET_REC_TIMEOUT +*************************************************** Name ==== -LIRC_SET_REC_TIMEOUT - sets the integer value for IR inactivity timeout. +LIRC_GET_REC_TIMEOUT/LIRC_SET_REC_TIMEOUT - Get/set the integer value for IR inactivity timeout. Synopsis ======== +.. c:function:: int ioctl( int fd, LIRC_GET_REC_TIMEOUT, __u32 *timeout ) + :name: LIRC_GET_REC_TIMEOUT + .. c:function:: int ioctl( int fd, LIRC_SET_REC_TIMEOUT, __u32 *timeout ) :name: LIRC_SET_REC_TIMEOUT @@ -30,7 +34,7 @@ Arguments Description =========== -Sets the integer value for IR inactivity timeout. +Get and set the integer value for IR inactivity timeout. If supported by the hardware, setting it to 0 disables all hardware timeouts and data should be reported as soon as possible. If the exact value diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 247e6fc3dc0c..6b4755e9fa25 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -575,6 +575,13 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, } break; + case LIRC_GET_REC_TIMEOUT: + if (!dev->timeout) + ret = -ENOTTY; + else + val = DIV_ROUND_UP(dev->timeout, 1000); + break; + case LIRC_SET_REC_TIMEOUT_REPORTS: if (!dev->timeout) ret = -ENOTTY; diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index f189931042a7..6b319581882f 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -133,6 +133,12 @@ #define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) +/* + * Return the recording timeout, which is either set by + * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. 
+ */ +#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) + /* * struct lirc_scancode - decoded scancode with protocol for use with * LIRC_MODE_SCANCODE -- cgit v1.2.3 From f991f01571281b82e5e6ca74445f5d3f42d72212 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 May 2015 23:13:15 +0200 Subject: y2038: asm-generic: Extend sysvipc data structures Most architectures now use the asm-generic copy of the sysvipc data structures (msqid64_ds, semid64_ds, shmid64_ds), which use 32-bit __kernel_time_t on 32-bit architectures but have padding behind them to allow extending the type to 64-bit. Unfortunately, that fails on all big-endian architectures, which have the padding on the wrong side. As so many of them get it wrong, we decided to not bother even trying to fix it up when we introduced the asm-generic copy. Instead we always use the padding word now to provide the upper 32 bits of the seconds value, regardless of the endianness. A libc implementation on a typical big-endian system can deal with this by providing its own copy of the structure definition to user space, and swapping the two 32-bit words before returning from the semctl/shmctl/msgctl system calls. Note that msqid64_ds and shmid64_ds were broken on x32 since commit f4b4aae18288 ("x86/headers/uapi: Fix __BITS_PER_LONG value for x32 builds"). I have sent a separate fix for that, but as we no longer have to worry about x32 here, I simply use 'unsigned long' instead of __kernel_ulong_t. Signed-off-by: Arnd Bergmann --- include/uapi/asm-generic/msgbuf.h | 27 +++++++++++++------------- include/uapi/asm-generic/sembuf.h | 26 +++++++++++++++---------- include/uapi/asm-generic/shmbuf.h | 41 +++++++++++++++++++-------------------- 3 files changed, 49 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/uapi/asm-generic/msgbuf.h b/include/uapi/asm-generic/msgbuf.h index fb306ebdb36f..9fe4881557cb 100644 --- a/include/uapi/asm-generic/msgbuf.h +++ b/include/uapi/asm-generic/msgbuf.h @@ -18,31 +18,30 @@ * On big-endian systems, the padding is in the wrong place. 
* * Pad space is left for: - * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values */ struct msqid64_ds { struct ipc64_perm msg_perm; +#if __BITS_PER_LONG == 64 __kernel_time_t msg_stime; /* last msgsnd time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif __kernel_time_t msg_rtime; /* last msgrcv time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif __kernel_time_t msg_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; +#else + unsigned long msg_stime; /* last msgsnd time */ + unsigned long msg_stime_high; + unsigned long msg_rtime; /* last msgrcv time */ + unsigned long msg_rtime_high; + unsigned long msg_ctime; /* last change time */ + unsigned long msg_ctime_high; #endif - __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */ - __kernel_ulong_t msg_qnum; /* number of messages in queue */ - __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */ + unsigned long msg_cbytes; /* current number of bytes on queue */ + unsigned long msg_qnum; /* number of messages in queue */ + unsigned long msg_qbytes; /* max number of bytes on queue */ __kernel_pid_t msg_lspid; /* pid of last msgsnd */ __kernel_pid_t msg_lrpid; /* last receive pid */ - __kernel_ulong_t __unused4; - __kernel_ulong_t __unused5; + unsigned long __unused4; + unsigned long __unused5; }; #endif /* __ASM_GENERIC_MSGBUF_H */ diff --git a/include/uapi/asm-generic/sembuf.h b/include/uapi/asm-generic/sembuf.h index cbf9cfe977d6..0bae010f1b64 100644 --- a/include/uapi/asm-generic/sembuf.h +++ b/include/uapi/asm-generic/sembuf.h @@ -13,23 +13,29 @@ * everyone just ended up making identical copies without specific * optimizations, so we may just as well all use the same one. * - * 64 bit architectures typically define a 64 bit __kernel_time_t, + * 64 bit architectures use a 64-bit __kernel_time_t here, while + * 32 bit architectures have a pair of unsigned long values. * so they do not need the first two padding words. - * On big-endian systems, the padding is in the wrong place. * - * Pad space is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values + * On big-endian systems, the padding is in the wrong place for + * historic reasons, so user space has to reconstruct a time_t + * value using + * + * user_semid_ds.sem_otime = kernel_semid64_ds.sem_otime + + * ((long long)kernel_semid64_ds.sem_otime_high << 32) + * + * Pad space is left for 2 miscellaneous 32-bit values */ struct semid64_ds { struct ipc64_perm sem_perm; /* permissions .. see ipc.h */ +#if __BITS_PER_LONG == 64 __kernel_time_t sem_otime; /* last semop time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif __kernel_time_t sem_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; +#else + unsigned long sem_otime; /* last semop time */ + unsigned long sem_otime_high; + unsigned long sem_ctime; /* last change time */ + unsigned long sem_ctime_high; #endif unsigned long sem_nsems; /* no. 
of semaphores in array */ unsigned long __unused3; diff --git a/include/uapi/asm-generic/shmbuf.h b/include/uapi/asm-generic/shmbuf.h index 2b6c3bb97f97..e504422fc501 100644 --- a/include/uapi/asm-generic/shmbuf.h +++ b/include/uapi/asm-generic/shmbuf.h @@ -19,42 +19,41 @@ * * * Pad space is left for: - * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values */ struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ size_t shm_segsz; /* size of segment (bytes) */ +#if __BITS_PER_LONG == 64 __kernel_time_t shm_atime; /* last attach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif __kernel_time_t shm_dtime; /* last detach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif __kernel_time_t shm_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; +#else + unsigned long shm_atime; /* last attach time */ + unsigned long shm_atime_high; + unsigned long shm_dtime; /* last detach time */ + unsigned long shm_dtime_high; + unsigned long shm_ctime; /* last change time */ + unsigned long shm_ctime_high; #endif __kernel_pid_t shm_cpid; /* pid of creator */ __kernel_pid_t shm_lpid; /* pid of last operator */ - __kernel_ulong_t shm_nattch; /* no. of current attaches */ - __kernel_ulong_t __unused4; - __kernel_ulong_t __unused5; + unsigned long shm_nattch; /* no. of current attaches */ + unsigned long __unused4; + unsigned long __unused5; }; struct shminfo64 { - __kernel_ulong_t shmmax; - __kernel_ulong_t shmmin; - __kernel_ulong_t shmmni; - __kernel_ulong_t shmseg; - __kernel_ulong_t shmall; - __kernel_ulong_t __unused1; - __kernel_ulong_t __unused2; - __kernel_ulong_t __unused3; - __kernel_ulong_t __unused4; + unsigned long shmmax; + unsigned long shmmin; + unsigned long shmmni; + unsigned long shmseg; + unsigned long shmall; + unsigned long __unused1; + unsigned long __unused2; + unsigned long __unused3; + unsigned long __unused4; }; #endif /* __ASM_GENERIC_SHMBUF_H */ -- cgit v1.2.3 From 21fc538d817ce671f1a28a03996c715247c2ac89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Apr 2018 13:58:00 +0200 Subject: y2038: ipc: Use __kernel_timespec This is a preparation for changing over __kernel_timespec to 64-bit times, which involves assigning new system call numbers for mq_timedsend(), mq_timedreceive() and semtimedop() for compatibility with future y2038-proof user space. The existing ABIs will remain available through compat code. 
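For illustration, the kernel-side boundary then follows one pattern everywhere; a condensed sketch of the mqueue helper touched below (the validity check is paraphrased):

	static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
				   struct timespec64 *ts)
	{
		if (get_timespec64(ts, u_abs_timeout))
			return -EFAULT;

		return timespec64_valid(ts) ? 0 : -EINVAL;
	}
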
Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 6 +++--- ipc/mqueue.c | 6 +++--- ipc/sem.c | 4 ++-- ipc/util.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c9a2a2601852..b92cb79d38c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -680,8 +680,8 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info); /* ipc/mqueue.c */ asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr); asmlinkage long sys_mq_unlink(const char __user *name); -asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); -asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct __kernel_timespec __user *abs_timeout); +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct __kernel_timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); @@ -698,7 +698,7 @@ asmlinkage long sys_semget(key_t key, int nsems, int semflg); asmlinkage long sys_semctl(int semid, int semnum, int cmd, unsigned long arg); asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, unsigned nsops, - const struct timespec __user *timeout); + const struct __kernel_timespec __user *timeout); asmlinkage long sys_semop(int semid, struct sembuf __user *sops, unsigned nsops); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a808f29d4c5a..9610afcfa2e5 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -691,7 +691,7 @@ static void __do_notify(struct mqueue_inode_info *info) wake_up(&info->wait_q); } -static int prepare_timeout(const struct timespec __user *u_abs_timeout, +static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout, struct timespec64 *ts) { if (get_timespec64(ts, u_abs_timeout)) @@ -1128,7 +1128,7 @@ out: SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, size_t, msg_len, unsigned int, msg_prio, - const struct timespec __user *, u_abs_timeout) + const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { @@ -1142,7 +1142,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, size_t, msg_len, unsigned int __user *, u_msg_prio, - const struct timespec __user *, u_abs_timeout) + const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { diff --git a/ipc/sem.c b/ipc/sem.c index 8935cd8cf166..b951e25ba2db 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2176,7 +2176,7 @@ out_free: } long ksys_semtimedop(int semid, struct sembuf __user *tsops, - unsigned int nsops, const struct timespec __user *timeout) + unsigned int nsops, const struct __kernel_timespec __user *timeout) { if (timeout) { struct timespec64 ts; @@ -2188,7 +2188,7 @@ long ksys_semtimedop(int semid, struct sembuf __user *tsops, } SYSCALL_DEFINE4(semtimedop, int, semid, struct 
sembuf __user *, tsops, - unsigned int, nsops, const struct timespec __user *, timeout) + unsigned int, nsops, const struct __kernel_timespec __user *, timeout) { return ksys_semtimedop(semid, tsops, nsops, timeout); } diff --git a/ipc/util.h b/ipc/util.h index acc5159e96d0..975c6de2df9d 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -251,7 +251,7 @@ static inline int compat_ipc_parse_version(int *cmd) /* for __ARCH_WANT_SYS_IPC */ long ksys_semtimedop(int semid, struct sembuf __user *tsops, unsigned int nsops, - const struct timespec __user *timeout); + const struct __kernel_timespec __user *timeout); long ksys_semget(key_t key, int nsems, int semflg); long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg); long ksys_msgget(key_t key, int msgflg); -- cgit v1.2.3 From c7f1770f369ae0ec632b3ddc2bd6677b0e485128 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 5 Apr 2018 10:54:14 -0400 Subject: media: omap: omap-iommu.h: allow building drivers with COMPILE_TEST Drivers that depend on omap-iommu.h (currently, just omap3isp) need a stub implementation in order to be built with COMPILE_TEST. Acked-by: Sakari Ailus Acked-by: Hans Verkuil Acked-by: Arnd Bergmann Acked-by: Tony Lindgren Signed-off-by: Mauro Carvalho Chehab --- include/linux/omap-iommu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/omap-iommu.h b/include/linux/omap-iommu.h index c1aede46718b..ce1b7c6283ee 100644 --- a/include/linux/omap-iommu.h +++ b/include/linux/omap-iommu.h @@ -13,7 +13,12 @@ #ifndef _OMAP_IOMMU_H_ #define _OMAP_IOMMU_H_ +#ifdef CONFIG_OMAP_IOMMU extern void omap_iommu_save_ctx(struct device *dev); extern void omap_iommu_restore_ctx(struct device *dev); +#else +static inline void omap_iommu_save_ctx(struct device *dev) {} +static inline void omap_iommu_restore_ctx(struct device *dev) {} +#endif #endif -- cgit v1.2.3 From a4dfa72d0acd1c99a160e25c099849ae37ad13fd Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Thu, 19 Apr 2018 11:06:18 +0200 Subject: tipc: set default MTU for UDP media Currently, all bearers are configured with MTU value same as the underlying L2 device. However, in case of bearers with media type UDP, higher throughput is possible with a fixed and higher emulated MTU value than adapting to the underlying L2 MTU. In this commit, we introduce a parameter mtu in struct tipc_media and a default value is set for UDP. A default value of 14k was determined by experimentation and found to have a higher throughput than 16k. MTU for UDP bearers are assigned the above set value of media MTU. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. 
Miller --- include/uapi/linux/tipc_config.h | 5 +++++ net/tipc/udp_media.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 3f29e3c8ed06..4b2c93b1934c 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -185,6 +185,11 @@ #define TIPC_DEF_LINK_WIN 50 #define TIPC_MAX_LINK_WIN 8191 +/* + * Default MTU for UDP media + */ + +#define TIPC_DEF_LINK_UDP_MTU 14000 struct tipc_node_info { __be32 addr; /* network address of node */ diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index e7d91f5d5cae..9783101bc4a9 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, err = -EINVAL; goto err; } - b->mtu = dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr); + b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { udp_conf.family = AF_INET6; @@ -803,6 +802,7 @@ struct tipc_media udp_media_info = { .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .window = TIPC_DEF_LINK_WIN, + .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, .name = "udp" -- cgit v1.2.3 From 901271e0403af638c224987c2a4e55cebade7e91 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Thu, 19 Apr 2018 11:06:19 +0200 Subject: tipc: implement configuration of UDP media MTU In previous commit, we changed the default emulated MTU for UDP bearers to 14k. This commit adds the functionality to set/change the default value by configuring new MTU for UDP media. UDP bearer(s) have to be disabled and enabled back for the new MTU to take effect. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. 
Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/bearer.c | 13 +++++++++++++ net/tipc/bearer.h | 3 +++ net/tipc/udp_media.h | 14 ++++++++++++++ 4 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 0affb682e5e3..85c11982c89b 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -266,6 +266,7 @@ enum { TIPC_NLA_PROP_PRIO, /* u32 */ TIPC_NLA_PROP_TOL, /* u32 */ TIPC_NLA_PROP_WIN, /* u32 */ + TIPC_NLA_PROP_MTU, /* u32 */ __TIPC_NLA_PROP_MAX, TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index f7d47c89d658..a22caf9e5a18 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1029,6 +1029,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) goto prop_msg_full; + if (media->type_id == TIPC_MEDIA_TYPE_UDP) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) + goto prop_msg_full; nla_nest_end(msg->skb, prop); nla_nest_end(msg->skb, attrs); @@ -1158,6 +1161,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (props[TIPC_NLA_PROP_MTU]) { + if (m->type_id != TIPC_MEDIA_TYPE_UDP) + return -EINVAL; +#ifdef CONFIG_TIPC_MEDIA_UDP + if (tipc_udp_mtu_bad(nla_get_u32 + (props[TIPC_NLA_PROP_MTU]))) + return -EINVAL; + m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); +#endif + } } return 0; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 6efcee63a381..394290cbbb1d 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -94,6 +94,8 @@ struct tipc_bearer; * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure * @window: default window (in packets) before declaring link congestion + * @mtu: max packet size bearer can support for media type not dependent on + * underlying device MTU * @type_id: TIPC media identifier * @hwaddr_len: TIPC media address len * @name: media name @@ -118,6 +120,7 @@ struct tipc_media { u32 priority; u32 tolerance; u32 window; + u32 mtu; u32 type_id; u32 hwaddr_len; char name[TIPC_MAX_MEDIA_NAME]; diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h index 281bbae87726..e7455cc73e16 100644 --- a/net/tipc/udp_media.h +++ b/net/tipc/udp_media.h @@ -38,9 +38,23 @@ #ifndef _TIPC_UDP_MEDIA_H #define _TIPC_UDP_MEDIA_H +#include +#include + int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b); int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb); +/* check if configured MTU is too low for tipc headers */ +static inline bool tipc_udp_mtu_bad(u32 mtu) +{ + if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) + + sizeof(struct udphdr))) + return false; + + pr_warn("MTU too low for tipc bearer\n"); + return true; +} + #endif #endif -- cgit v1.2.3 From 809c45a091d93e05c6e9b5d53bb3f1185273286b Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Thu, 19 Apr 2018 05:50:12 -0700 Subject: qed* : Add new TLV to request PF to update MAC in bulletin board There may be a need for VF driver to request PF to explicitly update its bulletin with a MAC address. e.g. 
When user assigns a MAC address to VF while VF is still down, and PF's bulletin board contains different MAC address, in this case, when VF's interface is brought up, it gets loaded with MAC address from bulletin board which is not desirable. To handle this corner case, we need a new TLV to request PF to update its bulletin board with suggested MAC. This request will be honored only for trusted VFs. Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_l2.c | 19 +++++++++++++ drivers/net/ethernet/qlogic/qed/qed_sriov.c | 37 ++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_vf.c | 29 ++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_vf.h | 21 +++++++++++++++ drivers/net/ethernet/qlogic/qede/qede_filter.c | 4 +++ include/linux/qed/qed_eth_if.h | 1 + 6 files changed, 111 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index e874504e8b28..8b1b7e8ca56c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -2850,6 +2850,24 @@ static int qed_fp_cqe_completion(struct qed_dev *dev, cqe); } +static int qed_req_bulletin_update_mac(struct qed_dev *cdev, u8 *mac) +{ + int i, ret; + + if (IS_PF(cdev)) + return 0; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + ret = qed_vf_pf_bulletin_update_mac(p_hwfn, mac); + if (ret) + return ret; + } + + return 0; +} + #ifdef CONFIG_QED_SRIOV extern const struct qed_iov_hv_ops qed_iov_ops_pass; #endif @@ -2887,6 +2905,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = { .ntuple_filter_config = &qed_ntuple_arfs_filter_config, .configure_arfs_searcher = &qed_configure_arfs_searcher, .get_coalesce = &qed_get_coalesce, + .req_bulletin_update_mac = &qed_req_bulletin_update_mac, }; const struct qed_eth_ops *qed_get_eth_ops(void) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 77376fda8eae..f01bf52bc381 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3820,6 +3820,40 @@ static void qed_iov_get_link(struct qed_hwfn *p_hwfn, __qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin); } +static int +qed_iov_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf) +{ + struct qed_bulletin_content *p_bulletin = p_vf->bulletin.p_virt; + struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx; + struct vfpf_bulletin_update_mac_tlv *p_req; + u8 status = PFVF_STATUS_SUCCESS; + int rc = 0; + + if (!p_vf->p_vf_info.is_trusted_configured) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "Blocking bulletin update request from untrusted VF[%d]\n", + p_vf->abs_vf_id); + status = PFVF_STATUS_NOT_SUPPORTED; + rc = -EINVAL; + goto send_status; + } + + p_req = &mbx->req_virt->bulletin_update_mac; + ether_addr_copy(p_bulletin->mac, p_req->mac); + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Updated bulletin of VF[%d] with requested MAC[%pM]\n", + p_vf->abs_vf_id, p_req->mac); + +send_status: + qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, + CHANNEL_TLV_BULLETIN_UPDATE_MAC, + sizeof(struct pfvf_def_resp_tlv), status); + return rc; +} + static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, int vfid) { @@ -3899,6 +3933,9 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, case CHANNEL_TLV_COALESCE_READ: qed_iov_vf_pf_get_coalesce(p_hwfn, p_ptt, p_vf); break; + case CHANNEL_TLV_BULLETIN_UPDATE_MAC: 
+ qed_iov_vf_pf_bulletin_update_mac(p_hwfn, p_ptt, p_vf); + break; } } else if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) { DP_VERBOSE(p_hwfn, QED_MSG_IOV, diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 91b5e9f02a62..2d7fcd6a0777 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -1374,6 +1374,35 @@ exit: return rc; } +int +qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, + u8 *p_mac) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_bulletin_update_mac_tlv *p_req; + struct pfvf_def_resp_tlv *p_resp; + int rc; + + if (!p_mac) + return -EINVAL; + + /* clear mailbox and prep header tlv */ + p_req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_BULLETIN_UPDATE_MAC, + sizeof(*p_req)); + ether_addr_copy(p_req->mac, p_mac); + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Requesting bulletin update for MAC[%pM]\n", p_mac); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + p_resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &p_resp->hdr.status, sizeof(*p_resp)); + qed_vf_pf_req_end(p_hwfn, rc); + return rc; +} + int qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, u16 rx_coal, u16 tx_coal, struct qed_queue_cid *p_cid) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h index 97d44dfb38ca..4f05d5eb3cf5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.h +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h @@ -518,6 +518,12 @@ struct pfvf_read_coal_resp_tlv { u8 padding[6]; }; +struct vfpf_bulletin_update_mac_tlv { + struct vfpf_first_tlv first_tlv; + u8 mac[ETH_ALEN]; + u8 padding[2]; +}; + union vfpf_tlvs { struct vfpf_first_tlv first_tlv; struct vfpf_acquire_tlv acquire; @@ -532,6 +538,7 @@ union vfpf_tlvs { struct vfpf_update_tunn_param_tlv tunn_param_update; struct vfpf_update_coalesce update_coalesce; struct vfpf_read_coal_req_tlv read_coal_req; + struct vfpf_bulletin_update_mac_tlv bulletin_update_mac; struct tlv_buffer_size tlv_buf_size; }; @@ -650,6 +657,7 @@ enum { CHANNEL_TLV_COALESCE_UPDATE, CHANNEL_TLV_QID, CHANNEL_TLV_COALESCE_READ, + CHANNEL_TLV_BULLETIN_UPDATE_MAC, CHANNEL_TLV_MAX, /* Required for iterating over vport-update tlvs. 
@@ -1042,6 +1050,13 @@ int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn, struct qed_tunnel_info *p_tunn); u32 qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, enum BAR_ID bar_id); +/** + * @brief - Ask PF to update the MAC address in its bulletin board + * + * @param p_mac - mac address to be updated in bulletin board + */ +int qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, u8 *p_mac); + #else static inline void qed_vf_get_link_params(struct qed_hwfn *p_hwfn, struct qed_mcp_link_params *params) @@ -1228,6 +1243,12 @@ static inline int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn, return -EINVAL; } +static inline int qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, + u8 *p_mac) +{ + return -EINVAL; +} + static inline u32 qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, enum BAR_ID bar_id) diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 8094f035b705..43569b1839be 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1160,6 +1160,10 @@ int qede_set_mac_addr(struct net_device *ndev, void *p) if (edev->state != QEDE_STATE_OPEN) { DP_VERBOSE(edev, NETIF_MSG_IFDOWN, "The device is currently down\n"); + /* Ask PF to explicitly update a copy in bulletin board */ + if (IS_VF(edev) && edev->ops->req_bulletin_update_mac) + edev->ops->req_bulletin_update_mac(edev->cdev, + ndev->dev_addr); goto out; } diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 147d08ccf813..7f9756fe9180 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -352,6 +352,7 @@ struct qed_eth_ops { int (*configure_arfs_searcher)(struct qed_dev *cdev, enum qed_filter_config_mode mode); int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle); + int (*req_bulletin_update_mac)(struct qed_dev *cdev, u8 *mac); }; const struct qed_eth_ops *qed_get_eth_ops(void); -- cgit v1.2.3 From 1827b0678863bc97a1653fdf5308762b2aefcd56 Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Thu, 19 Apr 2018 17:59:39 +0100 Subject: lan78xx: Read LED states from Device Tree Add support for DT property "microchip,led-modes", a vector of zero to four cells (u32s) in the range 0-15, each of which sets the mode for one of the LEDs. Some possible values are: 0=link/activity 1=link1000/activity 2=link100/activity 3=link10/activity 4=link100/1000/activity 5=link10/1000/activity 6=link10/100/activity 14=off 15=on These values are given symbolic constants in a dt-bindings header. Also use the presence of the DT property to indicate that the LEDs should be enabled - necessary in the event that no valid OTP or EEPROM is available. Signed-off-by: Phil Elwell Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 + drivers/net/phy/microchip.c | 25 ++++++++++++++++++++++ drivers/net/usb/lan78xx.c | 32 ++++++++++++++++++++++++++++- include/dt-bindings/net/microchip-lan78xx.h | 21 +++++++++++++++++++ include/linux/microchipphy.h | 3 +++ 5 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/net/microchip-lan78xx.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index a7321687cae9..c952d3076a65 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14572,6 +14572,7 @@ M: Microchip Linux Driver Support L: netdev@vger.kernel.org S: Maintained F: drivers/net/usb/lan78xx.* +F: include/dt-bindings/net/microchip-lan78xx.h USB MASS STORAGE DRIVER M: Alan Stern diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c index 0f293ef28935..ef5e160fe9dc 100644 --- a/drivers/net/phy/microchip.c +++ b/drivers/net/phy/microchip.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #define DRIVER_AUTHOR "WOOJUNG HUH " #define DRIVER_DESC "Microchip LAN88XX PHY driver" @@ -70,6 +72,8 @@ static int lan88xx_probe(struct phy_device *phydev) { struct device *dev = &phydev->mdio.dev; struct lan88xx_priv *priv; + u32 led_modes[4]; + int len; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -77,6 +81,27 @@ static int lan88xx_probe(struct phy_device *phydev) priv->wolopts = 0; + len = of_property_read_variable_u32_array(dev->of_node, + "microchip,led-modes", + led_modes, + 0, + ARRAY_SIZE(led_modes)); + if (len >= 0) { + u32 reg = 0; + int i; + + for (i = 0; i < len; i++) { + if (led_modes[i] > 15) + return -EINVAL; + reg |= led_modes[i] << (i * 4); + } + for (; i < ARRAY_SIZE(led_modes); i++) + reg |= LAN78XX_FORCE_LED_OFF << (i * 4); + (void)phy_write(phydev, LAN78XX_PHY_LED_MODE_SELECT, reg); + } else if (len == -EOVERFLOW) { + return -EINVAL; + } + /* these values can be used to identify internal PHY */ priv->chip_id = phy_read_mmd(phydev, 3, LAN88XX_MMD3_CHIP_ID); priv->chip_rev = phy_read_mmd(phydev, 3, LAN88XX_MMD3_CHIP_REV); diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index a823f010de30..6b03b973083e 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "lan78xx.h" @@ -1760,6 +1761,7 @@ done: static int lan78xx_mdio_init(struct lan78xx_net *dev) { + struct device_node *node; int ret; dev->mdiobus = mdiobus_alloc(); @@ -1788,7 +1790,13 @@ static int lan78xx_mdio_init(struct lan78xx_net *dev) break; } - ret = mdiobus_register(dev->mdiobus); + node = of_get_child_by_name(dev->udev->dev.of_node, "mdio"); + if (node) { + ret = of_mdiobus_register(dev->mdiobus, node); + of_node_put(node); + } else { + ret = mdiobus_register(dev->mdiobus); + } if (ret) { netdev_err(dev->net, "can't register MDIO bus\n"); goto exit1; @@ -2077,6 +2085,28 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) mii_adv = (u32)mii_advertise_flowctrl(dev->fc_request_control); phydev->advertising |= mii_adv_to_ethtool_adv_t(mii_adv); + if (phydev->mdio.dev.of_node) { + u32 reg; + int len; + + len = of_property_count_elems_of_size(phydev->mdio.dev.of_node, + "microchip,led-modes", + sizeof(u32)); + if (len >= 0) { + /* Ensure the appropriate LEDs are enabled */ + lan78xx_read_reg(dev, HW_CFG, ®); + reg &= ~(HW_CFG_LED0_EN_ | + HW_CFG_LED1_EN_ | + HW_CFG_LED2_EN_ | + HW_CFG_LED3_EN_); + reg |= (len > 0) * HW_CFG_LED0_EN_ | + (len > 1) * HW_CFG_LED1_EN_ | + (len > 2) * HW_CFG_LED2_EN_ | + (len > 3) * HW_CFG_LED3_EN_; + 
lan78xx_write_reg(dev, HW_CFG, reg); + } + } + genphy_config_aneg(phydev); dev->fc_autoneg = phydev->autoneg; diff --git a/include/dt-bindings/net/microchip-lan78xx.h b/include/dt-bindings/net/microchip-lan78xx.h new file mode 100644 index 000000000000..0742ff075307 --- /dev/null +++ b/include/dt-bindings/net/microchip-lan78xx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _DT_BINDINGS_MICROCHIP_LAN78XX_H +#define _DT_BINDINGS_MICROCHIP_LAN78XX_H + +/* LED modes for LAN7800/LAN7850 embedded PHY */ + +#define LAN78XX_LINK_ACTIVITY 0 +#define LAN78XX_LINK_1000_ACTIVITY 1 +#define LAN78XX_LINK_100_ACTIVITY 2 +#define LAN78XX_LINK_10_ACTIVITY 3 +#define LAN78XX_LINK_100_1000_ACTIVITY 4 +#define LAN78XX_LINK_10_1000_ACTIVITY 5 +#define LAN78XX_LINK_10_100_ACTIVITY 6 +#define LAN78XX_DUPLEX_COLLISION 8 +#define LAN78XX_COLLISION 9 +#define LAN78XX_ACTIVITY 10 +#define LAN78XX_AUTONEG_FAULT 12 +#define LAN78XX_FORCE_LED_OFF 14 +#define LAN78XX_FORCE_LED_ON 15 + +#endif diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h index eb492d47f717..8e4015e7f8d3 100644 --- a/include/linux/microchipphy.h +++ b/include/linux/microchipphy.h @@ -70,4 +70,7 @@ #define LAN88XX_MMD3_CHIP_ID (32877) #define LAN88XX_MMD3_CHIP_REV (32878) +/* Registers specific to the LAN7800/LAN7850 embedded phy */ +#define LAN78XX_PHY_LED_MODE_SELECT (0x1D) + #endif /* _MICROCHIPPHY_H */ -- cgit v1.2.3 From 13c935bb09948aef0202574ee12bb089459eb43b Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Mon, 9 Apr 2018 15:54:46 +0200 Subject: crypto: api - laying defines and checks for statically allocated buffers In preparation for the removal of VLAs[1] from crypto code. We create 2 new compile-time constants: all ciphers implemented in Linux have a block size less than or equal to 16 bytes and the most demanding hw require 16 bytes alignment for the block buffer. We also enforce these limits in crypto_check_alg when a new cipher is registered. [1] http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Signed-off-by: Salvatore Mesoraca Signed-off-by: Herbert Xu --- crypto/algapi.c | 10 ++++++++++ include/crypto/algapi.h | 8 ++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/crypto/algapi.c b/crypto/algapi.c index 2a0271b5f62a..c0755cf4f53f 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -10,6 +10,7 @@ * */ +#include #include #include #include @@ -59,6 +60,15 @@ static int crypto_check_alg(struct crypto_alg *alg) if (alg->cra_blocksize > PAGE_SIZE / 8) return -EINVAL; + if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_CIPHER) { + if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK) + return -EINVAL; + + if (alg->cra_blocksize > MAX_CIPHER_BLOCKSIZE) + return -EINVAL; + } + if (alg->cra_priority < 0) return -EINVAL; diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 1aba888241dd..bd5e8ccf1687 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -17,6 +17,14 @@ #include #include +/* + * Maximum values for blocksize and alignmask, used to allocate + * static buffers that are big enough for any combination of + * ciphers and architectures. 
+ */ +#define MAX_CIPHER_BLOCKSIZE 16 +#define MAX_CIPHER_ALIGNMASK 15 + struct crypto_aead; struct crypto_instance; struct module; -- cgit v1.2.3 From 8af70cd2ca508061088d5059ba8a8218aca7ddf1 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 20 Apr 2018 10:14:27 -0700 Subject: memory: aemif: add support for board files Currently aemif is supported in two places separately. By the platform driver in drivers/memory and by a hand crafted driver in mach-davinci. We want to drop the latter but also keep the legacy mode. Add support for board files to the aemif driver. The new structure in platform data currently only contains the chip select number, since currently existing users don't require anything else, but it can be extended in the future. While extending the platform data struct, add kernel docs describing its members. Signed-off-by: Bartosz Golaszewski Signed-off-by: Santosh Shilimkar --- drivers/memory/ti-aemif.c | 58 ++++++++++++++++++++++------------ include/linux/platform_data/ti-aemif.h | 25 +++++++++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/memory/ti-aemif.c b/drivers/memory/ti-aemif.c index 588e58d40d1b..31112f622b88 100644 --- a/drivers/memory/ti-aemif.c +++ b/drivers/memory/ti-aemif.c @@ -339,9 +339,6 @@ static int aemif_probe(struct platform_device *pdev) struct aemif_platform_data *pdata; struct of_dev_auxdata *dev_lookup; - if (np == NULL) - return 0; - aemif = devm_kzalloc(dev, sizeof(*aemif), GFP_KERNEL); if (!aemif) return -ENOMEM; @@ -363,8 +360,10 @@ static int aemif_probe(struct platform_device *pdev) aemif->clk_rate = clk_get_rate(aemif->clk) / MSEC_PER_SEC; - if (of_device_is_compatible(np, "ti,da850-aemif")) + if (np && of_device_is_compatible(np, "ti,da850-aemif")) aemif->cs_offset = 2; + else if (pdata) + aemif->cs_offset = pdata->cs_offset; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); aemif->base = devm_ioremap_resource(dev, res); @@ -373,15 +372,23 @@ static int aemif_probe(struct platform_device *pdev) goto error; } - /* - * For every controller device node, there is a cs device node that - * describe the bus configuration parameters. This functions iterate - * over these nodes and update the cs data array. - */ - for_each_available_child_of_node(np, child_np) { - ret = of_aemif_parse_abus_config(pdev, child_np); - if (ret < 0) - goto error; + if (np) { + /* + * For every controller device node, there is a cs device node + * that describe the bus configuration parameters. This + * functions iterate over these nodes and update the cs data + * array. + */ + for_each_available_child_of_node(np, child_np) { + ret = of_aemif_parse_abus_config(pdev, child_np); + if (ret < 0) + goto error; + } + } else if (pdata && pdata->num_abus_data > 0) { + for (i = 0; i < pdata->num_abus_data; i++, aemif->num_cs++) { + aemif->cs_data[i].cs = pdata->abus_data[i].cs; + aemif_get_hw_params(pdev, i); + } } for (i = 0; i < aemif->num_cs; i++) { @@ -394,14 +401,25 @@ static int aemif_probe(struct platform_device *pdev) } /* - * Create a child devices explicitly from here to - * guarantee that the child will be probed after the AEMIF timing - * parameters are set. + * Create a child devices explicitly from here to guarantee that the + * child will be probed after the AEMIF timing parameters are set. 
*/ - for_each_available_child_of_node(np, child_np) { - ret = of_platform_populate(child_np, NULL, dev_lookup, dev); - if (ret < 0) - goto error; + if (np) { + for_each_available_child_of_node(np, child_np) { + ret = of_platform_populate(child_np, NULL, + dev_lookup, dev); + if (ret < 0) + goto error; + } + } else { + for (i = 0; i < pdata->num_sub_devices; i++) { + pdata->sub_devices[i].dev.parent = dev; + ret = platform_device_register(&pdata->sub_devices[i]); + if (ret) { + dev_warn(dev, "Error register sub device %s\n", + pdata->sub_devices[i].name); + } + } } return 0; diff --git a/include/linux/platform_data/ti-aemif.h b/include/linux/platform_data/ti-aemif.h index ac72e115093c..e6407bafcbf8 100644 --- a/include/linux/platform_data/ti-aemif.h +++ b/include/linux/platform_data/ti-aemif.h @@ -16,8 +16,33 @@ #include +/** + * struct aemif_abus_data - Async bus configuration parameters. + * + * @cs - Chip-select number. + */ +struct aemif_abus_data { + u32 cs; +}; + +/** + * struct aemif_platform_data - Data to set up the TI aemif driver. + * + * @dev_lookup: of_dev_auxdata passed to of_platform_populate() for aemif + * subdevices. + * @cs_offset: Lowest allowed chip-select number. + * @abus_data: Array of async bus configuration entries. + * @num_abus_data: Number of abus entries. + * @sub_devices: Array of platform subdevices. + * @num_sub_devices: Number of subdevices. + */ struct aemif_platform_data { struct of_dev_auxdata *dev_lookup; + u32 cs_offset; + struct aemif_abus_data *abus_data; + size_t num_abus_data; + struct platform_device *sub_devices; + size_t num_sub_devices; }; #endif /* __TI_DAVINCI_AEMIF_DATA_H__ */ -- cgit v1.2.3 From dbef91ec5482239055dd2db8ec656fc13d382add Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:06 +0200 Subject: scsi: ilog2: create truly constant version for sparse Sparse emits errors about ilog2() in array indices because of the use of __ilog2_32() and __ilog2_64(), rightly so (https://www.spinics.net/lists/linux-sparse/msg03471.html). Create a const_ilog2() variant that works with sparse for this scenario. (Note: checkpatch.pl complains about missing parentheses, but that appears to be a false positive. I can get rid of the warning simply by inserting whitespace, making checkpatch "see" the whole macro). Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- include/linux/log2.h | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/log2.h b/include/linux/log2.h index 41a1ae010993..2af7f77866d0 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -72,16 +72,13 @@ unsigned long __rounddown_pow_of_two(unsigned long n) } /** - * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value + * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value * @n: parameter * - * constant-capable log of base 2 calculation - * - this can be used to initialise global variables from constant data, hence - * the massive ternary operator construction - * - * selects the appropriately-sized optimised version depending on sizeof(n) + * Use this where sparse expects a true constant expression, e.g. for array + * indices. */ -#define ilog2(n) \ +#define const_ilog2(n) \ ( \ __builtin_constant_p(n) ? ( \ (n) < 2 ? 0 : \ @@ -147,10 +144,26 @@ unsigned long __rounddown_pow_of_two(unsigned long n) (n) & (1ULL << 4) ? 4 : \ (n) & (1ULL << 3) ? 3 : \ (n) & (1ULL << 2) ? 2 : \ - 1 ) : \ - (sizeof(n) <= 4) ? 
\ - __ilog2_u32(n) : \ - __ilog2_u64(n) \ + 1) : \ + -1) + +/** + * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value + * @n: parameter + * + * constant-capable log of base 2 calculation + * - this can be used to initialise global variables from constant data, hence + * the massive ternary operator construction + * + * selects the appropriately-sized optimised version depending on sizeof(n) + */ +#define ilog2(n) \ +( \ + __builtin_constant_p(n) ? \ + const_ilog2(n) : \ + (sizeof(n) <= 4) ? \ + __ilog2_u32(n) : \ + __ilog2_u64(n) \ ) /** -- cgit v1.2.3 From 1409880357ed33dc1c23eed080d88ea4410ed9a3 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:08 +0200 Subject: scsi: devinfo: change blist_flag_t to 64bit Space for SCSI blist flags is gradually running out. Change the type to __u64 and fix a checkpatch complaint about symbolic mode flags in scsi_devinfo.c. Make checkpatch happy by replacing simple_strtoul() with kstrtoull(). Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 18 +++++++++++----- drivers/scsi/scsi_sysfs.c | 2 +- include/scsi/scsi_device.h | 2 +- include/scsi/scsi_devinfo.h | 50 ++++++++++++++++++++++----------------------- 4 files changed, 40 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index e5c82f13b7d9..bd04a33e4360 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -360,8 +360,16 @@ int scsi_dev_info_list_add_keyed(int compatible, char *vendor, char *model, scsi_strcpy_devinfo("model", devinfo->model, sizeof(devinfo->model), model, compatible); - if (strflags) - flags = (__force blist_flags_t)simple_strtoul(strflags, NULL, 0); + if (strflags) { + unsigned long long val; + int ret = kstrtoull(strflags, 0, &val); + + if (ret != 0) { + kfree(devinfo); + return ret; + } + flags = (__force blist_flags_t)val; + } devinfo->flags = flags; devinfo->compatible = compatible; @@ -614,7 +622,7 @@ static int devinfo_seq_show(struct seq_file *m, void *v) devinfo_table->name) seq_printf(m, "[%s]:\n", devinfo_table->name); - seq_printf(m, "'%.8s' '%.16s' 0x%x\n", + seq_printf(m, "'%.8s' '%.16s' 0x%llx\n", devinfo->vendor, devinfo->model, devinfo->flags); return 0; } @@ -733,9 +741,9 @@ MODULE_PARM_DESC(dev_flags, " list entries for vendor and model with an integer value of flags" " to the scsi device info list"); -module_param_named(default_dev_flags, scsi_default_dev_flags, int, S_IRUGO|S_IWUSR); +module_param_named(default_dev_flags, scsi_default_dev_flags, ullong, 0644); MODULE_PARM_DESC(default_dev_flags, - "scsi default device flag integer value"); + "scsi default device flag uint64_t value"); /** * scsi_exit_devinfo - remove /proc/scsi/device_info & the scsi_dev_info_list diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 5120e613a29c..7943b762c12d 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -968,7 +968,7 @@ sdev_show_wwid(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(wwid, S_IRUGO, sdev_show_wwid, NULL); #define BLIST_FLAG_NAME(name) \ - [const_ilog2((__force unsigned int)BLIST_##name)] = #name + [const_ilog2((__force __u64)BLIST_##name)] = #name static const char *const sdev_bflags_name[] = { #include "scsi_devinfo_tbl.c" }; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 7ae177c8e399..4c36af6edd79 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -15,7 +15,7 @@ struct 
scsi_cmnd; struct scsi_lun; struct scsi_sense_hdr; -typedef unsigned int __bitwise blist_flags_t; +typedef __u64 __bitwise blist_flags_t; struct scsi_mode_data { __u32 length; diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index ea67c32e870e..e206d299f137 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -6,55 +6,55 @@ */ /* Only scan LUN 0 */ -#define BLIST_NOLUN ((__force blist_flags_t)(1 << 0)) +#define BLIST_NOLUN ((__force blist_flags_t)(1ULL << 0)) /* Known to have LUNs, force scanning. * DEPRECATED: Use max_luns=N */ -#define BLIST_FORCELUN ((__force blist_flags_t)(1 << 1)) +#define BLIST_FORCELUN ((__force blist_flags_t)(1ULL << 1)) /* Flag for broken handshaking */ -#define BLIST_BORKEN ((__force blist_flags_t)(1 << 2)) +#define BLIST_BORKEN ((__force blist_flags_t)(1ULL << 2)) /* unlock by special command */ -#define BLIST_KEY ((__force blist_flags_t)(1 << 3)) +#define BLIST_KEY ((__force blist_flags_t)(1ULL << 3)) /* Do not use LUNs in parallel */ -#define BLIST_SINGLELUN ((__force blist_flags_t)(1 << 4)) +#define BLIST_SINGLELUN ((__force blist_flags_t)(1ULL << 4)) /* Buggy Tagged Command Queuing */ -#define BLIST_NOTQ ((__force blist_flags_t)(1 << 5)) +#define BLIST_NOTQ ((__force blist_flags_t)(1ULL << 5)) /* Non consecutive LUN numbering */ -#define BLIST_SPARSELUN ((__force blist_flags_t)(1 << 6)) +#define BLIST_SPARSELUN ((__force blist_flags_t)(1ULL << 6)) /* Avoid LUNS >= 5 */ -#define BLIST_MAX5LUN ((__force blist_flags_t)(1 << 7)) +#define BLIST_MAX5LUN ((__force blist_flags_t)(1ULL << 7)) /* Treat as (removable) CD-ROM */ -#define BLIST_ISROM ((__force blist_flags_t)(1 << 8)) +#define BLIST_ISROM ((__force blist_flags_t)(1ULL << 8)) /* LUNs past 7 on a SCSI-2 device */ -#define BLIST_LARGELUN ((__force blist_flags_t)(1 << 9)) +#define BLIST_LARGELUN ((__force blist_flags_t)(1ULL << 9)) /* override additional length field */ -#define BLIST_INQUIRY_36 ((__force blist_flags_t)(1 << 10)) +#define BLIST_INQUIRY_36 ((__force blist_flags_t)(1ULL << 10)) /* do not do automatic start on add */ -#define BLIST_NOSTARTONADD ((__force blist_flags_t)(1 << 12)) +#define BLIST_NOSTARTONADD ((__force blist_flags_t)(1ULL << 12)) /* try REPORT_LUNS even for SCSI-2 devs (if HBA supports more than 8 LUNs) */ -#define BLIST_REPORTLUN2 ((__force blist_flags_t)(1 << 17)) +#define BLIST_REPORTLUN2 ((__force blist_flags_t)(1ULL << 17)) /* don't try REPORT_LUNS scan (SCSI-3 devs) */ -#define BLIST_NOREPORTLUN ((__force blist_flags_t)(1 << 18)) +#define BLIST_NOREPORTLUN ((__force blist_flags_t)(1ULL << 18)) /* don't use PREVENT-ALLOW commands */ -#define BLIST_NOT_LOCKABLE ((__force blist_flags_t)(1 << 19)) +#define BLIST_NOT_LOCKABLE ((__force blist_flags_t)(1ULL << 19)) /* device is actually for RAID config */ -#define BLIST_NO_ULD_ATTACH ((__force blist_flags_t)(1 << 20)) +#define BLIST_NO_ULD_ATTACH ((__force blist_flags_t)(1ULL << 20)) /* select without ATN */ -#define BLIST_SELECT_NO_ATN ((__force blist_flags_t)(1 << 21)) +#define BLIST_SELECT_NO_ATN ((__force blist_flags_t)(1ULL << 21)) /* retry HARDWARE_ERROR */ -#define BLIST_RETRY_HWERROR ((__force blist_flags_t)(1 << 22)) +#define BLIST_RETRY_HWERROR ((__force blist_flags_t)(1ULL << 22)) /* maximum 512 sector cdb length */ -#define BLIST_MAX_512 ((__force blist_flags_t)(1 << 23)) +#define BLIST_MAX_512 ((__force blist_flags_t)(1ULL << 23)) /* Disable T10 PI (DIF) */ -#define BLIST_NO_DIF ((__force blist_flags_t)(1 << 25)) +#define BLIST_NO_DIF ((__force blist_flags_t)(1ULL << 25)) 
/* Ignore SBC-3 VPD pages */ -#define BLIST_SKIP_VPD_PAGES ((__force blist_flags_t)(1 << 26)) +#define BLIST_SKIP_VPD_PAGES ((__force blist_flags_t)(1ULL << 26)) /* Attempt to read VPD pages */ -#define BLIST_TRY_VPD_PAGES ((__force blist_flags_t)(1 << 28)) +#define BLIST_TRY_VPD_PAGES ((__force blist_flags_t)(1ULL << 28)) /* don't try to issue RSOC */ -#define BLIST_NO_RSOC ((__force blist_flags_t)(1 << 29)) +#define BLIST_NO_RSOC ((__force blist_flags_t)(1ULL << 29)) /* maximum 1024 sector cdb length */ -#define BLIST_MAX_1024 ((__force blist_flags_t)(1 << 30)) +#define BLIST_MAX_1024 ((__force blist_flags_t)(1ULL << 30)) /* Use UNMAP limit for WRITE SAME */ -#define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1 << 31)) +#define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) #endif -- cgit v1.2.3 From 358fda5ff4c441dfa6e66fd3b1f7d8f882ecba8f Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:09 +0200 Subject: scsi: devinfo: warn on undefined blist flags Warn if a device (or the user) sets blist flags which are unknown or have been removed. This should enable us to reuse freed blist bits in later releases. Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/Makefile | 2 +- drivers/scsi/scsi_devinfo.c | 6 ++++++ include/scsi/scsi_devinfo.h | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index f31145d6d472..eb30e558fc36 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -190,7 +190,7 @@ $(obj)/53c700.o $(MODVERDIR)/$(obj)/53c700.ver: $(obj)/53c700_d.h $(obj)/scsi_sysfs.o: $(obj)/scsi_devinfo_tbl.c quiet_cmd_bflags = GEN $@ - cmd_bflags = sed -n 's/.*BLIST_\([A-Z0-9_]*\) *.*/BLIST_FLAG_NAME(\1),/p' $< > $@ + cmd_bflags = sed -n 's/.*define *BLIST_\([A-Z0-9_]*\) *.*/BLIST_FLAG_NAME(\1),/p' $< > $@ $(obj)/scsi_devinfo_tbl.c: include/scsi/scsi_devinfo.h $(call if_changed,bflags) diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index bd04a33e4360..9603f3ef18aa 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -370,6 +370,12 @@ int scsi_dev_info_list_add_keyed(int compatible, char *vendor, char *model, } flags = (__force blist_flags_t)val; } + if (flags & __BLIST_UNUSED_MASK) { + pr_err("scsi_devinfo (%s:%s): unsupported flags 0x%llx", + vendor, model, flags & __BLIST_UNUSED_MASK); + kfree(devinfo); + return -EINVAL; + } devinfo->flags = flags; devinfo->compatible = compatible; diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index e206d299f137..3434e143feff 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -28,8 +28,13 @@ #define BLIST_LARGELUN ((__force blist_flags_t)(1ULL << 9)) /* override additional length field */ #define BLIST_INQUIRY_36 ((__force blist_flags_t)(1ULL << 10)) +#define __BLIST_UNUSED_11 ((__force blist_flags_t)(1ULL << 11)) /* do not do automatic start on add */ #define BLIST_NOSTARTONADD ((__force blist_flags_t)(1ULL << 12)) +#define __BLIST_UNUSED_13 ((__force blist_flags_t)(1ULL << 13)) +#define __BLIST_UNUSED_14 ((__force blist_flags_t)(1ULL << 14)) +#define __BLIST_UNUSED_15 ((__force blist_flags_t)(1ULL << 15)) +#define __BLIST_UNUSED_16 ((__force blist_flags_t)(1ULL << 16)) /* try REPORT_LUNS even for SCSI-2 devs (if HBA supports more than 8 LUNs) */ #define BLIST_REPORTLUN2 ((__force blist_flags_t)(1ULL << 17)) /* don't try REPORT_LUNS scan (SCSI-3 devs) */ @@ -44,10 +49,12 @@ #define 
BLIST_RETRY_HWERROR ((__force blist_flags_t)(1ULL << 22)) /* maximum 512 sector cdb length */ #define BLIST_MAX_512 ((__force blist_flags_t)(1ULL << 23)) +#define __BLIST_UNUSED_24 ((__force blist_flags_t)(1ULL << 24)) /* Disable T10 PI (DIF) */ #define BLIST_NO_DIF ((__force blist_flags_t)(1ULL << 25)) /* Ignore SBC-3 VPD pages */ #define BLIST_SKIP_VPD_PAGES ((__force blist_flags_t)(1ULL << 26)) +#define __BLIST_UNUSED_27 ((__force blist_flags_t)(1ULL << 27)) /* Attempt to read VPD pages */ #define BLIST_TRY_VPD_PAGES ((__force blist_flags_t)(1ULL << 28)) /* don't try to issue RSOC */ @@ -57,4 +64,18 @@ /* Use UNMAP limit for WRITE SAME */ #define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) +#define __BLIST_LAST_USED BLIST_UNMAP_LIMIT_WS + +#define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ + (__force blist_flags_t) \ + ((__force __u64)__BLIST_LAST_USED - 1ULL))) +#define __BLIST_UNUSED_MASK (__BLIST_UNUSED_11 | \ + __BLIST_UNUSED_13 | \ + __BLIST_UNUSED_14 | \ + __BLIST_UNUSED_15 | \ + __BLIST_UNUSED_16 | \ + __BLIST_UNUSED_24 | \ + __BLIST_UNUSED_27 | \ + __BLIST_HIGH_UNUSED) + #endif -- cgit v1.2.3 From 29cfc2ab71d9642c2f4fda6cd278309cc253ff82 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:10 +0200 Subject: scsi: devinfo: add BLIST_RETRY_ITF for EMC Symmetrix EMC Symmetrix returns 'internal target error' for a variety of conditions, most of which will be transient. So we should always retry it, even with failfast set. Otherwise we'd get spurious path flaps with multipath. Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 3 ++- drivers/scsi/scsi_error.c | 4 ++++ include/scsi/scsi_devinfo.h | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 9603f3ef18aa..87bc50686ac6 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -161,7 +161,8 @@ static struct { {"DGC", "RAID", NULL, BLIST_SPARSELUN}, /* EMC CLARiiON, storage on LUN 0 */ {"DGC", "DISK", NULL, BLIST_SPARSELUN}, /* EMC CLARiiON, no storage on LUN 0 */ {"EMC", "Invista", "*", BLIST_SPARSELUN | BLIST_LARGELUN}, - {"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN | BLIST_REPORTLUN2}, + {"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN | + BLIST_REPORTLUN2 | BLIST_RETRY_ITF}, {"EMULEX", "MD21/S2 ESDI", NULL, BLIST_SINGLELUN}, {"easyRAID", "16P", NULL, BLIST_NOREPORTLUN}, {"easyRAID", "X6P", NULL, BLIST_NOREPORTLUN}, diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 946039117bf4..633b198ed895 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "scsi_priv.h" @@ -525,6 +526,9 @@ int scsi_check_sense(struct scsi_cmnd *scmd) if (sshdr.asc == 0x10) /* DIF */ return SUCCESS; + if (sshdr.asc == 0x44 && sdev->sdev_bflags & BLIST_RETRY_ITF) + return ADD_TO_MLQUEUE; + return NEEDS_RETRY; case NOT_READY: case UNIT_ATTENTION: diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 3434e143feff..91a327edf1fa 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -63,8 +63,10 @@ #define BLIST_MAX_1024 ((__force blist_flags_t)(1ULL << 30)) /* Use UNMAP limit for WRITE SAME */ #define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) +/* Always retry ABORTED_COMMAND with Internal Target Failure */ +#define BLIST_RETRY_ITF ((__force blist_flags_t)(1ULL 
<< 32)) -#define __BLIST_LAST_USED BLIST_UNMAP_LIMIT_WS +#define __BLIST_LAST_USED BLIST_RETRY_ITF #define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ (__force blist_flags_t) \ -- cgit v1.2.3 From c360652006bba40837cf16d5099ea61f7ce16c63 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 18 Apr 2018 01:35:11 +0200 Subject: scsi: devinfo: BLIST_RETRY_ASC_C1 for Fujitsu ETERNUS On Fujitsu ETERNUS systems, sense code ABORTED COMMAND with ASC/Q C1/01 is used to indicate a temporary condition where the storage-internal path to a target is switched from one controller to another. SCSI commands that return with this error code must be retried unconditionally (i.e. without the "maybe_retry" logic in scsi_decide_disposition); otherwise dm-multipath might initiate a failover from a healthy path e.g. for REQ_FAILFAST_DEV commands. Introduce a new blist flag for this case. [mkp: applied by hand] Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 1 + drivers/scsi/scsi_error.c | 3 +++ include/scsi/scsi_devinfo.h | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 87bc50686ac6..c4cbfd07b916 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -168,6 +168,7 @@ static struct { {"easyRAID", "X6P", NULL, BLIST_NOREPORTLUN}, {"easyRAID", "F8", NULL, BLIST_NOREPORTLUN}, {"FSC", "CentricStor", "*", BLIST_SPARSELUN | BLIST_LARGELUN}, + {"FUJITSU", "ETERNUS_DXM", "*", BLIST_RETRY_ASC_C1}, {"Generic", "USB SD Reader", "1.00", BLIST_FORCELUN | BLIST_INQUIRY_36}, {"Generic", "USB Storage-SMC", NULL, BLIST_FORCELUN | BLIST_INQUIRY_36}, /* FW: 0180 and 0207 */ {"HITACHI", "DF400", "*", BLIST_REPORTLUN2}, diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 633b198ed895..94d2047e0096 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -528,6 +528,9 @@ int scsi_check_sense(struct scsi_cmnd *scmd) if (sshdr.asc == 0x44 && sdev->sdev_bflags & BLIST_RETRY_ITF) return ADD_TO_MLQUEUE; + if (sshdr.asc == 0xc1 && sshdr.ascq == 0x01 && + sdev->sdev_bflags & BLIST_RETRY_ASC_C1) + return ADD_TO_MLQUEUE; return NEEDS_RETRY; case NOT_READY: diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 91a327edf1fa..3fdb322d4c4b 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -65,8 +65,10 @@ #define BLIST_UNMAP_LIMIT_WS ((__force blist_flags_t)(1ULL << 31)) /* Always retry ABORTED_COMMAND with Internal Target Failure */ #define BLIST_RETRY_ITF ((__force blist_flags_t)(1ULL << 32)) +/* Always retry ABORTED_COMMAND with ASC 0xc1 */ +#define BLIST_RETRY_ASC_C1 ((__force blist_flags_t)(1ULL << 33)) -#define __BLIST_LAST_USED BLIST_RETRY_ITF +#define __BLIST_LAST_USED BLIST_RETRY_ASC_C1 #define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ (__force blist_flags_t) \ -- cgit v1.2.3 From 572ccdab50bb3ae9096d6947c2e78a7107acf2dd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 14 Apr 2018 10:51:05 -0700 Subject: scsi: target: target_core_user.[ch]: convert comments into DOC: Make the documentation of the target-supported userspace-I/O design usable by kernel-doc by using "DOC:". This is used in the driver-api Documentation chapter. Signed-off-by: Randy Dunlap To: "Nicholas A. Bellinger" Cc: linux-scsi@vger.kernel.org Cc: target-devel@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Jonathan Corbet Signed-off-by: Martin K.
Petersen --- drivers/target/target_core_user.c | 8 ++++++-- include/uapi/linux/target_core_user.h | 11 ++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index d6af29250d94..ae0aea9a3aad 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -42,7 +42,11 @@ #include -/* +/** + * DOC: Userspace I/O + * Userspace I/O + * ------------- + * * Define a shared-memory interface for LIO to pass SCSI commands and * data to userspace for processing. This is to allow backends that * are too complex for in-kernel support to be possible. @@ -53,7 +57,7 @@ * See the .h file for how the ring is laid out. Note that while the * command ring is defined, the particulars of the data area are * not. Offset values in the command entry point to other locations - * internal to the mmap()ed area. There is separate space outside the + * internal to the mmap-ed area. There is separate space outside the * command ring for data buffers. This leaves maximum flexibility for * moving buffer allocations, or even page flipping or other * allocation techniques, without altering the command ring layout. diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 0be80f72646b..6e299349b158 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -9,21 +9,22 @@ #define TCMU_VERSION "2.0" -/* +/** + * DOC: Ring Design * Ring Design * ----------- * * The mmaped area is divided into three parts: - * 1) The mailbox (struct tcmu_mailbox, below) - * 2) The command ring - * 3) Everything beyond the command ring (data) + * 1) The mailbox (struct tcmu_mailbox, below); + * 2) The command ring; + * 3) Everything beyond the command ring (data). * * The mailbox tells userspace the offset of the command ring from the * start of the shared memory region, and how big the command ring is. * * The kernel passes SCSI commands to userspace by putting a struct * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking - * userspace via uio's interrupt mechanism. + * userspace via UIO's interrupt mechanism. * * tcmu_cmd_entry contains a header. If the header type is PAD, * userspace should skip hdr->length bytes (mod cmdr_size) to find the -- cgit v1.2.3 From f134fbbb4ff813dd227c9ce40b5c0b2078a77b07 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 21 Apr 2018 08:54:40 +1000 Subject: mtd: spi-nor: clear Winbond Extended Address Reg on switch to 3-byte addressing. Winbond spi-nor flash 32MB and larger have an 'Extended Address Register' as one option for addressing beyond 16MB (Macronix has the same concept, Spansion has EXTADD bits in the Bank Address Register). According to section 8.2.7 Write Extended Address Register (C5h) of the Winbond W25Q256FV data sheet (256M-BIT SPI flash): The Extended Address Register is only effective when the device is in the 3-Byte Address Mode. When the device operates in the 4-Byte Address Mode (ADS=1), any command with address input of A31-A24 will replace the Extended Address Register values. It is recommended to check and update the Extended Address Register if necessary when the device is switched from 4-Byte to 3-Byte Address Mode. So the documentation suggests clearing the EAR after switching to 3-byte mode.
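As an illustrative sketch (not from the patch or the datasheet, with simplified types), this is the addressing behaviour that makes a stale EAR harmful in 3-byte mode:

#include <stdint.h>

/*
 * Illustration only: in 3-byte mode a >16MB part supplies the address
 * bits A31-A24 from the Extended Address Register, so the effective
 * address is (EAR << 24) | addr. With EAR == 1, a read of offset
 * 0x000000 silently hits 0x1000000, i.e. the second 16MB.
 */
static uint32_t ear_effective_addr(uint8_t ear, uint32_t addr3b)
{
	return ((uint32_t)ear << 24) | (addr3b & 0x00ffffffu);
}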
Experimentation shows that the EAR is *always* one after the switch to 3-byte mode, so clearing the EAR is mandatory at shutdown for a subsequent 3-byte-addressed reboot to work. Note that some SOCs (e.g. MT7621) do not assert a reset line at normal reboot, so we cannot rely on hardware reset. The MT7621 does assert a reset line at watchdog-reset. Acked-by: Marek Vasut Signed-off-by: NeilBrown Signed-off-by: Boris Brezillon --- drivers/mtd/spi-nor/spi-nor.c | 14 ++++++++++++++ include/linux/mtd/spi-nor.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 9363f299e4ee..494b7a269872 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -284,6 +284,20 @@ static inline int set_4byte(struct spi_nor *nor, const struct flash_info *info, if (need_wren) write_disable(nor); + if (!status && !enable && + JEDEC_MFR(info) == SNOR_MFR_WINBOND) { + /* + * On Winbond W25Q256FV, leaving 4byte mode causes + * the Extended Address Register to be set to 1, so all + * 3-byte-address reads come from the second 16M. + * We must clear the register to enable normal behavior. + */ + write_enable(nor); + nor->cmd_buf[0] = 0; + nor->write_reg(nor, SPINOR_OP_WREAR, nor->cmd_buf, 1); + write_disable(nor); + } + return status; default: /* Spansion style */ diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index de36969eb359..e60da0d34cc1 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -62,6 +62,8 @@ #define SPINOR_OP_RDCR 0x35 /* Read configuration register */ #define SPINOR_OP_RDFSR 0x70 /* Read flag status register */ #define SPINOR_OP_CLFSR 0x50 /* Clear flag status register */ +#define SPINOR_OP_RDEAR 0xc8 /* Read Extended Address Register */ +#define SPINOR_OP_WREAR 0xc5 /* Write Extended Address Register */ /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */ #define SPINOR_OP_READ_4B 0x13 /* Read data bytes (low frequency) */ -- cgit v1.2.3 From 07cb9623ee7d15cc9968969ac247edae6972fb8f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:10 +0100 Subject: ipv6: make ip6_dst_mtu_forward inline Just like ip_dst_mtu_maybe_forward(), to avoid a dependency with ipv6.ko. 
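A minimal sketch of the pattern being applied here (hypothetical names, not the kernel API): a function exported from a module forces every caller to link against that module, while a static inline in a shared header is compiled into each caller and creates no module dependency:

/* shared header, illustrative only */
struct demo_dst { unsigned int mtu; };

static inline unsigned int demo_dst_mtu(const struct demo_dst *d)
{
	/*
	 * The body is duplicated into each translation unit that calls
	 * it, so no symbol has to be exported from any module.
	 */
	return d->mtu ? d->mtu : 1280;	/* 1280 == IPV6_MIN_MTU */
}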
Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/ip6_route.h | 21 +++++++++++++++++++++ include/net/ipv6.h | 2 -- net/ipv6/ip6_output.c | 22 ---------------------- 3 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d5fb1e4ae7ac..376928c26d2d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,6 +279,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info * !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 68b167d98879..765441867cfa 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -958,8 +958,6 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst); - int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3db47986ef38..cec49e137dbb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) -{ - unsigned int mtu; - struct inet6_dev *idev; - - if (dst_metric_locked(dst, RTAX_MTU)) { - mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - return mtu; - } - - mtu = IPV6_MIN_MTU; - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - - return mtu; -} -EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward); - static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) -- cgit v1.2.3 From 4f3780c004ef608477a534558eb16f46c5a1c534 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:11 +0100 Subject: netfilter: nf_flow_table: cache mtu in struct flow_offload_tuple Reduces the number of cache lines touched in the offload forwarding path. This is safe because PMTU limits are bypassed for the forwarding path (see commit f87c10a8aa1e for more details). 
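Roughly, and simplified from the hook code changed below, the per-packet MTU check goes from a metrics walk through the cached dst_entry to a single load from a structure the flow lookup has already touched:

	/* before: derive the MTU from the route on every packet */
	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);

	/* after: read the value cached in the tuple at flow-setup time */
	mtu = flow->tuplehash[dir].tuple.mtu;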
Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ net/ipv4/netfilter/nf_flow_table_ipv4.c | 17 +++-------------- net/ipv6/netfilter/nf_flow_table_ipv6.c | 17 +++-------------- net/netfilter/nf_flow_table.c | 8 ++++++-- 4 files changed, 14 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 09ba67598991..76ee5c81b752 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -55,6 +55,8 @@ struct flow_offload_tuple { int oifidx; + u16 mtu; + struct dst_entry *dst_cache; }; diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 0cd46bffa469..461b1815e633 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -178,7 +178,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, } /* Based on ip_exceeds_mtu(). */ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -192,17 +192,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt) -{ - u32 mtu; - - mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -233,9 +222,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; if (skb_try_make_writable(skb, sizeof(*iph))) diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index 207cb35569b1..0e6328490142 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -173,7 +173,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, } /* Based on ip_exceeds_mtu(). 
*/ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -184,17 +184,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt) -{ - u32 mtu; - - mtu = ip6_dst_mtu_forward(&rt->dst); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -225,9 +214,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; if (skb_try_make_writable(skb, sizeof(*ip6h))) diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c index db0673a40b97..7403a0dfddf7 100644 --- a/net/netfilter/nf_flow_table.c +++ b/net/netfilter/nf_flow_table.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +25,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; + struct dst_entry *dst = route->tuple[dir].dst; ft->dir = dir; @@ -30,10 +33,12 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; + ft->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; + ft->mtu = ip6_dst_mtu_forward(dst); break; } @@ -44,8 +49,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->iifidx = route->tuple[dir].ifindex; ft->oifidx = route->tuple[!dir].ifindex; - - ft->dst_cache = route->tuple[dir].dst; + ft->dst_cache = dst; } struct flow_offload * -- cgit v1.2.3 From 6a3e030f08e1b700aa6d1ebdc7ebe4c44a2ef67a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:57 -0700 Subject: net/ipv6: Clean up rt expires helpers rt6_clean_expires and rt6_set_expires are no longer used. Removed them. rt6_update_expires has 1 caller in route.c, so move it from the header. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 21 --------------------- net/ipv6/route.c | 9 +++++++++ 2 files changed, 9 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 642211174692..327a74cd6a0d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -223,27 +223,6 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } -static inline void rt6_clean_expires(struct rt6_info *rt) -{ - rt->rt6i_flags &= ~RTF_EXPIRES; - rt->dst.expires = 0; -} - -static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) -{ - rt->dst.expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; -} - -static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) -{ - if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) - rt0->dst.expires = rt0->from->expires; - - dst_set_expires(&rt0->dst, timeout); - rt0->rt6i_flags |= RTF_EXPIRES; -} - /* Function to safely get fn->sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 884d9cee790f..062dd4d8232c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2221,6 +2221,15 @@ static void ip6_link_failure(struct sk_buff *skb) } } +static void rt6_update_expires(struct rt6_info *rt0, int timeout) +{ + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = rt0->from->expires; + + dst_set_expires(&rt0->dst, timeout); + rt0->rt6i_flags |= RTF_EXPIRES; +} + static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); -- cgit v1.2.3 From a269f1a764bb3abf5c73499fb74bc7ab1c2e986c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:58 -0700 Subject: net/ipv6: Rename rt6_get_cookie_safe rt6_get_cookie_safe takes a fib6_info and checks the sernum of the node. Update the name to reflect its purpose. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 6 +++--- net/ipv6/route.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 327a74cd6a0d..dd1481ed8bdb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -228,8 +228,8 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, - u32 *cookie) +static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, + u32 *cookie) { struct fib6_node *fn; bool status = false; @@ -254,7 +254,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - rt6_get_cookie_safe(rt->from, &cookie); + fib6_get_cookie_safe(rt->from, &cookie); return cookie; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 062dd4d8232c..2d6fcfe11c82 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2128,7 +2128,7 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; - if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) || + if ((f6i && !fib6_get_cookie_safe(f6i, &rt_cookie)) || rt_cookie != cookie) return false; @@ -2142,7 +2142,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; - if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) || + if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) || rt_cookie != cookie) return NULL; -- cgit v1.2.3 From a87b7dc9f7fa7c87cbd575361553e8e9e4ab4473 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:00 -0700 Subject: net/ipv6: Move rcu locking to callers of fib6_get_cookie_safe A later patch protects 'from' in rt6_info and this simplifies the locking needed by it. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 6 ++++-- net/ipv6/route.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dd1481ed8bdb..dc3505fb62b3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -234,7 +234,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, struct fib6_node *fn; bool status = false; - rcu_read_lock(); fn = rcu_dereference(f6i->fib6_node); if (fn) { @@ -244,7 +243,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, status = true; } - rcu_read_unlock(); return status; } @@ -252,10 +250,14 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) { u32 cookie = 0; + rcu_read_lock(); + if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) fib6_get_cookie_safe(rt->from, &cookie); + rcu_read_unlock(); + return cookie; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f8b22183b7fd..810a37ac043b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2159,9 +2159,12 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { + struct dst_entry *dst_ret; struct rt6_info *rt; - rt = (struct rt6_info *) dst; + rt = container_of(dst, struct rt6_info, dst); + + rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -2170,9 +2173,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - return rt6_dst_from_check(rt, cookie); + dst_ret = rt6_dst_from_check(rt, cookie); else - return rt6_check(rt, cookie); + dst_ret = rt6_check(rt, cookie); + + rcu_read_unlock(); + + return dst_ret; } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) -- cgit v1.2.3 From a68886a691804d3f6d479ebf6825480fbafb6a00 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:02 -0700 Subject: net/ipv6: Make from in rt6_info rcu protected When a dst entry is created from a fib entry, the 'from' in rt6_info is set to the fib entry. The 'from' reference is used most notably for cookie checking - making sure stale dst entries are updated if the fib entry is changed. When a fib entry is deleted, the pcpu routes on it are walked releasing the fib6_info reference. This is needed for the fib6_info cleanup to happen and to make sure all device references are released in a timely manner. There is a race window when a FIB entry is deleted and the 'from' on the pcpu route is dropped and the pcpu route hits a cookie check. Handle this race using rcu on from. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 10 +++--- net/ipv6/ip6_fib.c | 8 +++-- net/ipv6/ip6_output.c | 9 +++-- net/ipv6/route.c | 96 ++++++++++++++++++++++++++++++++++++--------------- 4 files changed, 88 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dc3505fb62b3..1af450d4e923 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,7 +174,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; - struct fib6_info *from; + struct fib6_info __rcu *from; struct rt6key rt6i_dst; struct rt6key rt6i_src; @@ -248,13 +248,15 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + struct fib6_info *from; u32 cookie = 0; rcu_read_lock(); - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - fib6_get_cookie_safe(rt->from, &cookie); + from = rcu_dereference(rt->from); + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 64ca02b7745f..6421c893466e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -875,8 +875,12 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - fib6_info_release(pcpu_rt->from); - pcpu_rt->from = NULL; + struct fib6_info *from; + + from = rcu_dereference_protected(pcpu_rt->from, + lockdep_is_held(&table->tb6_lock)); + rcu_assign_pointer(pcpu_rt->from, NULL); + fib6_info_release(from); } } } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3db47986ef38..6a477d54f8c7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -962,16 +962,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * that's why we try it again later. */ if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + struct fib6_info *from; struct rt6_info *rt; bool had_dst = *dst != NULL; if (!had_dst) *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; - err = ip6_route_get_saddr(net, rt ? rt->from : NULL, - &fl6->daddr, + + rcu_read_lock(); + from = rt ? rcu_dereference(rt->from) : NULL; + err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? 
inet6_sk(sk)->srcprefs : 0, &fl6->saddr); + rcu_read_unlock(); + if (err) goto out_err_release; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 810a37ac043b..004d00fe2fe5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -359,7 +359,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct fib6_info *from = rt->from; + struct fib6_info *from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -371,8 +371,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - rt->from = NULL; + rcu_read_lock(); + from = rcu_dereference(rt->from); + rcu_assign_pointer(rt->from, NULL); fib6_info_release(from); + rcu_read_unlock(); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -402,12 +405,16 @@ static bool __rt6_check_expired(const struct rt6_info *rt) static bool rt6_check_expired(const struct rt6_info *rt) { + struct fib6_info *from; + + from = rcu_dereference(rt->from); + if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->from) { + } else if (from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - fib6_check_expired(rt->from); + fib6_check_expired(from); } return false; } @@ -963,7 +970,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; fib6_info_hold(from); - rt->from = from; + rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; @@ -2133,11 +2140,13 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie) return true; } -static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { u32 rt_cookie = 0; - if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) || + if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || rt_cookie != cookie) return NULL; @@ -2147,11 +2156,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) return &rt->dst; } -static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - fib6_check(rt->from, cookie)) + fib6_check(from, cookie)) return &rt->dst; else return NULL; @@ -2160,6 +2171,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct dst_entry *dst_ret; + struct fib6_info *from; struct rt6_info *rt; rt = container_of(dst, struct rt6_info, dst); @@ -2171,11 +2183,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * into this function always. 
*/ - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - dst_ret = rt6_dst_from_check(rt, cookie); + from = rcu_dereference(rt->from); + + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + dst_ret = rt6_dst_from_check(rt, from, cookie); else - dst_ret = rt6_check(rt, cookie); + dst_ret = rt6_check(rt, from, cookie); rcu_read_unlock(); @@ -2211,13 +2225,17 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) rt6_remove_exception_rt(rt); - } else if (rt->from) { + } else { + struct fib6_info *from; struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->from->fib6_node); - if (fn && (rt->rt6i_flags & RTF_DEFAULT)) - fn->fn_sernum = -1; + from = rcu_dereference(rt->from); + if (from) { + fn = rcu_dereference(from->fib6_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + } rcu_read_unlock(); } } @@ -2225,8 +2243,15 @@ static void ip6_link_failure(struct sk_buff *skb) static void rt6_update_expires(struct rt6_info *rt0, int timeout) { - if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) - rt0->dst.expires = rt0->from->expires; + if (!(rt0->rt6i_flags & RTF_EXPIRES)) { + struct fib6_info *from; + + rcu_read_lock(); + from = rcu_dereference(rt0->from); + if (from) + rt0->dst.expires = from->expires; + rcu_read_unlock(); + } dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; @@ -2243,8 +2268,14 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { + bool from_set; + + rcu_read_lock(); + from_set = !!rcu_dereference(rt->from); + rcu_read_unlock(); + return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->from); + (rt->rt6i_flags & RTF_PCPU || from_set); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, @@ -2280,16 +2311,18 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { + struct fib6_info *from; struct rt6_info *nrt6; rcu_read_lock(); - nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); - rcu_read_unlock(); + from = rcu_dereference(rt6->from); + nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, rt6->from)) + if (rt6_insert_exception(nrt6, from)) dst_release_immediate(&nrt6->dst); } + rcu_read_unlock(); } } @@ -3222,6 +3255,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; + struct fib6_info *from; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; @@ -3304,7 +3338,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NDISC_REDIRECT, &ndopts); rcu_read_lock(); - nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL); + from = rcu_dereference(rt->from); + nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); rcu_read_unlock(); if (!nrt) goto out; @@ -4687,6 +4722,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; int err, iif = 0, oif = 0; + struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; @@ -4783,15 +4819,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } skb_dst_set(skb, 
&rt->dst); + + rcu_read_lock(); + from = rcu_dereference(rt->from); + if (fibmatch) - err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif, + err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt->from, dst, - &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, + err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, + &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); + rcu_read_unlock(); + if (err < 0) { kfree_skb(skb); goto errout; -- cgit v1.2.3 From a47060cf40c6c8e6e125692f5d73ceea173b12b7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 6 Apr 2018 15:31:49 +0200 Subject: usb: audio-v2: Correct the comment for struct uac_clock_selector_descriptor The comment in UAC2 clock selector descriptor definition mentions the bAssocTerminal after baCSourceID[], but it doesn't exist in the actual definition. Let's correct it. Fixes: 5dd360ebd832 ("include/linux/usb/audio-v2.h: add more UAC2 details") Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/audio-v2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index aaafecf073ff..49699255cfd3 100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -94,7 +94,7 @@ struct uac_clock_selector_descriptor { __u8 bClockID; __u8 bNrInPins; __u8 baCSourceID[]; - /* bmControls, bAssocTerminal and iClockSource omitted */ + /* bmControls and iClockSource omitted */ } __attribute__((packed)); /* 4.7.2.3 Clock Multiplier Descriptor */ -- cgit v1.2.3 From 60b9f942bc059913bcdac90d5b225f557438b5c5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 18 Apr 2018 11:26:19 +0200 Subject: USB: phy: drop unused legacy controller-phy bind helper Drop the unused legacy usb_bind_phy() helper whose last user was removed in 2016 when OMAP moved to device-tree boot (9080b8dc761a ("ARM: OMAP2+: Remove legacy usb-host.c platform init code")). Note that this means that for the last couple of years the phy_bind_list has been empty (when using mainline kernels) and that consequently all phy lookups using the usb_get_phy_dev() interface have failed with -ENODEV. This helper along with its current users will be removed by follow-on patches. Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/phy.c | 34 ---------------------------------- include/linux/usb/phy.h | 8 -------- 2 files changed, 42 deletions(-) (limited to 'include') diff --git a/drivers/usb/phy/phy.c b/drivers/usb/phy/phy.c index bceb2c9988dd..833547b00383 100644 --- a/drivers/usb/phy/phy.c +++ b/drivers/usb/phy/phy.c @@ -795,40 +795,6 @@ void usb_remove_phy(struct usb_phy *x) } EXPORT_SYMBOL_GPL(usb_remove_phy); -/** - * usb_bind_phy - bind the phy and the controller that uses the phy - * @dev_name: the device name of the device that will bind to the phy - * @index: index to specify the port number - * @phy_dev_name: the device name of the phy - * - * Fills the phy_bind structure with the dev_name and phy_dev_name. This will - * be used when the phy driver registers the phy and when the controller - * requests this phy. - * - * To be used by platform specific initialization code. 
- */ -int usb_bind_phy(const char *dev_name, u8 index, - const char *phy_dev_name) -{ - struct usb_phy_bind *phy_bind; - unsigned long flags; - - phy_bind = kzalloc(sizeof(*phy_bind), GFP_KERNEL); - if (!phy_bind) - return -ENOMEM; - - phy_bind->dev_name = dev_name; - phy_bind->phy_dev_name = phy_dev_name; - phy_bind->index = index; - - spin_lock_irqsave(&phy_lock, flags); - list_add_tail(&phy_bind->list, &phy_bind_list); - spin_unlock_irqrestore(&phy_lock, flags); - - return 0; -} -EXPORT_SYMBOL_GPL(usb_bind_phy); - /** * usb_phy_set_event - set event to phy event * @x: the phy returned by usb_get_phy(); diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index b7a2625947f5..ac5a079161e1 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -242,8 +242,6 @@ extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); -extern int usb_bind_phy(const char *dev_name, u8 index, - const char *phy_dev_name); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); @@ -293,12 +291,6 @@ static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) { } -static inline int usb_bind_phy(const char *dev_name, u8 index, - const char *phy_dev_name) -{ - return -EOPNOTSUPP; -} - static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } -- cgit v1.2.3 From bc40f53417410be18298c5b5dbf5bcae9588d84f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 18 Apr 2018 11:26:20 +0200 Subject: USB: core: hcd: drop support for legacy phys Drop support for looking up and initialising legacy phys in USB core, something which hasn't been used by a mainline kernel since commit 9080b8dc761a ("ARM: OMAP2+: Remove legacy usb-host.c platform init code"). Specifically, since that commit usb_get_phy_dev() has always returned -ENODEV and consequently this code has not been used.
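Put differently, here is a simplified excerpt of the logic being deleted from usb_add_hcd() below (not a verbatim quote): the lookup could never succeed, so only the error branch ever ran:

	phy = usb_get_phy_dev(hcd->self.sysdev, 0); /* always ERR_PTR(-ENODEV) */
	if (IS_ERR(phy)) {
		if (PTR_ERR(phy) == -EPROBE_DEFER)	/* never true here */
			return PTR_ERR(phy);
		/* fall through: no phy to initialise */
	}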
Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 38 +++----------------------------------- include/linux/usb/hcd.h | 1 - 2 files changed, 3 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 777036ae6367..6241d32c5ba7 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include "usb.h" @@ -2739,30 +2738,10 @@ int usb_add_hcd(struct usb_hcd *hcd, int retval; struct usb_device *rhdev; - if (IS_ENABLED(CONFIG_USB_PHY) && !hcd->skip_phy_initialization) { - struct usb_phy *phy = usb_get_phy_dev(hcd->self.sysdev, 0); - - if (IS_ERR(phy)) { - retval = PTR_ERR(phy); - if (retval == -EPROBE_DEFER) - return retval; - } else { - retval = usb_phy_init(phy); - if (retval) { - usb_put_phy(phy); - return retval; - } - hcd->usb_phy = phy; - hcd->remove_phy = 1; - } - } - if (!hcd->skip_phy_initialization && usb_hcd_is_primary_hcd(hcd)) { hcd->phy_roothub = usb_phy_roothub_init(hcd->self.sysdev); - if (IS_ERR(hcd->phy_roothub)) { - retval = PTR_ERR(hcd->phy_roothub); - goto err_phy_roothub_init; - } + if (IS_ERR(hcd->phy_roothub)) + return PTR_ERR(hcd->phy_roothub); retval = usb_phy_roothub_power_on(hcd->phy_roothub); if (retval) @@ -2936,12 +2915,7 @@ err_create_buf: usb_phy_roothub_power_off(hcd->phy_roothub); err_usb_phy_roothub_power_on: usb_phy_roothub_exit(hcd->phy_roothub); -err_phy_roothub_init: - if (hcd->remove_phy && hcd->usb_phy) { - usb_phy_shutdown(hcd->usb_phy); - usb_put_phy(hcd->usb_phy); - hcd->usb_phy = NULL; - } + return retval; } EXPORT_SYMBOL_GPL(usb_add_hcd); @@ -3017,12 +2991,6 @@ void usb_remove_hcd(struct usb_hcd *hcd) usb_phy_roothub_power_off(hcd->phy_roothub); usb_phy_roothub_exit(hcd->phy_roothub); - if (hcd->remove_phy && hcd->usb_phy) { - usb_phy_shutdown(hcd->usb_phy); - usb_put_phy(hcd->usb_phy); - hcd->usb_phy = NULL; - } - usb_put_invalidate_rhdev(hcd); hcd->flags = 0; } diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index aef50cb2ed1b..e33009c77840 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -150,7 +150,6 @@ struct usb_hcd { unsigned rh_pollable:1; /* may we poll the root hub? */ unsigned msix_enabled:1; /* driver has MSI-X enabled? */ unsigned msi_enabled:1; /* driver has MSI enabled? */ - unsigned remove_phy:1; /* auto-remove USB phy */ /* * do not manage the PHY state in the HCD core, instead let the driver * handle this (for example if the PHY can only be turned on after a -- cgit v1.2.3 From c3c0ac70c77d34c03e9600170932b2c28478795f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 18 Apr 2018 11:26:24 +0200 Subject: USB: phy: drop legacy board-file support The legacy interface for associating controllers with phys from board files and platform code has been unused since commit 9080b8dc761a ("ARM: OMAP2+: Remove legacy usb-host.c platform init code"). Since then, all calls to usb_get_phy_dev() and its devres version have been returning -ENODEV. Now that the final calls to these functions have been removed, we can drop this legacy lookup interface altogether. 
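For context, a sketch of the two halves of the contract being removed; the call sites are illustrative, patterned on the old OMAP board code rather than taken from this patch:

	/* board file, at platform init time: record the binding */
	usb_bind_phy("musb-hdrc.0.auto", 0, "twl4030_usb");

	/* controller driver, at probe time: look the phy up by index */
	struct usb_phy *phy = devm_usb_get_phy_dev(dev, 0);

	if (IS_ERR(phy))
		return PTR_ERR(phy);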
Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/phy.c | 99 +------------------------------------------------ include/linux/usb/phy.h | 28 -------------- 2 files changed, 2 insertions(+), 125 deletions(-) (limited to 'include') diff --git a/drivers/usb/phy/phy.c b/drivers/usb/phy/phy.c index 833547b00383..0277f62739a2 100644 --- a/drivers/usb/phy/phy.c +++ b/drivers/usb/phy/phy.c @@ -27,7 +27,6 @@ #define DEFAULT_ACA_CUR_MAX 5000 static LIST_HEAD(phy_list); -static LIST_HEAD(phy_bind_list); static DEFINE_SPINLOCK(phy_lock); struct phy_devm { @@ -50,24 +49,6 @@ static struct usb_phy *__usb_find_phy(struct list_head *list, return ERR_PTR(-ENODEV); } -static struct usb_phy *__usb_find_phy_dev(struct device *dev, - struct list_head *list, u8 index) -{ - struct usb_phy_bind *phy_bind = NULL; - - list_for_each_entry(phy_bind, list, list) { - if (!(strcmp(phy_bind->dev_name, dev_name(dev))) && - phy_bind->index == index) { - if (phy_bind->phy) - return phy_bind->phy; - else - return ERR_PTR(-EPROBE_DEFER); - } - } - - return ERR_PTR(-ENODEV); -} - static struct usb_phy *__of_usb_find_phy(struct device_node *node) { struct usb_phy *phy; @@ -584,72 +565,6 @@ struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, } EXPORT_SYMBOL_GPL(devm_usb_get_phy_by_phandle); -/** - * usb_get_phy_dev - find the USB PHY - * @dev - device that requests this phy - * @index - the index of the phy - * - * Returns the phy driver, after getting a refcount to it; or - * -ENODEV if there is no such phy. The caller is responsible for - * calling usb_put_phy() to release that count. - * - * For use by USB host and peripheral drivers. - */ -struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index) -{ - struct usb_phy *phy = NULL; - unsigned long flags; - - spin_lock_irqsave(&phy_lock, flags); - - phy = __usb_find_phy_dev(dev, &phy_bind_list, index); - if (IS_ERR(phy) || !try_module_get(phy->dev->driver->owner)) { - dev_dbg(dev, "unable to find transceiver\n"); - if (!IS_ERR(phy)) - phy = ERR_PTR(-ENODEV); - - goto err0; - } - - get_device(phy->dev); - -err0: - spin_unlock_irqrestore(&phy_lock, flags); - - return phy; -} -EXPORT_SYMBOL_GPL(usb_get_phy_dev); - -/** - * devm_usb_get_phy_dev - find the USB PHY using device ptr and index - * @dev - device that requests this phy - * @index - the index of the phy - * - * Gets the phy using usb_get_phy_dev(), and associates a device with it using - * devres. On driver detach, release function is invoked on the devres data, - * then, devres data is freed. - * - * For use by USB host and peripheral drivers. 
- */ -struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index) -{ - struct usb_phy **ptr, *phy; - - ptr = devres_alloc(devm_usb_phy_release, sizeof(*ptr), GFP_KERNEL); - if (!ptr) - return NULL; - - phy = usb_get_phy_dev(dev, index); - if (!IS_ERR(phy)) { - *ptr = phy; - devres_add(dev, ptr); - } else - devres_free(ptr); - - return phy; -} -EXPORT_SYMBOL_GPL(devm_usb_get_phy_dev); - /** * devm_usb_put_phy - release the USB PHY * @dev - device that wants to release this phy @@ -745,7 +660,6 @@ EXPORT_SYMBOL_GPL(usb_add_phy); */ int usb_add_phy_dev(struct usb_phy *x) { - struct usb_phy_bind *phy_bind; unsigned long flags; int ret; @@ -762,13 +676,9 @@ int usb_add_phy_dev(struct usb_phy *x) ATOMIC_INIT_NOTIFIER_HEAD(&x->notifier); spin_lock_irqsave(&phy_lock, flags); - list_for_each_entry(phy_bind, &phy_bind_list, list) - if (!(strcmp(phy_bind->phy_dev_name, dev_name(x->dev)))) - phy_bind->phy = x; - list_add_tail(&x->head, &phy_list); - spin_unlock_irqrestore(&phy_lock, flags); + return 0; } EXPORT_SYMBOL_GPL(usb_add_phy_dev); @@ -782,15 +692,10 @@ EXPORT_SYMBOL_GPL(usb_add_phy_dev); void usb_remove_phy(struct usb_phy *x) { unsigned long flags; - struct usb_phy_bind *phy_bind; spin_lock_irqsave(&phy_lock, flags); - if (x) { - list_for_each_entry(phy_bind, &phy_bind_list, list) - if (phy_bind->phy == x) - phy_bind->phy = NULL; + if (x) list_del(&x->head); - } spin_unlock_irqrestore(&phy_lock, flags); } EXPORT_SYMBOL_GPL(usb_remove_phy); diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index ac5a079161e1..e4de6bc1f69b 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -157,22 +157,6 @@ struct usb_phy { enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; -/** - * struct usb_phy_bind - represent the binding for the phy - * @dev_name: the device name of the device that will bind to the phy - * @phy_dev_name: the device name of the phy - * @index: used if a single controller uses multiple phys - * @phy: reference to the phy - * @list: to maintain a linked list of the binding information - */ -struct usb_phy_bind { - const char *dev_name; - const char *phy_dev_name; - u8 index; - struct usb_phy *phy; - struct list_head list; -}; - /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); @@ -234,8 +218,6 @@ usb_phy_vbus_off(struct usb_phy *x) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); -extern struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index); -extern struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, @@ -261,16 +243,6 @@ static inline struct usb_phy *devm_usb_get_phy(struct device *dev, return ERR_PTR(-ENXIO); } -static inline struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index) -{ - return ERR_PTR(-ENXIO); -} - -static inline struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index) -{ - return ERR_PTR(-ENXIO); -} - static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { -- cgit v1.2.3 From e8374db15436648411d933b01d929acae0538775 Mon Sep 17 00:00:00 2001 From: Li Jun Date: Mon, 16 Apr 2018 14:54:37 +0800 Subject: usb: typec: tcpm: remove max_snk_mv/ma/mw Since there is no user of 
max_snk_*, we can remove them from tcpm. Reviewed-by: Hans de Goede Signed-off-by: Li Jun Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm.c | 12 ------------ include/linux/usb/tcpm.h | 9 --------- 2 files changed, 21 deletions(-) (limited to 'include') diff --git a/drivers/usb/typec/tcpm.c b/drivers/usb/typec/tcpm.c index 048b9532fd39..27192083f8e6 100644 --- a/drivers/usb/typec/tcpm.c +++ b/drivers/usb/typec/tcpm.c @@ -257,9 +257,6 @@ struct tcpm_port { u32 snk_vdo[VDO_MAX_OBJECTS]; unsigned int nr_snk_vdo; - unsigned int max_snk_mv; - unsigned int max_snk_ma; - unsigned int max_snk_mw; unsigned int operating_snk_mw; /* Requested current / voltage */ @@ -3598,9 +3595,6 @@ EXPORT_SYMBOL_GPL(tcpm_update_source_capabilities); int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, unsigned int nr_pdo, - unsigned int max_snk_mv, - unsigned int max_snk_ma, - unsigned int max_snk_mw, unsigned int operating_snk_mw) { if (tcpm_validate_caps(port, pdo, nr_pdo)) @@ -3608,9 +3602,6 @@ int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, mutex_lock(&port->lock); port->nr_snk_pdo = tcpm_copy_pdos(port->snk_pdo, pdo, nr_pdo); - port->max_snk_mv = max_snk_mv; - port->max_snk_ma = max_snk_ma; - port->max_snk_mw = max_snk_mw; port->operating_snk_mw = operating_snk_mw; switch (port->state) { @@ -3676,9 +3667,6 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) port->nr_snk_vdo = tcpm_copy_vdos(port->snk_vdo, tcpc->config->snk_vdo, tcpc->config->nr_snk_vdo); - port->max_snk_mv = tcpc->config->max_snk_mv; - port->max_snk_ma = tcpc->config->max_snk_ma; - port->max_snk_mw = tcpc->config->max_snk_mw; port->operating_snk_mw = tcpc->config->operating_snk_mw; if (!tcpc->config->try_role_hw) port->try_role = tcpc->config->default_role; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index f0d839daeaea..f5bda9a0f644 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -62,9 +62,6 @@ enum tcpm_transmit_type { * @snk_pdo: PDO parameters sent to partner as response to * PD_CTRL_GET_SINK_CAP message * @nr_snk_pdo: Number of entries in @snk_pdo - * @max_snk_mv: Maximum acceptable sink voltage in mV - * @max_snk_ma: Maximum sink current in mA - * @max_snk_mw: Maximum required sink power in mW * @operating_snk_mw: * Required operating sink power in mW * @type: Port type (TYPEC_PORT_DFP, TYPEC_PORT_UFP, or @@ -85,9 +82,6 @@ struct tcpc_config { const u32 *snk_vdo; unsigned int nr_snk_vdo; - unsigned int max_snk_mv; - unsigned int max_snk_ma; - unsigned int max_snk_mw; unsigned int operating_snk_mw; enum typec_port_type type; @@ -174,9 +168,6 @@ int tcpm_update_source_capabilities(struct tcpm_port *port, const u32 *pdo, unsigned int nr_pdo); int tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, unsigned int nr_pdo, - unsigned int max_snk_mv, - unsigned int max_snk_ma, - unsigned int max_snk_mw, unsigned int operating_snk_mw); void tcpm_vbus_change(struct tcpm_port *port); -- cgit v1.2.3 From ffe95371d2a84f3ad8085656d4fcb2fc926ff7a1 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 19 Apr 2018 19:05:50 +0300 Subject: usb: define HCD_USB32 speed option for hosts that support USB 3.2 dual-lane Hosts that support USB 3.2 Enhanced SuperSpeed can set their hcd speed to HCD_USB32 to let usb core and host drivers know that the controller supports new USB 3.2 dual-lane features.
Make sure the USB core handles HCD_USB32 hosts correctly; for now they are treated the same as HCD_USB31. Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 3 +++ include/linux/usb/hcd.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 6241d32c5ba7..5a799f84dcc2 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -567,6 +567,7 @@ static int rh_call_control (struct usb_hcd *hcd, struct urb *urb) switch (wValue & 0xff00) { case USB_DT_DEVICE << 8: switch (hcd->speed) { + case HCD_USB32: case HCD_USB31: bufp = usb31_rh_dev_descriptor; break; @@ -591,6 +592,7 @@ static int rh_call_control (struct usb_hcd *hcd, struct urb *urb) break; case USB_DT_CONFIG << 8: switch (hcd->speed) { + case HCD_USB32: case HCD_USB31: case HCD_USB3: bufp = ss_rh_config_descriptor; break; @@ -2804,6 +2806,7 @@ int usb_add_hcd(struct usb_hcd *hcd, case HCD_USB3: rhdev->speed = USB_SPEED_SUPER; break; + case HCD_USB32: case HCD_USB31: rhdev->speed = USB_SPEED_SUPER_PLUS; break; diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index e33009c77840..34a6ded6f319 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -260,6 +260,7 @@ struct hc_driver { #define HCD_USB25 0x0030 /* Wireless USB 1.0 (USB 2.5)*/ #define HCD_USB3 0x0040 /* USB 3.0 */ #define HCD_USB31 0x0050 /* USB 3.1 */ +#define HCD_USB32 0x0060 /* USB 3.2 */ #define HCD_MASK 0x0070 #define HCD_BH 0x0100 /* URB complete in BH context */ -- cgit v1.2.3 From 013eedb8c56e55549c45df19ca56a029cc804028 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 19 Apr 2018 19:05:51 +0300 Subject: USB: Add support to store lane count used by USB 3.2 USB 3.2 specification adds Dual-lane support, doubling the maximum SuperSpeedPlus data rate from 10Gbps to 20Gbps. Dual-lane takes into use a second set of rx and tx wires/pins in the Type-C cable and connector. Add "rx_lanes" and "tx_lanes" variables to struct usb_device to store the number of lanes in use. Number of lanes can be read using the extended port status hub request that was introduced in USB 3.1. Extended port status rx and tx lane counts are zero based; the maximum number of lanes supported by non-inter-chip (SSIC) USB 3.2 is 2 (dual lane), with rx and tx lane counts symmetric. SSIC devices support asymmetric lanes up to 4 lanes per direction. If extended port status is not available then default to one lane.
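As a worked example of the new accessors, the lane fields in the extended port status are zero based, so a reader adds one; this sketch mirrors the hub.c hunk below (the helper name is made up):

#include <linux/types.h>
#include <linux/usb/ch11.h>

static void example_decode_lanes(u32 ext_portstatus,
				 unsigned int *rx_lanes,
				 unsigned int *tx_lanes)
{
	/* Extended port status lane counts are zero based; add one. */
	*rx_lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1;
	*tx_lanes = USB_EXT_PORT_TX_LANES(ext_portstatus) + 1;
}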
Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 8 ++++++++ include/linux/usb.h | 4 ++++ include/uapi/linux/usb/ch11.h | 5 +++++ 3 files changed, 17 insertions(+) (limited to 'include') diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 9e79bcf03cde..e21cc3be18b7 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2746,6 +2746,14 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1, if (!udev) return 0; + if (hub_is_superspeedplus(hub->hdev)) { + /* extended portstatus Rx and Tx lane count are zero based */ + udev->rx_lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1; + udev->tx_lanes = USB_EXT_PORT_TX_LANES(ext_portstatus) + 1; + } else { + udev->rx_lanes = 1; + udev->tx_lanes = 1; + } if (hub_is_wusb(hub)) udev->speed = USB_SPEED_WIRELESS; else if (hub_is_superspeedplus(hub->hdev) && diff --git a/include/linux/usb.h b/include/linux/usb.h index 0173597e59aa..beffceec4915 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -551,6 +551,8 @@ struct usb3_lpm_parameters { * @route: tree topology hex string for use with xHCI * @state: device state: configured, not attached, etc. * @speed: device speed: high/full/low (or error) + * @rx_lanes: number of rx lanes in use, USB 3.2 adds dual-lane support + * @tx_lanes: number of tx lanes in use, USB 3.2 adds dual-lane support * @tt: Transaction Translator info; used with low/full speed dev, highspeed hub * @ttport: device port on that tt hub * @toggle: one bit for each endpoint, with ([0] = IN, [1] = OUT) endpoints @@ -624,6 +626,8 @@ struct usb_device { u32 route; enum usb_device_state state; enum usb_device_speed speed; + unsigned int rx_lanes; + unsigned int tx_lanes; struct usb_tt *tt; int ttport; diff --git a/include/uapi/linux/usb/ch11.h b/include/uapi/linux/usb/ch11.h index 29c120c88747..fb0cd24c392c 100644 --- a/include/uapi/linux/usb/ch11.h +++ b/include/uapi/linux/usb/ch11.h @@ -197,6 +197,11 @@ struct usb_port_status { #define USB_EXT_PORT_STAT_RX_LANES 0x00000f00 #define USB_EXT_PORT_STAT_TX_LANES 0x0000f000 +#define USB_EXT_PORT_RX_LANES(p) \ + (((p) & USB_EXT_PORT_STAT_RX_LANES) >> 8) +#define USB_EXT_PORT_TX_LANES(p) \ + (((p) & USB_EXT_PORT_STAT_TX_LANES) >> 12) + /* * wHubCharacteristics (masks) * See USB 2.0 spec Table 11-13, offset 3 -- cgit v1.2.3 From 143470368efd3f0fb4cbe81aa89f49de8048d8a9 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 Apr 2018 01:02:58 +0300 Subject: usb: tegra: Move utmi-pads reset from ehci-tegra to tegra-phy UTMI pads are shared by the USB controllers, and the reset of the UTMI pads is shared with the reset of the USB1 controller. Currently, reset of the UTMI pads is done by the EHCI driver, and the ChipIdea UDC works only because the EHCI driver always happens to be probed first. Move reset controls from ehci-tegra to tegra-phy in order to resolve the problem.
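The reset sequence adopted below can be summarised as follows: with a shared reset line, one extra deassert "initializes" the shared reference count before the usual assert/delay/deassert cycle. A minimal sketch of that pattern (the function name is illustrative):

#include <linux/delay.h>
#include <linux/reset.h>

static int example_shared_reset_cycle(struct reset_control *rst)
{
	int err;

	/* The reset control is shared, hence initialize it first. */
	err = reset_control_deassert(rst);
	if (err)
		return err;

	err = reset_control_assert(rst);
	if (err)
		return err;

	udelay(1);	/* minimum assertion time */

	return reset_control_deassert(rst);
}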
Signed-off-by: Dmitry Osipenko Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-tegra.c | 87 ++++++++++++++++++--------------------- drivers/usb/phy/phy-tegra-usb.c | 79 ++++++++++++++++++++++++++++++++--- include/linux/usb/tegra_usb_phy.h | 2 + 3 files changed, 115 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/usb/host/ehci-tegra.c b/drivers/usb/host/ehci-tegra.c index a6f4389f7e88..4d2cdec4cb78 100644 --- a/drivers/usb/host/ehci-tegra.c +++ b/drivers/usb/host/ehci-tegra.c @@ -36,7 +36,6 @@ #define DRV_NAME "tegra-ehci" static struct hc_driver __read_mostly tegra_ehci_hc_driver; -static bool usb1_reset_attempted; struct tegra_ehci_soc_config { bool has_hostpc; @@ -51,67 +50,54 @@ struct tegra_ehci_hcd { enum tegra_usb_phy_port_speed port_speed; }; -/* - * The 1st USB controller contains some UTMI pad registers that are global for - * all the controllers on the chip. Those registers are also cleared when - * reset is asserted to the 1st controller. This means that the 1st controller - * can only be reset when no other controlled has finished probing. So we'll - * reset the 1st controller before doing any other setup on any of the - * controllers, and then never again. - * - * Since this is a PHY issue, the Tegra PHY driver should probably be doing - * the resetting of the USB controllers. But to keep compatibility with old - * device trees that don't have reset phandles in the PHYs, do it here. - * Those old DTs will be vulnerable to total USB breakage if the 1st EHCI - * device isn't the first one to finish probing, so warn them. - */ static int tegra_reset_usb_controller(struct platform_device *pdev) { struct device_node *phy_np; struct usb_hcd *hcd = platform_get_drvdata(pdev); struct tegra_ehci_hcd *tegra = (struct tegra_ehci_hcd *)hcd_to_ehci(hcd)->priv; - bool has_utmi_pad_registers = false; + struct reset_control *rst; + int err; phy_np = of_parse_phandle(pdev->dev.of_node, "nvidia,phy", 0); if (!phy_np) return -ENOENT; - if (of_property_read_bool(phy_np, "nvidia,has-utmi-pad-registers")) - has_utmi_pad_registers = true; + /* + * The 1st USB controller contains some UTMI pad registers that are + * global for all the controllers on the chip. Those registers are + * also cleared when reset is asserted to the 1st controller. + */ + rst = of_reset_control_get_shared(phy_np, "utmi-pads"); + if (IS_ERR(rst)) { + dev_warn(&pdev->dev, + "can't get utmi-pads reset from the PHY\n"); + dev_warn(&pdev->dev, + "continuing, but please update your DT\n"); + } else { + /* + * PHY driver performs UTMI-pads reset in a case of + * non-legacy DT. 
+ */ + reset_control_put(rst); + } - if (!usb1_reset_attempted) { - struct reset_control *usb1_reset; + of_node_put(phy_np); - if (!has_utmi_pad_registers) - usb1_reset = of_reset_control_get(phy_np, "utmi-pads"); - else - usb1_reset = tegra->rst; - - if (IS_ERR(usb1_reset)) { - dev_warn(&pdev->dev, - "can't get utmi-pads reset from the PHY\n"); - dev_warn(&pdev->dev, - "continuing, but please update your DT\n"); - } else { - reset_control_assert(usb1_reset); - udelay(1); - reset_control_deassert(usb1_reset); - - if (!has_utmi_pad_registers) - reset_control_put(usb1_reset); - } + /* reset control is shared, hence initialize it first */ + err = reset_control_deassert(tegra->rst); + if (err) + return err; - usb1_reset_attempted = true; - } + err = reset_control_assert(tegra->rst); + if (err) + return err; - if (!has_utmi_pad_registers) { - reset_control_assert(tegra->rst); - udelay(1); - reset_control_deassert(tegra->rst); - } + udelay(1); - of_node_put(phy_np); + err = reset_control_deassert(tegra->rst); + if (err) + return err; return 0; } @@ -440,7 +426,7 @@ static int tegra_ehci_probe(struct platform_device *pdev) goto cleanup_hcd_create; } - tegra->rst = devm_reset_control_get(&pdev->dev, "usb"); + tegra->rst = devm_reset_control_get_shared(&pdev->dev, "usb"); if (IS_ERR(tegra->rst)) { dev_err(&pdev->dev, "Can't get ehci reset\n"); err = PTR_ERR(tegra->rst); @@ -452,8 +438,10 @@ static int tegra_ehci_probe(struct platform_device *pdev) goto cleanup_hcd_create; err = tegra_reset_usb_controller(pdev); - if (err) + if (err) { + dev_err(&pdev->dev, "Failed to reset controller\n"); goto cleanup_clk_en; + } u_phy = devm_usb_get_phy_by_phandle(&pdev->dev, "nvidia,phy", 0); if (IS_ERR(u_phy)) { @@ -538,6 +526,9 @@ static int tegra_ehci_remove(struct platform_device *pdev) usb_phy_shutdown(hcd->usb_phy); usb_remove_hcd(hcd); + reset_control_assert(tegra->rst); + udelay(1); + clk_disable_unprepare(tegra->clk); usb_put_hcd(hcd); diff --git a/drivers/usb/phy/phy-tegra-usb.c b/drivers/usb/phy/phy-tegra-usb.c index e46219e7fa93..ea7ef1dc0b42 100644 --- a/drivers/usb/phy/phy-tegra-usb.c +++ b/drivers/usb/phy/phy-tegra-usb.c @@ -236,17 +236,83 @@ static void set_phcd(struct tegra_usb_phy *phy, bool enable) static int utmip_pad_open(struct tegra_usb_phy *phy) { - int err; + int ret; phy->pad_clk = devm_clk_get(phy->u_phy.dev, "utmi-pads"); if (IS_ERR(phy->pad_clk)) { - err = PTR_ERR(phy->pad_clk); + ret = PTR_ERR(phy->pad_clk); dev_err(phy->u_phy.dev, - "Failed to get UTMIP pad clock: %d\n", err); - return err; + "Failed to get UTMIP pad clock: %d\n", ret); + return ret; } - return 0; + phy->pad_rst = devm_reset_control_get_optional_shared( + phy->u_phy.dev, "utmi-pads"); + if (IS_ERR(phy->pad_rst)) { + ret = PTR_ERR(phy->pad_rst); + dev_err(phy->u_phy.dev, + "Failed to get UTMI-pads reset: %d\n", ret); + return ret; + } + + ret = clk_prepare_enable(phy->pad_clk); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to enable UTMI-pads clock: %d\n", ret); + return ret; + } + + spin_lock(&utmip_pad_lock); + + ret = reset_control_deassert(phy->pad_rst); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to initialize UTMI-pads reset: %d\n", ret); + goto unlock; + } + + ret = reset_control_assert(phy->pad_rst); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to assert UTMI-pads reset: %d\n", ret); + goto unlock; + } + + udelay(1); + + ret = reset_control_deassert(phy->pad_rst); + if (ret) + dev_err(phy->u_phy.dev, + "Failed to deassert UTMI-pads reset: %d\n", ret); +unlock: + spin_unlock(&utmip_pad_lock); + + 
clk_disable_unprepare(phy->pad_clk); + + return ret; +} + +static int utmip_pad_close(struct tegra_usb_phy *phy) +{ + int ret; + + ret = clk_prepare_enable(phy->pad_clk); + if (ret) { + dev_err(phy->u_phy.dev, + "Failed to enable UTMI-pads clock: %d\n", ret); + return ret; + } + + ret = reset_control_assert(phy->pad_rst); + if (ret) + dev_err(phy->u_phy.dev, + "Failed to assert UTMI-pads reset: %d\n", ret); + + udelay(1); + + clk_disable_unprepare(phy->pad_clk); + + return ret; } static void utmip_pad_power_on(struct tegra_usb_phy *phy) @@ -700,6 +766,9 @@ static void tegra_usb_phy_close(struct tegra_usb_phy *phy) if (!IS_ERR(phy->vbus)) regulator_disable(phy->vbus); + if (!phy->is_ulpi_phy) + utmip_pad_close(phy); + clk_disable_unprepare(phy->pll_u); } diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h index d641ea1660b7..0c5c3ea8b2d7 100644 --- a/include/linux/usb/tegra_usb_phy.h +++ b/include/linux/usb/tegra_usb_phy.h @@ -17,6 +17,7 @@ #define __TEGRA_USB_PHY_H #include +#include #include /* @@ -76,6 +77,7 @@ struct tegra_usb_phy { bool is_legacy_phy; bool is_ulpi_phy; int reset_gpio; + struct reset_control *pad_rst; }; void tegra_usb_phy_preresume(struct usb_phy *phy); -- cgit v1.2.3 From c5f78b1fe4e5baf4c4ca30377c2d7e06e2e391ec Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 27 Mar 2018 11:48:24 +0800 Subject: serial: Introduce UPSTAT_SYNC_FIFO for synchronised FIFOs This change adds a flag to indicate that a UART has an external means of synchronising its FIFO, without needing CTS/RTS or XON/XOFF. This allows us to use the throttle/unthrottle callbacks, without having to claim other methods of flow control. Signed-off-by: Jeremy Kerr Tested-by: Eddie James Tested-by: Joel Stanley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 4 ++-- include/linux/serial_core.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 0466f9f08a91..c47158c93202 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -674,8 +674,8 @@ static void uart_send_xchar(struct tty_struct *tty, char ch) static void uart_throttle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; + upstat_t mask = UPSTAT_SYNC_FIFO; struct uart_port *port; - upstat_t mask = 0; port = uart_port_ref(state); if (!port) @@ -703,8 +703,8 @@ static void uart_throttle(struct tty_struct *tty) static void uart_unthrottle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; + upstat_t mask = UPSTAT_SYNC_FIFO; struct uart_port *port; - upstat_t mask = 0; port = uart_port_ref(state); if (!port) diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 1d356105f25a..d224961e1346 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -233,6 +233,7 @@ struct uart_port { #define UPSTAT_AUTORTS ((__force upstat_t) (1 << 2)) #define UPSTAT_AUTOCTS ((__force upstat_t) (1 << 3)) #define UPSTAT_AUTOXOFF ((__force upstat_t) (1 << 4)) +#define UPSTAT_SYNC_FIFO ((__force upstat_t) (1 << 5)) int hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ -- cgit v1.2.3 From ebbaf9ab9ebd69f42b286c7a67cc84571c3d947a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 27 Mar 2018 11:48:25 +0800 Subject: serial/8250: export serial8250_read_char Currently, we export serial8250_rx_chars, which does a whole bunch of reads from the 8250 data
register, without any form of flow control between reads. An upcoming change to the aspeed vuart driver implements more fine-grained flow control in the interrupt handler, requiring character-at-a-time control over the rx path. This change exports serial8250_read_char to allow this. Signed-off-by: Jeremy Kerr Tested-by: Eddie James Tested-by: Joel Stanley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 3 ++- include/linux/serial_8250.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 95833cbc4338..8fbd5fbeb318 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1680,7 +1680,7 @@ static void serial8250_enable_ms(struct uart_port *port) serial8250_rpm_put(up); } -static void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr) +void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr) { struct uart_port *port = &up->port; unsigned char ch; @@ -1740,6 +1740,7 @@ static void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr) uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); } +EXPORT_SYMBOL_GPL(serial8250_read_char); /* * serial8250_rx_chars: processes according to the passed in LSR diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index a27ef5f56431..76b9db71e489 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -163,6 +163,7 @@ extern void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl); extern int fsl8250_handle_irq(struct uart_port *port); int serial8250_handle_irq(struct uart_port *port, unsigned int iir); unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr); +void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr); void serial8250_tx_chars(struct uart_8250_port *up); unsigned int serial8250_modem_status(struct uart_8250_port *up); void serial8250_init_port(struct uart_8250_port *up); -- cgit v1.2.3 From fbcf93ebcaef7d09881ee308b52cd84f5e43c622 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 21 Apr 2018 09:48:23 -0700 Subject: bpf: btf: Clean up btf.h in uapi This patch cleans up btf.h in uapi: 1) Rename "name" to "name_off" to better reflect it is an offset to the string section instead of a char array. 2) Remove the unused values BTF_FLAGS_COMPR and BTF_MAGIC_SWAP Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 8 +++----- kernel/bpf/btf.c | 20 ++++++++++---------- tools/include/uapi/linux/btf.h | 8 +++----- tools/lib/bpf/btf.c | 2 +- 4 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 74a30b1090df..bcb56ee47014 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -6,9 +6,7 @@ #include #define BTF_MAGIC 0xeB9F -#define BTF_MAGIC_SWAP 0x9FeB #define BTF_VERSION 1 -#define BTF_FLAGS_COMPR 0x01 struct btf_header { __u16 magic; @@ -43,7 +41,7 @@ struct btf_header { #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) struct btf_type { - __u32 name; + __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused @@ -105,7 +103,7 @@ struct btf_type { * info in "struct btf_type").
*/ struct btf_enum { - __u32 name; + __u32 name_off; __s32 val; }; @@ -122,7 +120,7 @@ struct btf_array { * "struct btf_type"). */ struct btf_member { - __u32 name; + __u32 name_off; __u32 type; __u32 offset; /* offset in bits */ }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index eb56ac760547..22e1046a1a86 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -473,7 +473,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], - btf_name_by_offset(btf, t->name), + btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); if (log_details) @@ -517,7 +517,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, btf_verifier_log_type(env, struct_type, NULL); __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - btf_name_by_offset(btf, member->name), + btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { @@ -1419,10 +1419,10 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); for_each_member(i, t, member) { - if (!btf_name_offset_valid(btf, member->name)) { + if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid member name_offset:%u", - member->name); + member->name_off); return -EINVAL; } @@ -1605,14 +1605,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { - if (!btf_name_offset_valid(btf, enums[i].name)) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", - enums[i].name); + enums[i].name_off); return -EINVAL; } btf_verifier_log(env, "\t%s val=%d\n", - btf_name_by_offset(btf, enums[i].name), + btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -1636,7 +1636,7 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, for (i = 0; i < nr_enums; i++) { if (v == enums[i].val) { seq_printf(m, "%s", - btf_name_by_offset(btf, enums[i].name)); + btf_name_by_offset(btf, enums[i].name_off)); return; } } @@ -1687,9 +1687,9 @@ static s32 btf_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_name_offset_valid(env->btf, t->name)) { + if (!btf_name_offset_valid(env->btf, t->name_off)) { btf_verifier_log(env, "[%u] Invalid name_offset:%u", - env->log_type_id, t->name); + env->log_type_id, t->name_off); return -EINVAL; } diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 74a30b1090df..bcb56ee47014 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -6,9 +6,7 @@ #include #define BTF_MAGIC 0xeB9F -#define BTF_MAGIC_SWAP 0x9FeB #define BTF_VERSION 1 -#define BTF_FLAGS_COMPR 0x01 struct btf_header { __u16 magic; @@ -43,7 +41,7 @@ struct btf_header { #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) struct btf_type { - __u32 name; + __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused @@ -105,7 +103,7 @@ struct btf_type { * info in "struct btf_type"). */ struct btf_enum { - __u32 name; + __u32 name_off; __s32 val; }; @@ -122,7 +120,7 @@ struct btf_array { * "struct btf_type"). 
*/ struct btf_member { - __u32 name; + __u32 name_off; __u32 type; __u32 offset; /* offset in bits */ }; diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 58b6255abc7a..2bac710e3194 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -281,7 +281,7 @@ int32_t btf__find_by_name(const struct btf *btf, const char *type_name) for (i = 1; i <= btf->nr_types; i++) { const struct btf_type *t = btf->types[i]; - const char *name = btf_name_by_offset(btf, t->name); + const char *name = btf_name_by_offset(btf, t->name_off); if (name && !strcmp(type_name, name)) return i; -- cgit v1.2.3 From c1c734cb1f54b062f7e67ffc9656d82f5b412b9c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 23 Mar 2018 10:58:31 -0700 Subject: serial: core: Make sure compiler barfs for 16-byte earlycon names As part of bringup I ended up wanting to call an earlycon driver by a name that was exactly 16 bytes big, specifically "qcom_geni_serial". Unfortunately, when I tried this I found that things compiled just fine. They just didn't work. Specifically the compiler felt perfectly justified in initting the ".name" field of "struct earlycon_id" with the full 16 bytes and just skipping the '\0'. Needless to say, that behavior didn't seem ideal, but I guess someone must have allowed it for a reason. One way to fix this is to shorten the name field to 15 bytes and then add an extra byte after that nobody touches. This should always be initted to 0 and we're golden. There are, of course, other ways to fix this too. We could audit all the users of the "name" field and make them stop at either null termination or 16 bytes. We could also just make the name field much bigger so that we're not likely to run into this. ...but both seem like we'll just hit the bug again. Signed-off-by: Douglas Anderson Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index d224961e1346..d2a2e4bc2435 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -349,7 +349,8 @@ struct earlycon_device { }; struct earlycon_id { - char name[16]; + char name[15]; + char name_term; /* In case compiler didn't '\0' term name */ char compatible[128]; int (*setup)(struct earlycon_device *, const char *options); } __aligned(32); -- cgit v1.2.3 From 05e6557b8ed833546ee2b66ce6b58fecf09f439e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 29 Mar 2018 15:26:48 +1100 Subject: staging: lustre: add container_of_safe() Lustre has a container_of0() function which is similar to container_of() but passes an IS_ERR_OR_NULL() pointer through unchanged. This could be generally useful: bcache at least has a similar function. Naming is hard, but the precedent set by hlist_entry_safe() suggests a _safe suffix might be most consistent. So add container_of_safe() to kernel.h, and replace all occurrences of container_of0() with one of - list_first_entry, list_next_entry, when that is a better fit, - container_of(), when the pointer is used as a valid pointer in surrounding code, - container_of_safe() when there is no obviously better alternative.
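A small usage sketch for the new helper (the types are made up): unlike container_of(), an ERR_PTR() or NULL input falls straight through instead of being offset into a bogus address.

#include <linux/err.h>
#include <linux/kernel.h>

struct inner {
	int x;
};

struct outer {
	int cookie;
	struct inner member;	/* embedded member we cast out from */
};

static struct outer *to_outer(struct inner *p)
{
	/* NULL and ERR_PTR() inputs are returned unchanged. */
	return container_of_safe(p, struct outer, member);
}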
Signed-off-by: NeilBrown Reviewed-by: James Simmons Signed-off-by: Greg Kroah-Hartman --- drivers/staging/lustre/include/linux/libcfs/libcfs.h | 11 ----------- drivers/staging/lustre/lustre/include/cl_object.h | 10 +++++----- drivers/staging/lustre/lustre/include/lu_object.h | 6 +++--- drivers/staging/lustre/lustre/llite/llite_nfs.c | 2 +- drivers/staging/lustre/lustre/llite/vvp_internal.h | 8 ++++---- drivers/staging/lustre/lustre/lmv/lmv_internal.h | 2 +- drivers/staging/lustre/lustre/lov/lov_cl_internal.h | 18 +++++++++--------- drivers/staging/lustre/lustre/lov/lov_internal.h | 2 +- drivers/staging/lustre/lustre/obdclass/lu_object.c | 8 ++++---- drivers/staging/lustre/lustre/obdecho/echo_client.c | 2 +- drivers/staging/lustre/lustre/osc/osc_cl_internal.h | 10 +++++----- drivers/staging/lustre/lustre/osc/osc_internal.h | 2 +- drivers/staging/lustre/lustre/osc/osc_io.c | 2 +- drivers/staging/lustre/lustre/osc/osc_object.c | 2 +- include/linux/kernel.h | 16 ++++++++++++++++ 15 files changed, 53 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h index 5d10aa32895b..6e7754b2f296 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs.h @@ -143,17 +143,6 @@ int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); int libcfs_ioctl(unsigned long cmd, void __user *arg); -/* container_of depends on "likely" which is defined in libcfs_private.h */ -static inline void *__container_of(void *ptr, unsigned long shift) -{ - if (IS_ERR_OR_NULL(ptr)) - return ptr; - return (char *)ptr - shift; -} - -#define container_of0(ptr, type, member) \ - ((type *)__container_of((void *)(ptr), offsetof(type, member))) - #define _LIBCFS_H extern struct miscdevice libcfs_dev; diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h index 341a145c3331..6f7b991be809 100644 --- a/drivers/staging/lustre/lustre/include/cl_object.h +++ b/drivers/staging/lustre/lustre/include/cl_object.h @@ -1941,7 +1941,7 @@ static inline int lu_device_is_cl(const struct lu_device *d) static inline struct cl_device *lu2cl_dev(const struct lu_device *d) { LASSERT(!d || IS_ERR(d) || lu_device_is_cl(d)); - return container_of0(d, struct cl_device, cd_lu_dev); + return container_of_safe(d, struct cl_device, cd_lu_dev); } static inline struct lu_device *cl2lu_dev(struct cl_device *d) @@ -1952,13 +1952,13 @@ static inline struct lu_device *cl2lu_dev(struct cl_device *d) static inline struct cl_object *lu2cl(const struct lu_object *o) { LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); - return container_of0(o, struct cl_object, co_lu); + return container_of_safe(o, struct cl_object, co_lu); } static inline const struct cl_object_conf * lu2cl_conf(const struct lu_object_conf *conf) { - return container_of0(conf, struct cl_object_conf, coc_lu); + return container_of_safe(conf, struct cl_object_conf, coc_lu); } static inline struct cl_object *cl_object_next(const struct cl_object *obj) @@ -1969,12 +1969,12 @@ static inline struct cl_object *cl_object_next(const struct cl_object *obj) static inline struct cl_device *cl_object_device(const struct cl_object *o) { LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); - return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); + return container_of_safe(o->co_lu.lo_dev, 
struct cl_device, cd_lu_dev); } static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) { - return container_of0(h, struct cl_object_header, coh_lu); + return container_of_safe(h, struct cl_object_header, coh_lu); } static inline struct cl_site *cl_object_site(const struct cl_object *obj) diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h index 35c7b582f36d..c3b0ed518819 100644 --- a/drivers/staging/lustre/lustre/include/lu_object.h +++ b/drivers/staging/lustre/lustre/include/lu_object.h @@ -745,15 +745,15 @@ struct lu_object *lu_object_find_slice(const struct lu_env *env, static inline struct lu_object *lu_object_top(struct lu_object_header *h) { LASSERT(!list_empty(&h->loh_layers)); - return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); + return list_first_entry(&h->loh_layers, struct lu_object, lo_linkage); } /** * Next sub-object in the layering */ -static inline struct lu_object *lu_object_next(const struct lu_object *o) +static inline const struct lu_object *lu_object_next(const struct lu_object *o) { - return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); + return list_next_entry(o, lo_linkage); } /** diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index a6a1d80c711a..14172688d55f 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -223,7 +223,7 @@ static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, /* It is hack to access lde_fid for comparison with lgd_fid. * So the input 'name' must be part of the 'lu_dirent'. */ - struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); + struct lu_dirent *lde = container_of((void*)name, struct lu_dirent, lde_name); struct ll_getname_data *lgd = container_of(ctx, struct ll_getname_data, ctx); struct lu_fid fid; diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h index 02ea5161d635..7d3abb43584a 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h @@ -263,22 +263,22 @@ static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) { - return container_of0(d, struct vvp_device, vdv_cl.cd_lu_dev); + return container_of_safe(d, struct vvp_device, vdv_cl.cd_lu_dev); } static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) { - return container_of0(d, struct vvp_device, vdv_cl); + return container_of_safe(d, struct vvp_device, vdv_cl); } static inline struct vvp_object *cl2vvp(const struct cl_object *obj) { - return container_of0(obj, struct vvp_object, vob_cl); + return container_of_safe(obj, struct vvp_object, vob_cl); } static inline struct vvp_object *lu2vvp(const struct lu_object *obj) { - return container_of0(obj, struct vvp_object, vob_cl.co_lu); + return container_of_safe(obj, struct vvp_object, vob_cl.co_lu); } static inline struct inode *vvp_object_inode(const struct cl_object *obj) diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h index c27c3c32188d..68a99170c424 100644 --- a/drivers/staging/lustre/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustre/lustre/lmv/lmv_internal.h @@ -60,7 +60,7 @@ int lmv_revalidate_slaves(struct obd_export *exp, static inline struct 
obd_device *lmv2obd_dev(struct lmv_obd *lmv) { - return container_of0(lmv, struct obd_device, u.lmv); + return container_of_safe(lmv, struct obd_device, u.lmv); } static inline struct lmv_tgt_desc * diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h index 1185eceaf497..2e9c75ebdda5 100644 --- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -496,7 +496,7 @@ static inline struct lu_device *lov2lu_dev(struct lov_device *lov) static inline struct lov_device *lu2lov_dev(const struct lu_device *d) { LINVRNT(d->ld_type == &lov_device_type); - return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); + return container_of(d, struct lov_device, ld_cl.cd_lu_dev); } static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) @@ -512,13 +512,13 @@ static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) { LINVRNT(d->ld_type == &lovsub_device_type); - return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); + return container_of(d, struct lovsub_device, acid_cl.cd_lu_dev); } static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) { LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); - return container_of0(d, struct lovsub_device, acid_cl); + return container_of(d, struct lovsub_device, acid_cl); } static inline struct lu_object *lov2lu(struct lov_object *lov) @@ -534,13 +534,13 @@ static inline struct cl_object *lov2cl(struct lov_object *lov) static inline struct lov_object *lu2lov(const struct lu_object *obj) { LINVRNT(lov_is_object(obj)); - return container_of0(obj, struct lov_object, lo_cl.co_lu); + return container_of(obj, struct lov_object, lo_cl.co_lu); } static inline struct lov_object *cl2lov(const struct cl_object *obj) { LINVRNT(lov_is_object(&obj->co_lu)); - return container_of0(obj, struct lov_object, lo_cl); + return container_of(obj, struct lov_object, lo_cl); } static inline struct lu_object *lovsub2lu(struct lovsub_object *los) @@ -556,13 +556,13 @@ static inline struct cl_object *lovsub2cl(struct lovsub_object *los) static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) { LINVRNT(lovsub_is_object(&obj->co_lu)); - return container_of0(obj, struct lovsub_object, lso_cl); + return container_of(obj, struct lovsub_object, lso_cl); } static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) { LINVRNT(lovsub_is_object(obj)); - return container_of0(obj, struct lovsub_object, lso_cl.co_lu); + return container_of(obj, struct lovsub_object, lso_cl.co_lu); } static inline struct lovsub_lock * @@ -590,14 +590,14 @@ static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) { LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct lov_page, lps_cl); + return container_of(slice, struct lov_page, lps_cl); } static inline struct lovsub_page * cl2lovsub_page(const struct cl_page_slice *slice) { LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct lovsub_page, lsb_cl); + return container_of(slice, struct lovsub_page, lsb_cl); } static inline struct lov_io *cl2lov_io(const struct lu_env *env, diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h index 
8dde3828f935..47042f27ca90 100644 --- a/drivers/staging/lustre/lustre/lov/lov_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_internal.h @@ -280,7 +280,7 @@ static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) static inline struct obd_device *lov2obd(const struct lov_obd *lov) { - return container_of0(lov, struct obd_device, u.lov); + return container_of_safe(lov, struct obd_device, u.lov); } #endif diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index 3ae16e8501c2..3de7dc0497c4 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -319,7 +319,7 @@ static void lu_object_free(const struct lu_env *env, struct lu_object *o) * lives as long as possible and ->loo_object_free() methods * can look at its contents. */ - o = container_of0(splice.prev, struct lu_object, lo_linkage); + o = container_of(splice.prev, struct lu_object, lo_linkage); list_del_init(&o->lo_linkage); o->lo_ops->loo_object_free(env, o); } @@ -404,8 +404,8 @@ int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, * races due to the reasons described in lu_object_put(). */ while (!list_empty(&dispose)) { - h = container_of0(dispose.next, - struct lu_object_header, loh_lru); + h = container_of(dispose.next, + struct lu_object_header, loh_lru); list_del_init(&h->loh_lru); lu_object_free(env, lu_object_top(h)); lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); @@ -579,7 +579,7 @@ static struct lu_object *htable_lookup(struct lu_site *s, return ERR_PTR(-ENOENT); } - h = container_of0(hnode, struct lu_object_header, loh_hash); + h = container_of(hnode, struct lu_object_header, loh_hash); if (likely(!lu_object_is_dying(h))) { cfs_hash_get(s->ls_obj_hash, hnode); lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c index 99a76db51ae0..767067b61109 100644 --- a/drivers/staging/lustre/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -99,7 +99,7 @@ static int echo_client_cleanup(struct obd_device *obddev); */ static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) { - return container_of0(dev, struct echo_device, ed_cl); + return container_of_safe(dev, struct echo_device, ed_cl); } static inline struct cl_device *echo_dev2cl(struct echo_device *d) diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h index 1449013722f6..dc25dd12d7d5 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h +++ b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h @@ -460,7 +460,7 @@ static inline int osc_is_object(const struct lu_object *obj) static inline struct osc_device *lu2osc_dev(const struct lu_device *d) { LINVRNT(d->ld_type == &osc_device_type); - return container_of0(d, struct osc_device, od_cl.cd_lu_dev); + return container_of(d, struct osc_device, od_cl.cd_lu_dev); } static inline struct obd_export *osc_export(const struct osc_object *obj) @@ -476,7 +476,7 @@ static inline struct client_obd *osc_cli(const struct osc_object *obj) static inline struct osc_object *cl2osc(const struct cl_object *obj) { LINVRNT(osc_is_object(&obj->co_lu)); - return container_of0(obj, struct osc_object, oo_cl); + return container_of(obj, struct osc_object, oo_cl); } static inline struct cl_object *osc2cl(const struct osc_object *obj) @@ -509,12 
+509,12 @@ static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) { LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct osc_page, ops_cl); + return container_of(slice, struct osc_page, ops_cl); } static inline struct osc_page *oap2osc(struct osc_async_page *oap) { - return container_of0(oap, struct osc_page, ops_oap); + return container_of_safe(oap, struct osc_page, ops_oap); } static inline pgoff_t osc_index(struct osc_page *opg) @@ -545,7 +545,7 @@ osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) { LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); - return container_of0(slice, struct osc_lock, ols_cl); + return container_of(slice, struct osc_lock, ols_cl); } static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h index 1c8ba4ad6e7f..fca020568c19 100644 --- a/drivers/staging/lustre/lustre/osc/osc_internal.h +++ b/drivers/staging/lustre/lustre/osc/osc_internal.h @@ -180,7 +180,7 @@ struct osc_device { static inline struct osc_device *obd2osc_dev(const struct obd_device *d) { - return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); + return container_of_safe(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); } extern struct lu_kmem_descr osc_caches[]; diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c index 76743faf3e6d..67734a8ed331 100644 --- a/drivers/staging/lustre/lustre/osc/osc_io.c +++ b/drivers/staging/lustre/lustre/osc/osc_io.c @@ -55,7 +55,7 @@ static struct osc_io *cl2osc_io(const struct lu_env *env, const struct cl_io_slice *slice) { - struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + struct osc_io *oio = container_of_safe(slice, struct osc_io, oi_cl); LINVRNT(oio == osc_env_io(env)); return oio; diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c index 4f81dd16f4f5..84240181c7ea 100644 --- a/drivers/staging/lustre/lustre/osc/osc_object.c +++ b/drivers/staging/lustre/lustre/osc/osc_object.c @@ -58,7 +58,7 @@ static struct lu_object *osc2lu(struct osc_object *osc) static struct osc_object *lu2osc(const struct lu_object *obj) { LINVRNT(osc_is_object(obj)); - return container_of0(obj, struct osc_object, oo_cl.co_lu); + return container_of(obj, struct osc_object, oo_cl.co_lu); } /***************************************************************************** diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 6a1eb0b0aad9..58d6645b1425 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -964,6 +964,22 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) +/** + * container_of_safe - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged. 
+ */ +#define container_of_safe(ptr, type, member) ({ \ + void *__mptr = (void *)(ptr); \ + BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ + !__same_type(*(ptr), void), \ + "pointer type mismatch in container_of()"); \ + IS_ERR_OR_NULL(ptr) ? ERR_CAST(ptr) : \ + ((type *)(__mptr - offsetof(type, member))); }) + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD -- cgit v1.2.3 From af8bb9f89838249872240f258e67774ccbcc5970 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Apr 2018 10:58:09 -0500 Subject: PCI/ACPI: Request LTR control from platform before using it Per the PCI Firmware spec r3.2, sec 4.5, an ACPI-based OS should use _OSC to request control of Latency Tolerance Reporting (LTR) before using it. Request control of LTR, and if the platform does not grant control, don't use it. N.B. If the hardware supports LTR and the ASPM L1.2 substate but the BIOS doesn't support LTR in _OSC, we previously would enable ASPM L1.2. This patch will prevent us from enabling ASPM L1.2 in that case. It does not prevent us from enabling PCI-PM L1.2, since that doesn't depend on LTR. See PCIe r4.0, sec 5.5.1, for the L1 PM substate entry conditions. Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/acpi/pci_root.c | 6 ++++++ drivers/pci/probe.c | 5 +++++ include/linux/acpi.h | 3 ++- include/linux/pci.h | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 0da18bde6a16..2ff0d6702a2e 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -153,6 +153,7 @@ static struct pci_osc_bit_struct pci_osc_control_bit[] = { { OSC_PCI_EXPRESS_PME_CONTROL, "PME" }, { OSC_PCI_EXPRESS_AER_CONTROL, "AER" }, { OSC_PCI_EXPRESS_CAPABILITY_CONTROL, "PCIeCapability" }, + { OSC_PCI_EXPRESS_LTR_CONTROL, "LTR" }, }; static void decode_osc_bits(struct acpi_pci_root *root, char *msg, u32 word, @@ -475,6 +476,9 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm) | OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | OSC_PCI_EXPRESS_PME_CONTROL; + if (IS_ENABLED(CONFIG_PCIEASPM)) + control |= OSC_PCI_EXPRESS_LTR_CONTROL; + if (pci_aer_available()) { if (aer_acpi_firmware_first()) dev_info(&device->dev, @@ -905,6 +909,8 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, host_bridge->native_aer = 0; if (!(root->osc_control_set & OSC_PCI_EXPRESS_PME_CONTROL)) host_bridge->native_pme = 0; + if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL)) + host_bridge->native_ltr = 0; pci_scan_child_bus(bus); pci_set_host_bridge_release(host_bridge, acpi_pci_root_release_info, diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ac91b6fd0bcd..cc1688d75664 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -554,6 +554,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) bridge->native_aer = 1; bridge->native_hotplug = 1; bridge->native_pme = 1; + bridge->native_ltr = 1; return bridge; } @@ -1954,9 +1955,13 @@ static void pci_configure_relaxed_ordering(struct pci_dev *dev) static void pci_configure_ltr(struct pci_dev *dev) { #ifdef CONFIG_PCIEASPM + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); u32 cap; struct pci_dev *bridge; + if (!host->native_ltr) + return; + if (!pci_is_pcie(dev)) return; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 15bfb15c2fa5..49f63c67a9d1 100644 --- a/include/linux/acpi.h +++ 
b/include/linux/acpi.h @@ -506,7 +506,8 @@ extern bool osc_pc_lpi_support_confirmed; #define OSC_PCI_EXPRESS_PME_CONTROL 0x00000004 #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 -#define OSC_PCI_CONTROL_MASKS 0x0000001f +#define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 +#define OSC_PCI_CONTROL_MASKS 0x0000003f #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 diff --git a/include/linux/pci.h b/include/linux/pci.h index 73178a2fcee0..d0149c01996d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -473,6 +473,7 @@ struct pci_host_bridge { unsigned int native_aer:1; /* OS may use PCIe AER */ unsigned int native_hotplug:1; /* OS may use PCIe hotplug */ unsigned int native_pme:1; /* OS may use PCIe PME */ + unsigned int native_ltr:1; /* OS may use PCIe LTR */ /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, const struct resource *res, -- cgit v1.2.3 From 6163849d289be6ff2acd2fb520da303dec3219f0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 20 Apr 2018 23:18:26 +0800 Subject: net: introduce a new tracepoint for tcp_rcv_space_adjust tcp_rcv_space_adjust is called every time data is copied to user space, so introducing a tcp tracepoint for it could show us when the packet is copied to the user. When a tcp packet arrives, tcp_rcv_established() will be called and with the existing tracepoint tcp_probe we could get the time when this packet arrives. Then this packet will be copied to user, and tcp_rcv_space_adjust will be called and with this newly introduced tracepoint we could get the time when this packet is copied to user. With these two tracepoints, we could figure out whether the user program processes this packet immediately or there's latency. Hence in the printk message, sk_cookie is printed as a key to relate tcp_rcv_space_adjust with tcp_probe. Maybe we could export sockfd in this new tracepoint as well, then we could relate this new tracepoint with epoll/read/recv* tracepoints, and finally that could show us the whole lifespan of this packet. But we could also implement that with pid as these functions are executed in process context. Signed-off-by: Yafang Shao Signed-off-by: David S.
From 6163849d289be6ff2acd2fb520da303dec3219f0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 20 Apr 2018 23:18:26 +0800 Subject: net: introduce a new tracepoint for tcp_rcv_space_adjust tcp_rcv_space_adjust is called every time data is copied to user space; introduce a tcp tracepoint for it, which can show us when the packet is copied to user. When a tcp packet arrives, tcp_rcv_established() will be called, and with the existing tracepoint tcp_probe we can get the time when this packet arrives. Then this packet will be copied to user, tcp_rcv_space_adjust will be called, and with this newly introduced tracepoint we can get the time when this packet is copied to user. With these two tracepoints, we can figure out whether the user program processes this packet immediately or with some latency. Hence in the printk message, sk_cookie is printed as a key to relate tcp_rcv_space_adjust with tcp_probe. Maybe we could export sockfd in this new tracepoint as well; then we could relate this new tracepoint with epoll/read/recv* tracepoints, and finally that could show us the whole lifespan of this packet. But we could also implement that with pid, as these functions are executed in process context. Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 30 +++++++++++++++++++++--------- net/ipv4/tcp_input.c | 2 ++ 2 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3dd68029d77a..c1a5284713b8 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -10,6 +10,7 @@ #include #include #include +#include #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ @@ -113,7 +114,7 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, */ DECLARE_EVENT_CLASS(tcp_event_sk, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), TP_ARGS(sk), @@ -125,6 +126,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk, __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) + __field(__u64, sock_cookie) ), TP_fast_assign( @@ -144,24 +146,34 @@ DECLARE_EVENT_CLASS(tcp_event_sk, TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); + + __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, - __entry->saddr_v6, __entry->daddr_v6) + __entry->saddr_v6, __entry->daddr_v6, + __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), + + TP_ARGS(sk) +); + +DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, + + TP_PROTO(struct sock *sk), TP_ARGS(sk) ); @@ -232,6 +244,7 @@ TRACE_EVENT(tcp_probe, __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) + __field(__u64, sock_cookie) ), TP_fast_assign( @@ -256,15 +269,14 @@ TRACE_EVENT(tcp_probe, __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; + __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x " - "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u " - "rcv_wnd=%u", + TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", __entry->saddr, __entry->daddr, __entry->mark, __entry->length, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, - __entry->srtt, __entry->rcv_wnd) + __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #endif /* _TRACE_TCP_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0396fb919b5d..5a17cfc75326 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -582,6 +582,8 @@ void tcp_rcv_space_adjust(struct sock *sk) u32 copied; int time; + trace_tcp_rcv_space_adjust(sk); + tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) -- cgit v1.2.3
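Since the point of the new tracepoint is to pair each tcp_rcv_space_adjust event with the tcp_probe event of the same socket, here is a small offline sketch of that matching. The event struct and sample values are invented for illustration; in practice both would be parsed from the trace buffer, using the sock_cookie=%llx field the two events now share.

#include <stdint.h>
#include <stdio.h>

struct tcp_trace_event {
	uint64_t sock_cookie;	/* correlation key printed by both events */
	uint64_t ts_ns;		/* trace timestamp */
};

int main(void)
{
	struct tcp_trace_event probe  = { .sock_cookie = 0x2a, .ts_ns = 1000000 };
	struct tcp_trace_event adjust = { .sock_cookie = 0x2a, .ts_ns = 1450000 };

	if (probe.sock_cookie == adjust.sock_cookie)
		printf("arrival to copy-to-user: %llu ns\n",
		       (unsigned long long)(adjust.ts_ns - probe.ts_ns));
	return 0;
}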
From b16fb418b1bf2a9f14d5d2a4fe29bde1f5550b37 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 21 Apr 2018 09:41:31 -0700 Subject: net: fib_rules: add extack support Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/fib_rules.h | 3 ++- net/core/fib_rules.c | 55 +++++++++++++++++++++++++++++++++++++------------ net/decnet/dn_rules.c | 7 +++++-- net/ipv4/fib_rules.c | 7 +++++-- net/ipv4/ipmr.c | 3 ++- net/ipv6/fib6_rules.c | 7 +++++-- net/ipv6/ip6mr.c | 3 ++- 7 files changed, 63 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index e5cfcfc7dd93..b473df5b9512 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -75,7 +75,8 @@ struct fib_rules_ops { int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, - struct nlattr **); + struct nlattr **, + struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 67801104e9a8..ebd9351b213a 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -469,14 +469,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || - nla_len(tb[FRA_SRC]) != ops->addr_size) + nla_len(tb[FRA_SRC]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; + } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || - nla_len(tb[FRA_DST]) != ops->addr_size) + nla_len(tb[FRA_DST]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; + } nlrule = kzalloc(ops->rule_size, GFP_KERNEL); if (!nlrule) { @@ -537,6 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); if (nlrule->l3mdev != 1) #endif + NL_SET_ERR_MSG(extack, "Invalid l3mdev"); goto errout_free; } @@ -554,31 +559,41 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { - if (nlrule->action != FR_ACT_GOTO) + if (nlrule->action != FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; + } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ - if (nlrule->target <= nlrule->pref) + if (nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; + } } else if (nlrule->action == FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } - if (nlrule->l3mdev && nlrule->table) + if (nlrule->l3mdev && nlrule->table) { + NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; + } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; + NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || - !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) + !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { + NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; + } } else { nlrule->uid_range = fib_kuid_range_unset; } @@ -589,15 +604,19 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; + } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Invalid dport range"); goto
errout_free; + } } *rule = nlrule; @@ -621,18 +640,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -EINVAL, unresolved = 0; bool user_priority = false; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; + } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; + } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) @@ -644,7 +668,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout_free; } - err = ops->configure(rule, skb, frh, tb); + err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; @@ -723,18 +747,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -EINVAL; bool user_priority = false; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; + } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; + } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index c795c3f509c9..72236695db3d 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { int err = -EINVAL; struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - if (frh->tos) + if (frh->tos) { + NL_SET_ERR_MSG(extack, "Invalid tos value"); goto errout; + } if (rule->table == RT_TABLE_UNSPEC) { if (rule->action == FR_ACT_TO_TBL) { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 737d11bc8838..f8eb78d042a4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; - if (frh->tos & ~IPTOS_TOS_MASK) + if (frh->tos & ~IPTOS_TOS_MASK) { + NL_SET_ERR_MSG(extack, "Invalid tos"); goto errout; + } /* split local/main if they are not already split */ err = fib_unmerge(net); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2fb4de3f7f66..38e092eafc97 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = { }; static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, struct nlattr **tb) + struct fib_rule_hdr *frh, struct nlattr **tb, + struct netlink_ext_ack *extack) { return 0; } diff --git 
a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index df113c7b5fc8..6547fc6491a6 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -245,15 +245,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = { static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { int err = -EINVAL; struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { - if (rule->table == RT6_TABLE_UNSPEC) + if (rule->table == RT6_TABLE_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid table"); goto errout; + } if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 298fd8b6ed17..20a419ee8000 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { }; static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, struct nlattr **tb) + struct fib_rule_hdr *frh, struct nlattr **tb, + struct netlink_ext_ack *extack) { return 0; } -- cgit v1.2.3 From c6849a3ac17e336811f1d5bba991d2a9bdc47af1 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 22 Apr 2018 21:50:04 +0800 Subject: net: init sk_cookie for inet socket With sk_cookie we can identify a socket, which is very helpful for tracing and statistics, e.g. the tcp tracepoints and eBPF. So we'd better init it by default for inet sockets. When using it, we just need to call atomic64_read(&sk->sk_cookie). Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 9 +++++++++ net/ipv4/tcp_input.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 15fe980a27ea..5c916e6dff36 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -25,6 +25,15 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +static inline +void sock_init_cookie(struct sock *sk) +{ + u64 res; + + res = atomic64_inc_return(&sock_net(sk)->cookie_gen); + atomic64_set(&sk->sk_cookie, res); +} + u64 sock_gen_cookie(struct sock *sk); int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5a17cfc75326..17b78582ba63 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include #include #include +#include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -6190,10 +6191,15 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; #endif - atomic64_set(&ireq->ir_cookie, 0); ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); ireq->ireq_family = sk_listener->sk_family; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + BUILD_BUG_ON(offsetof(struct inet_request_sock, ireq_net) != + offsetof(struct sock, sk_net)); + sock_init_cookie((struct sock *)ireq); } return req; -- cgit v1.2.3
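A quick userspace model of the sock_init_cookie() helper added above: every new inet socket draws the next value from a per-namespace counter, so a consumer can later read a stable, non-zero identifier via atomic64_read(&sk->sk_cookie). The atomic below stands in for net->cookie_gen; this is a sketch of the allocation scheme, not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong cookie_gen;	/* stands in for sock_net(sk)->cookie_gen */

/* Models sock_init_cookie(): atomic64_inc_return(), then atomic64_set(). */
static unsigned long long init_cookie_model(void)
{
	return atomic_fetch_add(&cookie_gen, 1) + 1;
}

int main(void)
{
	printf("first socket:  sk_cookie=%llx\n", init_cookie_model());
	printf("second socket: sk_cookie=%llx\n", init_cookie_model());
	return 0;
}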
From 1ac4329a1cff2e0bb12b71c13ad53a0e05bc87a6 Mon Sep 17 00:00:00 2001 From: Denis Bolotin Date: Mon, 23 Apr 2018 14:56:05 +0300 Subject: qed: Add configuration information to register dump and debug data Configuration information is added to the debug data collection, in addition to the register dump. Add qed_dbg_nvm_image(), which receives an image type, allocates a buffer and reads the image. The images are saved in the buffers and the dump size is updated. Signed-off-by: Denis Bolotin Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 113 +++++++++++++++++++++++++++- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 14 +++- drivers/net/ethernet/qlogic/qed/qed_mcp.h | 14 ++++ include/linux/qed/qed_if.h | 3 + 4 files changed, 139 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 4926c5532fba..b3211c7d38c2 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -7778,6 +7778,57 @@ int qed_dbg_igu_fifo_size(struct qed_dev *cdev) return qed_dbg_feature_size(cdev, DBG_FEATURE_IGU_FIFO); } +int qed_dbg_nvm_image_length(struct qed_hwfn *p_hwfn, + enum qed_nvm_images image_id, u32 *length) +{ + struct qed_nvm_image_att image_att; + int rc; + + *length = 0; + rc = qed_mcp_get_nvm_image_att(p_hwfn, image_id, &image_att); + if (rc) + return rc; + + *length = image_att.length; + + return rc; +} + +int qed_dbg_nvm_image(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes, enum qed_nvm_images image_id) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + u32 len_rounded, i; + __be32 val; + int rc; + + *num_dumped_bytes = 0; + rc = qed_dbg_nvm_image_length(p_hwfn, image_id, &len_rounded); + if (rc) + return rc; + + DP_NOTICE(p_hwfn->cdev, + "Collecting a debug feature [\"nvram image %d\"]\n", + image_id); + + len_rounded = roundup(len_rounded, sizeof(u32)); + rc = qed_mcp_get_nvm_image(p_hwfn, image_id, buffer, len_rounded); + if (rc) + return rc; + + /* QED_NVM_IMAGE_NVM_META image is not swapped like other images */ + if (image_id != QED_NVM_IMAGE_NVM_META) + for (i = 0; i < len_rounded; i += 4) { + val = cpu_to_be32(*(u32 *)(buffer + i)); + *(u32 *)(buffer + i) = val; + } + + *num_dumped_bytes = len_rounded; + + return rc; +} + int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) { @@ -7831,6 +7882,9 @@ enum debug_print_features { IGU_FIFO = 6, PHY = 7, FW_ASSERTS = 8, + NVM_CFG1 = 9, + DEFAULT_CFG = 10, + NVM_META = 11, }; static u32 qed_calc_regdump_header(enum debug_print_features feature, @@ -7965,13 +8019,61 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) DP_ERR(cdev, "qed_dbg_mcp_trace failed.
rc = %d\n", rc); } + /* nvm cfg1 */ + rc = qed_dbg_nvm_image(cdev, + (u8 *)buffer + offset + REGDUMP_HEADER_SIZE, + &feature_size, QED_NVM_IMAGE_NVM_CFG1); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(NVM_CFG1, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else if (rc != -ENOENT) { + DP_ERR(cdev, + "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n", + QED_NVM_IMAGE_NVM_CFG1, "QED_NVM_IMAGE_NVM_CFG1", rc); + } + + /* nvm default */ + rc = qed_dbg_nvm_image(cdev, + (u8 *)buffer + offset + REGDUMP_HEADER_SIZE, + &feature_size, QED_NVM_IMAGE_DEFAULT_CFG); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(DEFAULT_CFG, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else if (rc != -ENOENT) { + DP_ERR(cdev, + "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n", + QED_NVM_IMAGE_DEFAULT_CFG, "QED_NVM_IMAGE_DEFAULT_CFG", + rc); + } + + /* nvm meta */ + rc = qed_dbg_nvm_image(cdev, + (u8 *)buffer + offset + REGDUMP_HEADER_SIZE, + &feature_size, QED_NVM_IMAGE_NVM_META); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(NVM_META, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else if (rc != -ENOENT) { + DP_ERR(cdev, + "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n", + QED_NVM_IMAGE_NVM_META, "QED_NVM_IMAGE_NVM_META", rc); + } + return 0; } int qed_dbg_all_data_size(struct qed_dev *cdev) { + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + u32 regs_len = 0, image_len = 0; u8 cur_engine, org_engine; - u32 regs_len = 0; org_engine = qed_get_debug_engine(cdev); for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { @@ -7993,6 +8095,15 @@ int qed_dbg_all_data_size(struct qed_dev *cdev) /* Engine common */ regs_len += REGDUMP_HEADER_SIZE + qed_dbg_mcp_trace_size(cdev); + qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_NVM_CFG1, &image_len); + if (image_len) + regs_len += REGDUMP_HEADER_SIZE + image_len; + qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_DEFAULT_CFG, &image_len); + if (image_len) + regs_len += REGDUMP_HEADER_SIZE + image_len; + qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_NVM_META, &image_len); + if (image_len) + regs_len += REGDUMP_HEADER_SIZE + image_len; return regs_len; } diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 1377ad172fb3..0550f0ee11b3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -2529,7 +2529,7 @@ err0: return rc; } -static int +int qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, enum qed_nvm_images image_id, struct qed_nvm_image_att *p_image_att) @@ -2545,6 +2545,15 @@ qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, case QED_NVM_IMAGE_FCOE_CFG: type = NVM_TYPE_FCOE_CFG; break; + case QED_NVM_IMAGE_NVM_CFG1: + type = NVM_TYPE_NVM_CFG1; + break; + case QED_NVM_IMAGE_DEFAULT_CFG: + type = NVM_TYPE_DEFAULT_CFG; + break; + case QED_NVM_IMAGE_NVM_META: + type = NVM_TYPE_META; + break; default: DP_NOTICE(p_hwfn, "Unknown request of image_id %08x\n", image_id); @@ -2588,9 +2597,6 @@ int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn, return -EINVAL; } - /* Each NVM image is suffixed by CRC; Upper-layer has no need for it */ - image_att.length -= 4; - if (image_att.length > buffer_len) { DP_VERBOSE(p_hwfn, QED_MSG_STORAGE, diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h 
b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index dd62c38b3bd3..3af3896420b9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -482,6 +482,20 @@ struct qed_nvm_image_att { u32 length; }; +/** + * @brief Allows getting the attributes of an nvram image + * + * @param p_hwfn + * @param image_id - image to get attributes for + * @param p_image_att - image attributes structure into which to fill data + * + * @return int - 0 - operation was successful. + */ +int +qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, + enum qed_nvm_images image_id, + struct qed_nvm_image_att *p_image_att); + /** * @brief Allows reading a whole nvram image * diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b5b2bc9eacca..e53f9c7c2809 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -159,6 +159,9 @@ struct qed_dcbx_get { enum qed_nvm_images { QED_NVM_IMAGE_ISCSI_CFG, QED_NVM_IMAGE_FCOE_CFG, + QED_NVM_IMAGE_NVM_CFG1, + QED_NVM_IMAGE_DEFAULT_CFG, + QED_NVM_IMAGE_NVM_META, }; struct qed_link_eee_params { -- cgit v1.2.3 From 2ff0dab80a8999e99a93fd70f8d701ec3deab207 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 18 Apr 2018 15:18:02 +0200 Subject: mfd: bd9571mwv: Add DDR Backup Power register bit definitions Add definitions for the KEEPON_* bits in the "BKUP Mode Cnt" register, which control the DDR rails to be kept powered when backup mode is enabled. Signed-off-by: Geert Uytterhoeven Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/mfd/bd9571mwv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/bd9571mwv.h b/include/linux/mfd/bd9571mwv.h index f0708ba4cbba..eb05569f752b 100644 --- a/include/linux/mfd/bd9571mwv.h +++ b/include/linux/mfd/bd9571mwv.h @@ -33,6 +33,11 @@ #define BD9571MWV_I2C_MD2_E1_BIT_2 0x12 #define BD9571MWV_BKUP_MODE_CNT 0x20 +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_MASK GENMASK(3, 0) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0 BIT(0) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR1 BIT(1) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0C BIT(2) +#define BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR1C BIT(3) #define BD9571MWV_BKUP_MODE_STATUS 0x21 #define BD9571MWV_BKUP_RECOVERY_CNT 0x22 #define BD9571MWV_BKUP_CTRL_TIM_CNT 0x23 -- cgit v1.2.3
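As a usage illustration for the new KEEPON definitions, a hypothetical consumer could keep the DDR0 rails powered across backup mode as sketched below (kernel context assumed). Only the register offset and bit masks come from the patch; the function name, its caller-provided regmap handle, and the choice of rails are assumptions.

#include <linux/mfd/bd9571mwv.h>
#include <linux/regmap.h>

/* Keep DDR0/DDR0C powered in backup mode; clear the DDR1 KEEPON bits. */
static int example_keep_ddr0_powered(struct regmap *regmap)
{
	return regmap_update_bits(regmap, BD9571MWV_BKUP_MODE_CNT,
				  BD9571MWV_BKUP_MODE_CNT_KEEPON_MASK,
				  BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0 |
				  BD9571MWV_BKUP_MODE_CNT_KEEPON_DDR0C);
}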
From a2b2eff6ac2716f499defa590a6ec4ba379d765e Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Fri, 13 Apr 2018 14:34:31 -0400 Subject: media: v4l: fwnode: Fix comment incorrectly mentioning v4l2_fwnode_parse_endpoint It's called v4l2_fwnode_endpoint_parse, not v4l2_fwnode_parse_endpoint. Signed-off-by: Leonard Crestez Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-fwnode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index c228ec1c77cf..9cccab618b98 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -99,7 +99,7 @@ struct v4l2_fwnode_endpoint { struct fwnode_endpoint base; /* * Fields below this line will be zeroed by - * v4l2_fwnode_parse_endpoint() + * v4l2_fwnode_endpoint_parse() */ enum v4l2_mbus_type bus_type; union { -- cgit v1.2.3 From 672e314b21dc614894e69bb56a2b55cc7d256810 Mon Sep 17 00:00:00 2001 From: Matt Atwood Date: Mon, 23 Apr 2018 15:28:03 -0700 Subject: drm/i915/kbl: Add KBL GT2 sku Add a missing GT2 SKU discovered on hardware. Signed-off-by: Matt Atwood Reviewed-by: Clint Taylor Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/1524522483-19987-1-git-send-email-matthew.s.atwood@intel.com --- include/drm/i915_pciids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 70f0c2535b87..bab70ff6e78b 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -349,6 +349,7 @@ #define INTEL_KBL_GT2_IDS(info) \ INTEL_VGA_DEVICE(0x5916, info), /* ULT GT2 */ \ INTEL_VGA_DEVICE(0x5917, info), /* Mobile GT2 */ \ + INTEL_VGA_DEVICE(0x591C, info), /* ULX GT2 */ \ INTEL_VGA_DEVICE(0x5921, info), /* ULT GT2F */ \ INTEL_VGA_DEVICE(0x591E, info), /* ULX GT2 */ \ INTEL_VGA_DEVICE(0x5912, info), /* DT GT2 */ \ -- cgit v1.2.3 From 7bb3bb4d56d8f3e0b29b8e4a70f2ab7a8e04a935 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 23 Apr 2018 12:49:58 +0200 Subject: drm/bridge: analogix_dp: Split the platform-specific poweron in two parts Some of the platform-specific stuff in rockchip_dp_poweron() needs to happen before the generic code. Some needs to happen after. Let's split the callback in two. Specifically, we can't start doing PSR work until _after_ the whole controller is up, so don't enable PSR until the end. Cc: Kristian H. Kristensen Signed-off-by: Douglas Anderson [seanpaul added exynos change] Signed-off-by: Sean Paul Signed-off-by: Thierry Escande Reviewed-by: Andrzej Hajda Signed-off-by: Enric Balletbo i Serra Tested-by: Marek Szyprowski Reviewed-by: Archit Taneja Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/20180423105003.9004-23-enric.balletbo@collabora.com --- drivers/gpu/drm/bridge/analogix/analogix_dp_core.c | 7 +++++-- drivers/gpu/drm/exynos/exynos_dp.c | 2 +- drivers/gpu/drm/rockchip/analogix_dp-rockchip.c | 12 ++++++++++-- include/drm/bridge/analogix_dp.h | 3 ++- 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c index a260de4f0bd8..2bcbfadb6ac5 100644 --- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c +++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c @@ -1260,8 +1260,8 @@ static int analogix_dp_set_bridge(struct analogix_dp_device *dp) goto out_dp_clk_pre; } - if (dp->plat_data->power_on) - dp->plat_data->power_on(dp->plat_data); + if (dp->plat_data->power_on_start) + dp->plat_data->power_on_start(dp->plat_data); phy_power_on(dp->phy); @@ -1286,6 +1286,9 @@ static int analogix_dp_set_bridge(struct analogix_dp_device *dp) goto out_dp_init; } + if (dp->plat_data->power_on_end) + dp->plat_data->power_on_end(dp->plat_data); + enable_irq(dp->irq); return 0; diff --git a/drivers/gpu/drm/exynos/exynos_dp.c b/drivers/gpu/drm/exynos/exynos_dp.c index 964831dab102..86330f396784 100644 --- a/drivers/gpu/drm/exynos/exynos_dp.c +++ b/drivers/gpu/drm/exynos/exynos_dp.c @@ -162,7 +162,7 @@ static int exynos_dp_bind(struct device *dev, struct device *master, void *data) dp->drm_dev = drm_dev; dp->plat_data.dev_type = EXYNOS_DP; - dp->plat_data.power_on = exynos_dp_poweron; + dp->plat_data.power_on_start = exynos_dp_poweron; dp->plat_data.power_off = exynos_dp_poweroff; dp->plat_data.attach = exynos_dp_bridge_attach; dp->plat_data.get_modes = exynos_dp_get_modes; diff --git a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c index b3f46ed24cdc..23317a2269e1 100644 ---
a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c +++ b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c @@ -109,7 +109,7 @@ static int rockchip_dp_pre_init(struct rockchip_dp_device *dp) return 0; } -static int rockchip_dp_poweron(struct analogix_dp_plat_data *plat_data) +static int rockchip_dp_poweron_start(struct analogix_dp_plat_data *plat_data) { struct rockchip_dp_device *dp = to_dp(plat_data); int ret; @@ -127,6 +127,13 @@ static int rockchip_dp_poweron(struct analogix_dp_plat_data *plat_data) return ret; } + return ret; +} + +static int rockchip_dp_poweron_end(struct analogix_dp_plat_data *plat_data) +{ + struct rockchip_dp_device *dp = to_dp(plat_data); + return rockchip_drm_psr_activate(&dp->encoder); } @@ -330,7 +337,8 @@ static int rockchip_dp_bind(struct device *dev, struct device *master, dp->plat_data.encoder = &dp->encoder; dp->plat_data.dev_type = dp->data->chip_type; - dp->plat_data.power_on = rockchip_dp_poweron; + dp->plat_data.power_on_start = rockchip_dp_poweron_start; + dp->plat_data.power_on_end = rockchip_dp_poweron_end; dp->plat_data.power_off = rockchip_dp_powerdown; dp->plat_data.get_modes = rockchip_dp_get_modes; diff --git a/include/drm/bridge/analogix_dp.h b/include/drm/bridge/analogix_dp.h index e9a1116d2f8e..475b706b49de 100644 --- a/include/drm/bridge/analogix_dp.h +++ b/include/drm/bridge/analogix_dp.h @@ -33,7 +33,8 @@ struct analogix_dp_plat_data { struct drm_connector *connector; bool skip_connector; - int (*power_on)(struct analogix_dp_plat_data *); + int (*power_on_start)(struct analogix_dp_plat_data *); + int (*power_on_end)(struct analogix_dp_plat_data *); int (*power_off)(struct analogix_dp_plat_data *); int (*attach)(struct analogix_dp_plat_data *, struct drm_bridge *, struct drm_connector *); -- cgit v1.2.3 From 9a31fa395c19d5873190bf84c8192f5799861342 Mon Sep 17 00:00:00 2001 From: Takeshi Kihara Date: Fri, 20 Apr 2018 21:27:43 +0900 Subject: clk: renesas: Add r8a77990 CPG Core Clock Definitions This patch adds all R-Car E3 Clock Pulse Generator Core Clock Outputs. Note that internal CPG clocks (S0, S1, S2, S3, SDSRC, POST3) are not included, as they are used as internal clock sources only, and never referenced from DT. Signed-off-by: Takeshi Kihara [shimoda: add SPDX-License-Identifier] Signed-off-by: Yoshihiro Shimoda Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/r8a77990-cpg-mssr.h | 62 +++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/dt-bindings/clock/r8a77990-cpg-mssr.h (limited to 'include') diff --git a/include/dt-bindings/clock/r8a77990-cpg-mssr.h b/include/dt-bindings/clock/r8a77990-cpg-mssr.h new file mode 100644 index 000000000000..a596a482f3a9 --- /dev/null +++ b/include/dt-bindings/clock/r8a77990-cpg-mssr.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Renesas Electronics Corp. 
+ */ +#ifndef __DT_BINDINGS_CLOCK_R8A77990_CPG_MSSR_H__ +#define __DT_BINDINGS_CLOCK_R8A77990_CPG_MSSR_H__ + +#include + +/* r8a77990 CPG Core Clocks */ +#define R8A77990_CLK_Z2 0 +#define R8A77990_CLK_ZR 1 +#define R8A77990_CLK_ZG 2 +#define R8A77990_CLK_ZTR 3 +#define R8A77990_CLK_ZT 4 +#define R8A77990_CLK_ZX 5 +#define R8A77990_CLK_S0D1 6 +#define R8A77990_CLK_S0D3 7 +#define R8A77990_CLK_S0D6 8 +#define R8A77990_CLK_S0D12 9 +#define R8A77990_CLK_S0D24 10 +#define R8A77990_CLK_S1D1 11 +#define R8A77990_CLK_S1D2 12 +#define R8A77990_CLK_S1D4 13 +#define R8A77990_CLK_S2D1 14 +#define R8A77990_CLK_S2D2 15 +#define R8A77990_CLK_S2D4 16 +#define R8A77990_CLK_S3D1 17 +#define R8A77990_CLK_S3D2 18 +#define R8A77990_CLK_S3D4 19 +#define R8A77990_CLK_S0D6C 20 +#define R8A77990_CLK_S3D1C 21 +#define R8A77990_CLK_S3D2C 22 +#define R8A77990_CLK_S3D4C 23 +#define R8A77990_CLK_LB 24 +#define R8A77990_CLK_CL 25 +#define R8A77990_CLK_ZB3 26 +#define R8A77990_CLK_ZB3D2 27 +#define R8A77990_CLK_CR 28 +#define R8A77990_CLK_CRD2 29 +#define R8A77990_CLK_SD0H 30 +#define R8A77990_CLK_SD0 31 +#define R8A77990_CLK_SD1H 32 +#define R8A77990_CLK_SD1 33 +#define R8A77990_CLK_SD3H 34 +#define R8A77990_CLK_SD3 35 +#define R8A77990_CLK_RPC 36 +#define R8A77990_CLK_RPCD2 37 +#define R8A77990_CLK_ZA2 38 +#define R8A77990_CLK_ZA8 39 +#define R8A77990_CLK_Z2D 40 +#define R8A77990_CLK_CANFD 41 +#define R8A77990_CLK_MSO 42 +#define R8A77990_CLK_R 43 +#define R8A77990_CLK_OSC 44 +#define R8A77990_CLK_LV0 45 +#define R8A77990_CLK_LV1 46 +#define R8A77990_CLK_CSI0 47 +#define R8A77990_CLK_CP 48 +#define R8A77990_CLK_CPEX 49 + +#endif /* __DT_BINDINGS_CLOCK_R8A77990_CPG_MSSR_H__ */ -- cgit v1.2.3 From a268de77faf6881756b4943b287fd78ec05a7d1e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:17 +0100 Subject: netfilter: nf_flow_table: move init code to nf_flow_table_core.c Reduces duplication of .gc and .params in flowtable type definitions and makes the API clearer Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 +- net/ipv4/netfilter/nf_flow_table_ipv4.c | 3 +- net/ipv6/netfilter/nf_flow_table_ipv6.c | 3 +- net/netfilter/nf_flow_table_core.c | 102 +++++++++++++++++++------------- net/netfilter/nf_flow_table_inet.c | 3 +- net/netfilter/nf_tables_api.c | 22 +++---- 6 files changed, 74 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 76ee5c81b752..f876e32a60b8 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -14,9 +14,8 @@ struct nf_flowtable; struct nf_flowtable_type { struct list_head list; int family; - void (*gc)(struct work_struct *work); + int (*init)(struct nf_flowtable *ft); void (*free)(struct nf_flowtable *ft); - const struct rhashtable_params *params; nf_hookfn *hook; struct module *owner; }; @@ -100,9 +99,8 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, void nf_flow_table_cleanup(struct net *net, struct net_device *dev); +int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void nf_flow_offload_work_gc(struct work_struct *work); -extern const struct rhashtable_params nf_flow_offload_rhash_params; void flow_offload_dead(struct flow_offload *flow); diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index b6e43ff0c7b7..e1e56d7123d2 100644 --- 
a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -7,8 +7,7 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index f1804ce8d561..c511d206bf9b 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -8,8 +8,7 @@ static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 7403a0dfddf7..09d1be669c39 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -116,16 +116,50 @@ void flow_offload_dead(struct flow_offload *flow) } EXPORT_SYMBOL_GPL(flow_offload_dead); +static u32 flow_offload_hash(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple *tuple = data; + + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple_rhash *tuplehash = data; + + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct flow_offload_tuple *tuple = arg->key; + const struct flow_offload_tuple_rhash *x = ptr; + + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + return 1; + + return 0; +} + +static const struct rhashtable_params nf_flow_offload_rhash_params = { + .head_offset = offsetof(struct flow_offload_tuple_rhash, node), + .hashfn = flow_offload_hash, + .obj_hashfn = flow_offload_hash_obj, + .obj_cmpfn = flow_offload_hash_cmp, + .automatic_shrinking = true, +}; + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { flow->timeout = (u32)jiffies; rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -135,10 +169,10 @@ static void flow_offload_del(struct nf_flowtable *flow_table, { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); flow_offload_free(flow); } @@ -148,7 +182,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, - *flow_table->type->params); + nf_flow_offload_rhash_params); } EXPORT_SYMBOL_GPL(flow_offload_lookup); @@ -237,7 +271,7 @@ out: return 1; } -void nf_flow_offload_work_gc(struct work_struct *work) +static void nf_flow_offload_work_gc(struct work_struct *work) { struct 
nf_flowtable *flow_table; @@ -245,42 +279,6 @@ void nf_flow_offload_work_gc(struct work_struct *work) nf_flow_offload_gc_step(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } -EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); - -static u32 flow_offload_hash(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple *tuple = data; - - return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple_rhash *tuplehash = data; - - return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, - const void *ptr) -{ - const struct flow_offload_tuple *tuple = arg->key; - const struct flow_offload_tuple_rhash *x = ptr; - - if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) - return 1; - - return 0; -} - -const struct rhashtable_params nf_flow_offload_rhash_params = { - .head_offset = offsetof(struct flow_offload_tuple_rhash, node), - .hashfn = flow_offload_hash, - .obj_hashfn = flow_offload_hash_obj, - .obj_cmpfn = flow_offload_hash_cmp, - .automatic_shrinking = true, -}; -EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params); static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) @@ -398,6 +396,24 @@ int nf_flow_dnat_port(const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); +int nf_flow_table_init(struct nf_flowtable *flowtable) +{ + int err; + + INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + + err = rhashtable_init(&flowtable->rhashtable, + &nf_flow_offload_rhash_params); + if (err < 0) + return err; + + queue_delayed_work(system_power_efficient_wq, + &flowtable->gc_work, HZ); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_table_init); + static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; @@ -423,8 +439,10 @@ EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { + cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); WARN_ON(!nf_flow_offload_gc_step(flow_table)); + rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 375a1881d93d..99771aa7e7ea 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9134cc429ad4..6cd9955916e5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5150,14 +5150,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, } flowtable->data.type = type; - err = rhashtable_init(&flowtable->data.rhashtable, type->params); + err = type->init(&flowtable->data); if (err < 0) goto err3; err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], flowtable); if (err < 0) - goto err3; + goto err4; for (i = 0; i < flowtable->ops_len; i++) 
{ if (!flowtable->ops[i].dev) @@ -5171,37 +5171,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (flowtable->ops[i].dev == ft->ops[k].dev && flowtable->ops[i].pf == ft->ops[k].pf) { err = -EBUSY; - goto err4; + goto err5; } } } err = nf_register_net_hook(net, &flowtable->ops[i]); if (err < 0) - goto err4; + goto err5; } err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; - - INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc); - queue_delayed_work(system_power_efficient_wq, - &flowtable->data.gc_work, HZ); + goto err6; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err5: +err6: i = flowtable->ops_len; -err4: +err5: for (k = i - 1; k >= 0; k--) { kfree(flowtable->dev_name[k]); nf_unregister_net_hook(net, &flowtable->ops[k]); } kfree(flowtable->ops); +err4: + flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); err2: @@ -5485,11 +5483,9 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - cancel_delayed_work_sync(&flowtable->data.gc_work); kfree(flowtable->ops); kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); - rhashtable_destroy(&flowtable->data.rhashtable); module_put(flowtable->data.type->owner); } -- cgit v1.2.3 From 84453a90252ca0cd7d1bd229199a40c58bfe431e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:19 +0100 Subject: netfilter: nf_flow_table: track flow tables in nf_flow_table directly Avoids having nf_flow_table depend on nftables (useful for future iptables backport work) Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + include/net/netfilter/nf_tables.h | 3 --- net/netfilter/nf_flow_table_core.c | 21 ++++++++++++++++++--- net/netfilter/nf_tables_api.c | 17 ----------------- 4 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f876e32a60b8..ab408adba688 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -21,6 +21,7 @@ struct nf_flowtable_type { }; struct nf_flowtable { + struct list_head list; struct rhashtable rhashtable; const struct nf_flowtable_type *type; struct delayed_work gc_work; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index cd368d1b8cb8..2f2062ae1c45 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1109,9 +1109,6 @@ struct nft_flowtable { struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 09d1be669c39..e761359b56a9 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -18,6 +18,9 @@ struct flow_offload_entry { struct rcu_head rcu_head; }; +static DEFINE_MUTEX(flowtable_lock); +static LIST_HEAD(flowtables); + static void flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, struct nf_flow_route *route, @@ -410,6 +413,10 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) 
queue_delayed_work(system_power_efficient_wq, &flowtable->gc_work, HZ); + mutex_lock(&flowtable_lock); + list_add(&flowtable->list, &flowtables); + mutex_unlock(&flowtable_lock); + return 0; } EXPORT_SYMBOL_GPL(nf_flow_table_init); @@ -425,20 +432,28 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - void *data) + struct net_device *dev) { - nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data); + nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); } void nf_flow_table_cleanup(struct net *net, struct net_device *dev) { - nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev); + struct nf_flowtable *flowtable; + + mutex_lock(&flowtable_lock); + list_for_each_entry(flowtable, &flowtables, list) + nf_flow_table_iterate_cleanup(flowtable, dev); + mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { + mutex_lock(&flowtable_lock); + list_del(&flow_table->list); + mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); WARN_ON(!nf_flow_offload_gc_step(flow_table)); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 517bb93c00fb..16b67f54b3d2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5060,23 +5060,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) return ERR_PTR(-ENOENT); } -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data) -{ - struct nft_flowtable *flowtable; - const struct nft_table *table; - - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - iter(&flowtable->data, data); - } - } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_flow_table_iterate); - static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { -- cgit v1.2.3 From 6bdc3c68d94c5d6adc675ee55361962e9dd2489d Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:20 +0100 Subject: netfilter: nf_flow_table: make flow_offload_dead inline It is too trivial to keep as a separate exported function Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 ++++- net/netfilter/nf_flow_table_core.c | 6 ------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ab408adba688..5aa49524ebef 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -103,7 +103,10 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void flow_offload_dead(struct flow_offload *flow); +static inline void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index e761359b56a9..0d38f20fd226 100644 --- a/net/netfilter/nf_flow_table_core.c +++ 
b/net/netfilter/nf_flow_table_core.c @@ -113,12 +113,6 @@ void flow_offload_free(struct flow_offload *flow) } EXPORT_SYMBOL_GPL(flow_offload_free); -void flow_offload_dead(struct flow_offload *flow) -{ - flow->flags |= FLOW_OFFLOAD_DYING; -} -EXPORT_SYMBOL_GPL(flow_offload_dead); - static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; -- cgit v1.2.3 From 59c466dd68e796f3a7a0709d90c72ce2d84e29c2 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:21 +0100 Subject: netfilter: nf_flow_table: add a new flow state for tearing down offloading On cleanup, this will be treated differently from FLOW_OFFLOAD_DYING: If FLOW_OFFLOAD_DYING is set, the connection is going away, so both the offload state and the connection tracking entry will be deleted. If FLOW_OFFLOAD_TEARDOWN is set, the connection remains alive, but the offload state is torn down. This is useful for cases that require more complex state tracking / timeout handling on TCP, or if the connection has been idle for too long. Support for sending flows back to the slow path will be implemented in a following patch Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_core.c | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 5aa49524ebef..ba9fa4592f2b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -68,6 +68,7 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_SNAT 0x1 #define FLOW_OFFLOAD_DNAT 0x2 #define FLOW_OFFLOAD_DYING 0x4 +#define FLOW_OFFLOAD_TEARDOWN 0x8 struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; @@ -103,6 +104,7 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); +void flow_offload_teardown(struct flow_offload *flow); static inline void flow_offload_dead(struct flow_offload *flow) { flow->flags |= FLOW_OFFLOAD_DYING; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 0d38f20fd226..5a81e4f771e9 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -174,6 +174,12 @@ static void flow_offload_del(struct nf_flowtable *flow_table, flow_offload_free(flow); } +void flow_offload_teardown(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_TEARDOWN; +} +EXPORT_SYMBOL_GPL(flow_offload_teardown); + struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) @@ -226,11 +232,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) return (__s32)(flow->timeout - (u32)jiffies) <= 0; } -static inline bool nf_flow_is_dying(const struct flow_offload *flow) -{ - return flow->flags & FLOW_OFFLOAD_DYING; -} - static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) { struct flow_offload_tuple_rhash *tuplehash; @@ -258,7 +259,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); if (nf_flow_has_expired(flow) || - nf_flow_is_dying(flow)) + (flow->flags & (FLOW_OFFLOAD_DYING | + FLOW_OFFLOAD_TEARDOWN))) flow_offload_del(flow_table, flow); } out: @@ -419,10 
+421,14 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) + if (!dev) { + flow_offload_teardown(flow); return; + } - flow_offload_dead(flow); + if (flow->tuplehash[0].tuple.iifidx == dev->ifindex || + flow->tuplehash[1].tuple.iifidx == dev->ifindex) + flow_offload_dead(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, -- cgit v1.2.3
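To make the new state split concrete, a hedged sketch of how callers are expected to pick a flag (kernel context assumed; both helper functions here are hypothetical): teardown keeps the conntrack entry alive and merely unoffloads, while dying removes the offload state together with the dying connection.

/* TCP grew state too complex to keep offloaded (or idled too long):
 * hand the flow back to the slow path; the connection stays alive. */
static void example_unoffload_flow(struct flow_offload *flow)
{
	flow_offload_teardown(flow);
}

/* The connection itself is going away: GC will delete the offload
 * state and the conntrack entry. */
static void example_kill_flow(struct flow_offload *flow)
{
	flow_offload_dead(flow);
}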
From cac20fcdf146b82d02e412d7a345f5826279cd82 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Mar 2018 12:06:51 +0200 Subject: netfilter: nf_tables: simplify lookup functions Replace the nf_tables_ prefix by nft_ and merge code into a single lookup function whenever possible. In many cases the nf_tables_-prefixed function names went over the 80-character boundary; this change saves us ~50 LoC. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 12 +- net/netfilter/nf_tables_api.c | 249 +++++++++++++++----------------- net/netfilter/nft_flow_offload.c | 5 +- net/netfilter/nft_objref.c | 4 +- 4 files changed, 110 insertions(+), 160 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2f2062ae1c45..123e82a2f8bb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1015,9 +1015,9 @@ static inline void *nft_obj_data(const struct nft_object *obj) #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, u32 objtype, - u8 genmask); +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask); void nft_obj_notify(struct net *net, struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, @@ -1106,9 +1106,9 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask); +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 16b67f54b3d2..f65e650b61aa 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -386,13 +386,17 @@ static struct nft_table *nft_table_lookup(const struct net *net, { struct nft_table *table; + if (nla == NULL) + return ERR_PTR(-EINVAL); + list_for_each_entry(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) return table; } - return NULL; + + return ERR_PTR(-ENOENT); } static struct nft_table *nft_table_lookup_byhandle(const struct net *net, @@ -406,37 +410,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_active_genmask(table, genmask)) return table; } - return NULL; -} - -static struct nft_table *nf_tables_table_lookup(const struct net *net, - const struct nlattr *nla, - u8 family, u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup(net, nla, family, genmask); - if (table != NULL) - return table; - - return ERR_PTR(-ENOENT); -} - -static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net, - const struct nlattr *nla, - u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup_byhandle(net, nla, genmask); - if (table != NULL) - return table; return ERR_PTR(-ENOENT); } @@ -608,8 +581,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -735,7 +707,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, int err; name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(net, name, family, genmask); + table = nft_table_lookup(net, name, family, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -893,12 +865,11 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, return nft_flush(&ctx, family); if (nla[NFTA_TABLE_HANDLE]) - table = nf_tables_table_lookup_byhandle(net, - nla[NFTA_TABLE_HANDLE], - genmask); + table = nft_table_lookup_byhandle(net, nla[NFTA_TABLE_HANDLE], + genmask); else - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -949,8 +920,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type); */ static struct nft_chain * -nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, - u8 genmask) +nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) { struct nft_chain *chain; @@ -963,9 +933,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, return ERR_PTR(-ENOENT); } -static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +static struct nft_chain *nft_chain_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_chain *chain; @@ -1194,12 +1163,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); @@ -1513,8 +1481,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; - chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], - genmask); + chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) return -EEXIST; } @@ -1576,8 +1543,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ?
true : false; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1586,11 +1552,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + chain = nft_chain_lookup_byhandle(table, handle, genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } else { - chain = nf_tables_chain_lookup(table, name, genmask); + chain = nft_chain_lookup(table, name, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) return PTR_ERR(chain); @@ -1647,16 +1613,15 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, u32 use; int err; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); } if (IS_ERR(chain)) return PTR_ERR(chain); @@ -1939,8 +1904,8 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, - u64 handle) +static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, + u64 handle) { struct nft_rule *rule; @@ -1953,13 +1918,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, - const struct nlattr *nla) +static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, + const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -2191,16 +2156,15 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) return PTR_ERR(rule); @@ -2265,18 +2229,17 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nf_tables_rule_lookup(chain, handle); + rule = __nft_rule_lookup(chain, handle); if (IS_ERR(rule)) return PTR_ERR(rule); @@ -2300,7 +2263,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nf_tables_rule_lookup(chain, pos_handle); + old_rule = __nft_rule_lookup(chain, pos_handle); if (IS_ERR(old_rule)) return PTR_ERR(old_rule); } @@ -2435,14 +2398,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); if (nla[NFTA_RULE_CHAIN]) { - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], - genmask); + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } @@ -2451,8 +2412,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nf_tables_rule_lookup(chain, - nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) return PTR_ERR(rule); @@ -2635,8 +2595,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, struct nft_table *table = NULL; if (nla[NFTA_SET_TABLE] != NULL) { - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); } @@ -2645,8 +2605,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, return 0; } -static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2661,14 +2621,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask) { struct nft_set *set; - if (nla == NULL) - return ERR_PTR(-EINVAL); - list_for_each_entry(set, &table->sets, list) { if (be64_to_cpu(nla_get_be64(nla)) == set->handle && nft_active_genmask(set, genmask)) @@ -2677,9 +2635,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla, - u8 genmask) +static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nlattr *nla, u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); @@ -2703,12 +2660,12 @@ struct nft_set 
*nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nf_tables_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; - set = nf_tables_set_lookup_byid(net, nla_set_id, genmask); + set = nft_set_lookup_byid(net, nla_set_id, genmask); } return set; } @@ -2980,7 +2937,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3132,14 +3089,13 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) return PTR_ERR(set); @@ -3262,9 +3218,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return err; if (nla[NFTA_SET_HANDLE]) - set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); + set = nft_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], + genmask); else - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3404,8 +3361,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, int family = nfmsg->nfgen_family; struct nft_table *table; - table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -3741,8 +3698,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3954,8 +3910,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EINVAL; goto err2; } - obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], - set->objtype, genmask); + obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], + set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); goto err2; @@ -4284,8 +4240,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) @@ -4373,9 +4328,9 @@ void nft_unregister_obj(struct nft_object_type *obj_type) } EXPORT_SYMBOL_GPL(nft_unregister_obj); -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask) { struct nft_object *obj; @@ -4387,11 +4342,11 @@ struct 
nft_object *nf_tables_obj_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); +EXPORT_SYMBOL_GPL(nft_obj_lookup); -static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) { struct nft_object *obj; @@ -4535,13 +4490,12 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); if (err != -ENOENT) @@ -4761,13 +4715,12 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) return PTR_ERR(obj); @@ -4817,18 +4770,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); if (nla[NFTA_OBJ_HANDLE]) - obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], - objtype, genmask); + obj = nft_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], + objtype, genmask); else - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], - objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, + genmask); if (IS_ERR(obj)) return PTR_ERR(obj); if (obj->use > 0) @@ -4903,9 +4855,8 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -4916,11 +4867,11 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); +EXPORT_SYMBOL_GPL(nft_flowtable_lookup); static struct nft_flowtable * -nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +nft_flowtable_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -5093,13 +5044,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HOOK]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = 
nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); if (err != -ENOENT) @@ -5210,19 +5161,19 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); if (nla[NFTA_FLOWTABLE_HANDLE]) - flowtable = nf_tables_flowtable_lookup_byhandle(table, - nla[NFTA_FLOWTABLE_HANDLE], - genmask); + flowtable = nft_flowtable_lookup_byhandle(table, + nla[NFTA_FLOWTABLE_HANDLE], + genmask); else - flowtable = nf_tables_flowtable_lookup(table, - nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, + nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (flowtable->use > 0) @@ -5407,13 +5358,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); @@ -6382,8 +6333,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_GOTO: if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; - chain = nf_tables_chain_lookup(ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index b65829b2be22..d6bab8c3cbb0 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nf_tables_flowtable_lookup(ctx->table, - tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 0b02407773ad..cdf348f751ec 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); - obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, - genmask); + obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, + genmask); if (IS_ERR(obj)) return -ENOENT; -- cgit v1.2.3 From 71cc0873e0e0a4c6dca899c42e3ac143f7960d8e Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 3 Apr 2018 23:15:39 +0200 Subject: netfilter: nf_tables: Simplify set backend selection Drop nft_set_type's ability to act as a container of multiple backend implementations it chooses from. Instead, consolidate the whole selection logic in nft_select_set_ops() and the backend-provided estimate() callback. This turns nf_tables_set_types into a list containing all available backends, which is traversed when selecting one that matches the criteria requested by userspace. This change also allows embedding the nft_set_ops structure into nft_set_type and pulling the flags field into the latter, as it is only used during the selection phase. A crucial part of this change is to make sure the new layout respects the hash backend constraints formerly enforced by the nft_hash_select_ops() function: this is achieved by introducing a specific estimate() callback for nft_hash_fast_ops which returns false for key lengths != 4. In turn, nft_hash_estimate() is changed to return false for key lengths == 4 so it won't be chosen by accident. Also, both callbacks must return false for unbounded sets, as their size estimate depends on a known maximum element count. Note that this patch partially reverts commit 4f2921ca21b71 ("netfilter: nf_tables: meter: pick a set backend that supports updates") by making nft_set_ops_candidate() not explicitly look for an update callback but instead making NFT_SET_EVAL a regular backend feature flag which is checked along with the others. This way all feature requirements are checked in one go. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso
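
As a rough standalone illustration of the selection flow described above: the names below (set_type, select_type, the FEAT_* flags, desc) are simplified stand-ins invented for this sketch, not the kernel's types; only the shape of the algorithm -- feature-mask check first, then the type's own estimate() callback accepting or refusing the description, then keeping the candidate with the best estimate -- is meant to mirror what nft_select_set_ops() does after this patch.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FEAT_MAP     (1u << 0)
#define FEAT_TIMEOUT (1u << 1)
#define FEAT_EVAL    (1u << 2)	/* treated as an ordinary feature flag */
#define FEAT_ALL     (FEAT_MAP | FEAT_TIMEOUT | FEAT_EVAL)

struct desc { unsigned int size, klen; };
struct estimate { unsigned int space; };

struct set_type {
	const char *name;
	unsigned int features;
	/* returns false when this backend cannot serve the description */
	bool (*estimate)(const struct desc *d, struct estimate *est);
};

/* "fast" hash: only bounded sets with 4-byte keys */
static bool hash_fast_estimate(const struct desc *d, struct estimate *est)
{
	if (!d->size || d->klen != 4)
		return false;
	est->space = d->size;
	return true;
}

/* generic hash: refuses 4-byte keys so the fast variant wins them */
static bool hash_estimate(const struct desc *d, struct estimate *est)
{
	if (!d->size || d->klen == 4)
		return false;
	est->space = d->size * 2;
	return true;
}

static const struct set_type types[] = {
	{ "hash_fast", FEAT_MAP, hash_fast_estimate },
	{ "hash",      FEAT_MAP, hash_estimate },
};

static const struct set_type *select_type(const struct desc *d,
					  unsigned int flags)
{
	struct estimate est, best = { .space = ~0u };
	const struct set_type *chosen = NULL;
	size_t i;

	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
		/* every requested feature must be offered by the type */
		if ((flags & types[i].features) != (flags & FEAT_ALL))
			continue;
		if (!types[i].estimate(d, &est))
			continue;
		if (est.space < best.space) {
			chosen = &types[i];
			best = est;
		}
	}
	return chosen;
}

int main(void)
{
	struct desc d = { .size = 64, .klen = 4 };
	const struct set_type *t = select_type(&d, FEAT_MAP);

	printf("selected: %s\n", t ? t->name : "(none)");	/* hash_fast */
	return 0;
}

With a bounded 4-byte key only the fast variant passes its estimate(); set desc.size to zero and neither candidate qualifies, matching the unbounded-set constraint noted in the message.
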
--- include/net/netfilter/nf_tables.h | 34 ++++----- net/netfilter/nf_tables_api.c | 25 +++---- net/netfilter/nft_set_bitmap.c | 34 ++++----- net/netfilter/nft_set_hash.c | 153 +++++++++++++++++++++----------- net/netfilter/nft_set_rbtree.c | 36 ++++----- 5 files changed, 139 insertions(+), 143 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 123e82a2f8bb..de77d36e36b3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -275,23 +275,6 @@ struct nft_set_estimate { enum nft_set_class space; }; -/** - * struct nft_set_type - nf_tables set type - * - * @select_ops: function to select nft_set_ops - * @ops: default ops, used when no select_ops functions is present - * @list: used internally - * @owner: module reference - */ -struct nft_set_type { - const struct nft_set_ops *(*select_ops)(const struct nft_ctx *, - const struct nft_set_desc *desc, - u32 flags); - const struct nft_set_ops *ops; - struct list_head list; - struct module *owner; -}; - struct nft_set_ext; struct nft_expr; @@ -310,7 +293,6 @@ struct nft_expr; * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size - * @features: features supported by the implementation */ struct nft_set_ops { bool (*lookup)(const struct net *net, @@ -361,9 +343,23 @@ struct nft_set_ops { void (*destroy)(const struct nft_set *set); unsigned int elemsize; +}; + +/** + * struct nft_set_type - nf_tables set type + * + * @ops: set ops for this type + * @list: used internally + * @owner: module reference + * @features: features supported by the implementation + */ +struct nft_set_type { + const struct nft_set_ops ops; + struct list_head list; + struct module *owner; u32 features; - const struct nft_set_type *type; }; +#define to_set_type(o) container_of(o, struct nft_set_type, ops) int nft_register_set(struct nft_set_type *type); void nft_unregister_set(struct nft_set_type *type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2f14cadd9922..9ce35acf491d 100644 ---
a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2523,14 +2523,12 @@ void nft_unregister_set(struct nft_set_type *type) EXPORT_SYMBOL_GPL(nft_unregister_set); #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ - NFT_SET_TIMEOUT | NFT_SET_OBJECT) + NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ + NFT_SET_EVAL) -static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags) +static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) { - if ((flags & NFT_SET_EVAL) && !ops->update) - return false; - - return (flags & ops->features) == (flags & NFT_SET_FEATURES); + return (flags & type->features) == (flags & NFT_SET_FEATURES); } /* @@ -2567,14 +2565,9 @@ nft_select_set_ops(const struct nft_ctx *ctx, best.space = ~0; list_for_each_entry(type, &nf_tables_set_types, list) { - if (!type->select_ops) - ops = type->ops; - else - ops = type->select_ops(ctx, desc, flags); - if (!ops) - continue; + ops = &type->ops; - if (!nft_set_ops_candidate(ops, flags)) + if (!nft_set_ops_candidate(type, flags)) continue; if (!ops->estimate(desc, flags, &est)) continue; @@ -2605,7 +2598,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!try_module_get(type->owner)) continue; if (bops != NULL) - module_put(bops->type->owner); + module_put(to_set_type(bops)->owner); bops = ops; best = est; @@ -3247,14 +3240,14 @@ err3: err2: kvfree(set); err1: - module_put(ops->type->owner); + module_put(to_set_type(ops)->owner); return err; } static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); - module_put(set->ops->type->owner); + module_put(to_set_type(set->ops)->owner); kfree(set->name); kvfree(set); } diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 45fb2752fb63..d6626e01c7ee 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_bitmap_type; -static struct nft_set_ops nft_bitmap_ops __read_mostly = { - .type = &nft_bitmap_type, - .privsize = nft_bitmap_privsize, - .elemsize = offsetof(struct nft_bitmap_elem, ext), - .estimate = nft_bitmap_estimate, - .init = nft_bitmap_init, - .destroy = nft_bitmap_destroy, - .insert = nft_bitmap_insert, - .remove = nft_bitmap_remove, - .deactivate = nft_bitmap_deactivate, - .flush = nft_bitmap_flush, - .activate = nft_bitmap_activate, - .lookup = nft_bitmap_lookup, - .walk = nft_bitmap_walk, - .get = nft_bitmap_get, -}; - static struct nft_set_type nft_bitmap_type __read_mostly = { - .ops = &nft_bitmap_ops, .owner = THIS_MODULE, + .ops = { + .privsize = nft_bitmap_privsize, + .elemsize = offsetof(struct nft_bitmap_elem, ext), + .estimate = nft_bitmap_estimate, + .init = nft_bitmap_init, + .destroy = nft_bitmap_destroy, + .insert = nft_bitmap_insert, + .remove = nft_bitmap_remove, + .deactivate = nft_bitmap_deactivate, + .flush = nft_bitmap_flush, + .activate = nft_bitmap_activate, + .lookup = nft_bitmap_lookup, + .walk = nft_bitmap_walk, + .get = nft_bitmap_get, + }, }; static int __init nft_bitmap_module_init(void) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index fc9c6d5d64cd..dbf1f4ad077c 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -605,6 +605,12 @@ static void nft_hash_destroy(const struct nft_set *set) static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (!desc->size) + return 
false; + + if (desc->klen == 4) + return false; + est->size = sizeof(struct nft_hash) + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + desc->size * sizeof(struct nft_hash_elem); @@ -614,91 +620,100 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_hash_type; -static struct nft_set_ops nft_rhash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_rhash_privsize, - .elemsize = offsetof(struct nft_rhash_elem, ext), - .estimate = nft_rhash_estimate, - .init = nft_rhash_init, - .destroy = nft_rhash_destroy, - .insert = nft_rhash_insert, - .activate = nft_rhash_activate, - .deactivate = nft_rhash_deactivate, - .flush = nft_rhash_flush, - .remove = nft_rhash_remove, - .lookup = nft_rhash_lookup, - .update = nft_rhash_update, - .walk = nft_rhash_walk, - .get = nft_rhash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, -}; +static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + if (!desc->size) + return false; -static struct nft_set_ops nft_hash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT, -}; + if (desc->klen != 4) + return false; -static struct nft_set_ops nft_hash_fast_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup_fast, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT, -}; - -static const struct nft_set_ops * -nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, - u32 flags) -{ - if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) { - switch (desc->klen) { - case 4: - return &nft_hash_fast_ops; - default: - return &nft_hash_ops; - } - } + est->size = sizeof(struct nft_hash) + + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + desc->size * sizeof(struct nft_hash_elem); + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_N; - return &nft_rhash_ops; + return true; } +static struct nft_set_type nft_rhash_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT | NFT_SET_EVAL, + .ops = { + .privsize = nft_rhash_privsize, + .elemsize = offsetof(struct nft_rhash_elem, ext), + .estimate = nft_rhash_estimate, + .init = nft_rhash_init, + .destroy = nft_rhash_destroy, + .insert = nft_rhash_insert, + .activate = nft_rhash_activate, + .deactivate = nft_rhash_deactivate, + .flush = nft_rhash_flush, + .remove = nft_rhash_remove, + .lookup = nft_rhash_lookup, + .update = nft_rhash_update, + .walk = nft_rhash_walk, + .get = nft_rhash_get, + }, +}; + static struct nft_set_type nft_hash_type __read_mostly = { - .select_ops = nft_hash_select_ops, .owner = THIS_MODULE, + .features = 
NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, +}; + +static struct nft_set_type nft_hash_fast_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_fast_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup_fast, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, }; static int __init nft_hash_module_init(void) { - return nft_register_set(&nft_hash_type); + if (nft_register_set(&nft_hash_fast_type) || + nft_register_set(&nft_hash_type) || + nft_register_set(&nft_rhash_type)) + return 1; + return 0; } static void __exit nft_hash_module_exit(void) { + nft_unregister_set(&nft_rhash_type); nft_unregister_set(&nft_hash_type); + nft_unregister_set(&nft_hash_fast_type); } module_init(nft_hash_module_init); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e6f08bc5f359..22c57d7612c4 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -393,28 +393,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_rbtree_type; -static struct nft_set_ops nft_rbtree_ops __read_mostly = { - .type = &nft_rbtree_type, - .privsize = nft_rbtree_privsize, - .elemsize = offsetof(struct nft_rbtree_elem, ext), - .estimate = nft_rbtree_estimate, - .init = nft_rbtree_init, - .destroy = nft_rbtree_destroy, - .insert = nft_rbtree_insert, - .remove = nft_rbtree_remove, - .deactivate = nft_rbtree_deactivate, - .flush = nft_rbtree_flush, - .activate = nft_rbtree_activate, - .lookup = nft_rbtree_lookup, - .walk = nft_rbtree_walk, - .get = nft_rbtree_get, - .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, -}; - static struct nft_set_type nft_rbtree_type __read_mostly = { - .ops = &nft_rbtree_ops, .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_rbtree_privsize, + .elemsize = offsetof(struct nft_rbtree_elem, ext), + .estimate = nft_rbtree_estimate, + .init = nft_rbtree_init, + .destroy = nft_rbtree_destroy, + .insert = nft_rbtree_insert, + .remove = nft_rbtree_remove, + .deactivate = nft_rbtree_deactivate, + .flush = nft_rbtree_flush, + .activate = nft_rbtree_activate, + .lookup = nft_rbtree_lookup, + .walk = nft_rbtree_walk, + .get = nft_rbtree_get, + }, }; static int __init nft_rbtree_module_init(void) -- cgit v1.2.3 From 2eb0f624b709e78ec8e2f4c3412947703db99301 Mon Sep 17 00:00:00 2001 From: Thierry Du Tre Date: Wed, 4 Apr 2018 15:38:22 +0200 Subject: netfilter: add NAT support for shifted portmap ranges This is a patch proposal to support shifted ranges in portmaps. (i.e. tcp/udp incoming ports 5000-5100 on WAN redirected to LAN 192.168.1.5:2000-2100) Currently DNAT only works for a single port or identical port ranges. (i.e. ports 5000-5100 on the WAN interface redirected to a LAN host while the original destination port is not altered) When different port ranges are configured, either 'random' mode should be used, or else all incoming connections are mapped onto the first port in the redirect range. (in the described example, WAN:5000-5100 will all be mapped to 192.168.1.5:2000) This patch introduces a new mode, indicated by the flag NF_NAT_RANGE_PROTO_OFFSET, which uses a base port value to calculate an offset from the destination port present in the incoming stream. That offset is then applied as an index within the redirect port range (index modulo range width to handle range overflow). In the described example the base port would be 5000. An incoming stream with destination port 5004 would result in an offset value of 4, which means that the NAT'ed stream will be using destination port 2004. Other possibilities include deterministic mapping of larger or multiple ranges to a smaller range: WAN:5000-5999 -> LAN:5000-5099 (maps WAN port 5*xx to port 50xx). This patch does not change any current behavior. It just adds new NAT proto range functionality which must be selected via the specific flag when its use is intended. A patch for iptables (libipt_DNAT.c + libip6t_DNAT.c) will also be proposed which makes this functionality immediately available. Signed-off-by: Thierry Du Tre Signed-off-by: Pablo Neira Ayuso
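
To make the port arithmetic concrete, here is a small self-contained sketch; shifted_map is an invented helper written only for illustration, while in the patch itself the equivalent computation lives in nf_nat_l4proto_unique_tuple() (see the nf_nat_proto_common.c hunk below, where off is derived from the packet's destination port and range->base_proto).

#include <assert.h>
#include <stdio.h>

/* Map an incoming destination port onto a redirect range shifted by the
 * configured base port; the modulo folds overflowing offsets back into
 * the range. Assumes dport >= base, i.e. base is the low end of the
 * matched port range. */
static unsigned short shifted_map(unsigned short dport, unsigned short base,
				  unsigned short min, unsigned short max)
{
	unsigned int range_size = max - min + 1;
	unsigned int off = dport - base;

	return (unsigned short)(min + off % range_size);
}

int main(void)
{
	/* WAN 5000-5100 -> LAN 2000-2100 with base port 5000 */
	assert(shifted_map(5004, 5000, 2000, 2100) == 2004);
	/* WAN 5000-5999 folded onto LAN 5000-5099: 5234 -> 5034 */
	assert(shifted_map(5234, 5000, 5000, 5099) == 5034);
	printf("ok\n");
	return 0;
}

The second assertion shows the overflow case from the message: a 1000-port range folded deterministically onto a 100-port range via the modulo.
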
--- include/net/netfilter/ipv4/nf_nat_masquerade.h | 2 +- include/net/netfilter/ipv6/nf_nat_masquerade.h | 2 +- include/net/netfilter/nf_nat.h | 2 +- include/net/netfilter/nf_nat_l3proto.h | 4 +- include/net/netfilter/nf_nat_l4proto.h | 8 +-- include/net/netfilter/nf_nat_redirect.h | 2 +- include/uapi/linux/netfilter/nf_nat.h | 12 ++++- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- net/ipv4/netfilter/nf_nat_h323.c | 4 +- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 4 +- net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 4 +- net/ipv4/netfilter/nf_nat_pptp.c | 2 +- net/ipv4/netfilter/nf_nat_proto_gre.c | 2 +- net/ipv4/netfilter/nf_nat_proto_icmp.c | 2 +- net/ipv4/netfilter/nft_masq_ipv4.c | 2 +- net/ipv6/netfilter/ip6t_MASQUERADE.c | 2 +- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 4 +- net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 4 +- net/ipv6/netfilter/nf_nat_proto_icmpv6.c | 2 +- net/ipv6/netfilter/nft_masq_ipv6.c | 2 +- net/ipv6/netfilter/nft_redir_ipv6.c | 2 +- net/netfilter/nf_nat_core.c | 27 +++++----- net/netfilter/nf_nat_helper.c | 2 +- net/netfilter/nf_nat_proto_common.c | 9 ++-- net/netfilter/nf_nat_proto_dccp.c | 2 +- net/netfilter/nf_nat_proto_sctp.c | 2 +- net/netfilter/nf_nat_proto_tcp.c | 2 +- net/netfilter/nf_nat_proto_udp.c | 4 +- net/netfilter/nf_nat_proto_unknown.c | 2 +- net/netfilter/nf_nat_redirect.c | 6 +-- net/netfilter/nf_nat_sip.c | 2 +- net/netfilter/nft_nat.c | 2 +- net/netfilter/xt_NETMAP.c | 8 +-- net/netfilter/xt_REDIRECT.c | 2 +- net/netfilter/xt_nat.c | 72 +++++++++++++++++++++++--- net/openvswitch/conntrack.c | 4 +- 36 files changed, 145 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h index ebd869473603..cd24be4c4a99 100644 --- a/include/net/netfilter/ipv4/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -6,7 +6,7 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv4_register_notifier(void); diff --git
a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h index 1ed4f2631ed6..0c3b5ebf0bb8 100644 --- a/include/net/netfilter/ipv6/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -3,7 +3,7 @@ #define _NF_NAT_MASQUERADE_IPV6_H_ unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv6_register_notifier(void); void nf_nat_masquerade_ipv6_unregister_notifier(void); diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 207a467e7ca6..da3d601cadee 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -39,7 +39,7 @@ struct nf_conn_nat { /* Set up the info structure to map into this range. */ unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype); extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index ce7c2b4e64bb..ac47098a61dc 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -7,7 +7,7 @@ struct nf_nat_l3proto { u8 l3proto; bool (*in_range)(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range); + const struct nf_nat_range2 *range); u32 (*secure_port)(const struct nf_conntrack_tuple *t, __be16); @@ -33,7 +33,7 @@ struct nf_nat_l3proto { struct flowi *fl); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; int nf_nat_l3proto_register(const struct nf_nat_l3proto *); diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 67835ff8a2d9..b4d6b29bca62 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -34,12 +34,12 @@ struct nf_nat_l4proto { */ void (*unique_tuple)(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; /* Protocol registration. 
*/ @@ -72,11 +72,11 @@ bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover); int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); #endif /*_NF_NAT_L4PROTO_H*/ diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h index 5ddabb08c472..c129aacc8ae8 100644 --- a/include/net/netfilter/nf_nat_redirect.h +++ b/include/net/netfilter/nf_nat_redirect.h @@ -7,7 +7,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_ipv4_multi_range_compat *mr, unsigned int hooknum); unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum); #endif /* _NF_NAT_REDIRECT_H_ */ diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index a33000da7229..4a95c0db14d4 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -10,6 +10,7 @@ #define NF_NAT_RANGE_PROTO_RANDOM (1 << 2) #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) +#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -17,7 +18,7 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) struct nf_nat_ipv4_range { unsigned int flags; @@ -40,4 +41,13 @@ struct nf_nat_range { union nf_conntrack_man_proto max_proto; }; +struct nf_nat_range2 { + unsigned int flags; + union nf_inet_addr min_addr; + union nf_inet_addr max_addr; + union nf_conntrack_man_proto min_proto; + union nf_conntrack_man_proto max_proto; + union nf_conntrack_man_proto base_proto; +}; + #endif /* _NETFILTER_NF_NAT_H */ diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index a03e4e7ef5f9..ce1512b02cb2 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; mr = par->targinfo; diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index ac8342dcb55e..4e6b53ab6c33 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_q931_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ nf_nat_follow_master(new, this); @@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_callforwarding_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh 
one. */ BUG_ON(new->status & IPS_NAT_DONE_MASK); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index f7ff6a364d7b..4346336cee4c 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, #endif /* CONFIG_XFRM */ static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); @@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index 0c366aad89cb..f538c5001547 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -24,13 +24,13 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 8a69363b4884..5d259a12e25f 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_tuple t = {}; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; - struct nf_nat_range range; + struct nf_nat_range2 range; struct nf_conn_nat *nat; nat = nf_ct_nat_ext_add(ct); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index edf05002d674..00fda6331ce5 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); static void gre_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 7b98baa13ede..6d7cf1d79baf 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, static void icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index f18677277119..f1193e1e928a 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range 
range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 92c0047e7e33..491f808e356a 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 6b7f075f811f..56d75eb5448f 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, #endif static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; @@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 98f61fcb9108..9dfc2b90c362 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -26,14 +26,14 @@ static atomic_t v6_worker_count; unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c index 57593b00c5b4..d9bf42ba44fa 100644 --- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple, static void icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 4146536e9c15..dd0122f3cffe 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index a27e424f690d..74269865acc8 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -22,7 +22,7 
@@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 617693ff9f4c..37b3c9913b08 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple); static int in_range(const struct nf_nat_l3proto *l3proto, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. @@ -194,7 +194,7 @@ find_appropriate_src(struct net *net, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; @@ -224,7 +224,7 @@ find_appropriate_src(struct net *net, static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -298,7 +298,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone, static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -349,9 +349,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { - if (l4proto->in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && + l4proto->in_range(tuple, maniptype, + &range->min_proto, + &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) goto out; @@ -360,7 +361,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, } } - /* Last change: get protocol to try to obtain unique tuple. */ + /* Last chance: get protocol to try to obtain unique tuple. */ l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); out: rcu_read_unlock(); @@ -381,7 +382,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); @@ -459,7 +460,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) (manip == NF_NAT_MANIP_SRC ? 
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); - struct nf_nat_range range = { + struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, @@ -702,7 +703,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, - struct nf_nat_range *range) + struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; const struct nf_nat_l4proto *l4proto; @@ -730,7 +731,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { static int nfnetlink_parse_nat(const struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_conn *ct, struct nf_nat_range2 *range, const struct nf_nat_l3proto *l3proto) { struct nlattr *tb[CTA_NAT_MAX+1]; @@ -758,7 +759,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_l3proto *l3proto; int err; diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 607a373379b4..99606baedda4 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet); void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 7d7466dbf663..5d849d835561 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover) @@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, : tuple->src.u.all); } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { off = prandom_u32(); + } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) { + off = (ntohs(*portptr) - ntohs(range->base_proto.all)); } else { off = *rover; } @@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, *portptr = htons(min + off % range_size); if (++i != range_size && nf_nat_used_tuple(tuple, ct)) continue; - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) + if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL| + NF_NAT_RANGE_PROTO_OFFSET))) *rover = off; return; } @@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index 269fcd5dc34c..67ea0d83aa5a 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover; static void dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type 
maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index c57ee3240b1d..1c5d9b65fbba 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover; static void sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c index 4f8820fc5148..f15fcd475f98 100644 --- a/net/netfilter/nf_nat_proto_tcp.c +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -23,7 +23,7 @@ static u16 tcp_port_rover; static void tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index edd4a77dc09a..5790f70a83b2 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -22,7 +22,7 @@ static u16 udp_port_rover; static void udp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { @@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, static void udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c index 6e494d584412..c5db3e251232 100644 --- a/net/netfilter/nf_nat_proto_unknown.c +++ b/net/netfilter/nf_nat_proto_unknown.c @@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 25b06b959118..7c4bb0a773ca 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -36,7 +36,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_LOCAL_OUT); @@ -82,10 +82,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; struct in6_addr newdst; enum ip_conntrack_info ctinfo; struct nf_conn *ct; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 791fac4fd745..1f3086074981 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, static void 
nf_nat_sip_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 1f36954c2ba9..c15807d10b91 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr, const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c index 58aa9dd3c5b7..1d437875e15a 100644 --- a/net/netfilter/xt_NETMAP.c +++ b/net/netfilter/xt_NETMAP.c @@ -21,8 +21,8 @@ static unsigned int netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; - struct nf_nat_range newrange; + const struct nf_nat_range2 *range = par->targinfo; + struct nf_nat_range2 newrange; struct nf_conn *ct; enum ip_conntrack_info ctinfo; union nf_inet_addr new_addr, netmask; @@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return -EINVAL; @@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && xt_hooknum(par) != NF_INET_POST_ROUTING && diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 98a4c6d4f1cb..5ce9461e979c 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index bdb689cdc829..8af9707f8789 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static void xt_nat_convert_range(struct nf_nat_range *dst, +static void xt_nat_convert_range(struct nf_nat_range2 *dst, const struct nf_nat_ipv4_range *src) { memset(&dst->min_addr, 0, sizeof(dst->min_addr)); memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + memset(&dst->base_proto, 0, sizeof(dst->base_proto)); dst->flags = src->flags; dst->min_addr.ip = src->min_ip; @@ -54,7 +55,7 @@ static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -71,7 +72,7 @@ static unsigned int xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct 
nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) static unsigned int xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); - return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); + + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} + +static unsigned int +xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); + + return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -163,6 +201,28 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, + { + .name = "SNAT", + .revision = 2, + .checkentry = xt_nat_checkentry, + .destroy = xt_nat_destroy, + .target = xt_snat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 2, + .target = xt_dnat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, }; static int __init xt_nat_init(void) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c5904f629091..02fc343feb66 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -72,7 +72,7 @@ struct ovs_conntrack_info { struct md_mark mark; struct md_labels labels; #ifdef CONFIG_NF_NAT_NEEDED - struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ + struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. 
*/
#endif
};

@@ -710,7 +710,7 @@ static bool skb_nfct_cached(struct net *net,
 */
static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
			      enum ip_conntrack_info ctinfo,
-			      const struct nf_nat_range *range,
+			      const struct nf_nat_range2 *range,
			      enum nf_nat_manip_type maniptype)
{
	int hooknum, nh_off, err = NF_ACCEPT;
-- cgit v1.2.3


From cd9a5a15808403a7895c51b1378168d6c75cf8a6 Mon Sep 17 00:00:00 2001
From: Taehee Yoo
Date: Mon, 9 Apr 2018 00:00:57 +0900
Subject: netfilter: ebtables: remove EBT_MATCH and EBT_NOMATCH

EBT_MATCH and EBT_NOMATCH exist only to invert a return value. The match
functions (ebt_xxx.c) return false when the received frame does not match
and true when it does, but EBT_MATCH_ITERATE interprets the values the
opposite way, so EBT_MATCH and EBT_NOMATCH are used to translate between
the two conventions. The '!' operator does the same job more simply.

Signed-off-by: Taehee Yoo
Signed-off-by: Pablo Neira Ayuso
---
 include/linux/netfilter_bridge/ebtables.h | 4 ----
 net/bridge/netfilter/ebtables.c | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 0773b5a032f1..c6935be7c6ca 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -17,10 +17,6 @@
 #include
 #include

-/* return values for match() functions */
-#define EBT_MATCH 0
-#define EBT_NOMATCH 1
-
 struct ebt_match {
 	struct list_head list;
 	const char name[EBT_FUNCTION_MAXNAMELEN];
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 355410b13316..7c07221369c0 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
 {
 	par->match = m->u.match;
 	par->matchinfo = m->data;
-	return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH;
+	return !m->u.match->match(skb, par);
 }

 static inline int
-- cgit v1.2.3


From a1d768f1a00db556e2aae9f92bdb38671e601da5 Mon Sep 17 00:00:00 2001
From: Taehee Yoo
Date: Fri, 13 Apr 2018 23:09:58 +0900
Subject: netfilter: ebtables: add ebt_get_target and ebt_get_target_c

ebt_get_target is similar to {ip/ip6/arp}t_get_target, and
ebt_get_target_c is similar to {ip/ip6/arp}t_get_target_c.
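To illustrate, both helpers just factor out the open-coded pointer
arithmetic that reaches the target record target_offset bytes past an
entry. A condensed before/after sketch of the pattern (distilled from the
conversions in this patch):

	/* before: the cast and offset are spelled out at each call site */
	t = (struct ebt_entry_target *)((char *)e + e->target_offset);

	/* after: one helper hides the arithmetic */
	t = ebt_get_target(e);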
Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 6 ++++++ net/bridge/netfilter/ebtables.c | 22 +++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 0c7dc8315013..3b86c14ea49d 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -191,6 +191,12 @@ struct ebt_entry { unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; +static __inline__ struct ebt_entry_target * +ebt_get_target(struct ebt_entry *e) +{ + return (void *)e + e->target_offset; +} + /* {g,s}etsockopt numbers */ #define EBT_BASE_CTL 128 diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 7c07221369c0..9be240129448 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) return (void *)entry + entry->next_offset; } +static inline const struct ebt_entry_target * +ebt_get_target_c(const struct ebt_entry *e) +{ + return ebt_get_target((struct ebt_entry *)e); +} + /* Do some firewalling */ unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, @@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, */ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); - t = (struct ebt_entry_target *) - (((char *)point) + point->target_offset); + t = ebt_get_target_c(point); /* standard target */ if (!t->u.target->target) verdict = ((struct ebt_standard_target *)t)->verdict; @@ -637,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) return 1; EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); par.net = net; par.target = t->u.target; @@ -716,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); if (ret != 0) goto cleanup_watchers; - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); gap = e->next_offset - e->target_offset; target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0); @@ -789,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack if (pos == nentries) continue; } - t = (struct ebt_entry_target *) - (((char *)e) + e->target_offset); + t = ebt_get_target_c(e); if (strcmp(t->u.name, EBT_STANDARD_TARGET)) goto letscontinue; if (e->target_offset + sizeof(struct ebt_standard_target) > @@ -1396,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, return -EFAULT; hlp = ubase + (((char *)e + e->target_offset) - base); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target_c(e); ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase); if (ret != 0) @@ -1737,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr, return ret; target_offset = e->target_offset - (origsize - *size); - t = (struct ebt_entry_target *) ((char *) e + e->target_offset); + t = ebt_get_target(e); ret = compat_target_to_user(t, dstptr, size); if (ret) @@ -1785,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e, 
EBT_MATCH_ITERATE(e, compat_calc_match, &off);
 EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
-	t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+	t = ebt_get_target_c(e);
 off += xt_compat_target_offset(t->u.target);
 off += ebt_compat_entry_padsize();
-- cgit v1.2.3


From 8e1102d5a1596dca10f51e3de800809944f8816d Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 16 Apr 2018 18:04:49 +0200
Subject: netfilter: nf_tables: support timeouts larger than 23 days

Marco De Benedetto says: I would like to use a timeout of 30 days for
elements in a set, but it seems there is some kind of problem above
24d20h31m23s.

Fix this by using 'jiffies64' for timeout handling to get the same
behaviour on 32 and 64bit systems.

nftables passes timeouts as u64 in milliseconds to the kernel, but on
the kernel side we used a mixture of 'long' and jiffies conversions
rather than u64 and jiffies64.

Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1237
Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 include/net/netfilter/nf_tables.h | 4 ++--
 net/netfilter/nf_tables_api.c | 50 +++++++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index de77d36e36b3..435c9e3b9181 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -585,7 +585,7 @@ static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext)
 	return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT);
 }

-static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext)
+static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext)
 {
 	return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION);
 }
@@ -603,7 +603,7 @@ static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext)
 static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
 {
 	return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) &&
-	       time_is_before_eq_jiffies(*nft_set_ext_expiration(ext));
+	       time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext));
 }

 static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9ce35acf491d..d57aeea89a79 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2779,6 +2779,27 @@ cont:
 	return 0;
 }

+static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+{
+	u64 ms = be64_to_cpu(nla_get_be64(nla));
+	u64 max = (u64)(~((u64)0));
+
+	max = div_u64(max, NSEC_PER_MSEC);
+	if (ms >= max)
+		return -ERANGE;
+
+	ms *= NSEC_PER_MSEC;
+	*result = nsecs_to_jiffies64(ms);
+	return 0;
+}
+
+static u64 nf_jiffies64_to_msecs(u64 input)
+{
+	u64 ms = jiffies64_to_nsecs(input);
+
+	return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+}
+
 static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
			      const struct nft_set *set, u16 event, u16 flags)
 {
@@ -2826,7 +2847,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 	if (set->timeout &&
 	    nla_put_be64(skb, NFTA_SET_TIMEOUT,
-			 cpu_to_be64(jiffies_to_msecs(set->timeout)),
+			 nf_jiffies64_to_msecs(set->timeout),
			 NFTA_SET_PAD))
 		goto nla_put_failure;
 	if (set->gc_int &&
@@ -3122,8 +3143,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_TIMEOUT] != NULL) {
 		if (!(flags & NFT_SET_TIMEOUT))
 			return -EINVAL;
-		timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
-
nla[NFTA_SET_TIMEOUT]))); + + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + if (err) + return err; } gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { @@ -3387,8 +3410,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u64), }, [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(unsigned long), - .align = __alignof__(unsigned long), + .len = sizeof(u64), + .align = __alignof__(u64), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -3481,22 +3504,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(jiffies_to_msecs( - *nft_set_ext_timeout(ext))), + nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), NFTA_SET_ELEM_PAD)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - unsigned long expires, now = jiffies; + u64 expires, now = get_jiffies_64(); expires = *nft_set_ext_expiration(ext); - if (time_before(now, expires)) + if (time_before64(now, expires)) expires -= now; else expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - cpu_to_be64(jiffies_to_msecs(expires)), + nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; } @@ -3871,7 +3893,7 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) *nft_set_ext_expiration(ext) = - jiffies + timeout; + get_jiffies_64() + timeout; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -3958,8 +3980,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - nla[NFTA_SET_ELEM_TIMEOUT]))); + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT], + &timeout); + if (err) + return err; } else if (set->flags & NFT_SET_TIMEOUT) { timeout = set->timeout; } -- cgit v1.2.3 From bd2bbdb497dba24b9ca7f6257c83e496c64b6e9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Apr 2018 19:15:53 +0200 Subject: netfilter: merge meta_bridge into nft_meta It overcomplicates things for no reason. nft_meta_bridge only offers retrieval of bridge port interface name. 
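That one operation is essentially the following condensed sketch
(distilled from the code being removed below; only the NULL check is
kept, everything else is omitted):

	/* resolve the bridge port of the input device, then report the
	 * name of the bridge master it belongs to */
	const struct net_bridge_port *p = br_port_get_rcu(nft_in(pkt));

	if (p)
		strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);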
Because of this being its own module, we had to export all nft_meta functions, which we can then make static again (which even reduces the size of nft_meta -- including bridge port retrieval...): before: text data bss dec hex filename 1838 832 0 2670 a6e net/bridge/netfilter/nft_meta_bridge.ko 6147 936 1 7084 1bac net/netfilter/nft_meta.ko after: 5826 936 1 6763 1a6b net/netfilter/nft_meta.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 44 ----------- net/bridge/netfilter/Kconfig | 7 -- net/bridge/netfilter/Makefile | 1 - net/bridge/netfilter/nft_meta_bridge.c | 135 --------------------------------- net/netfilter/nft_meta.c | 90 ++++++++++++++-------- 5 files changed, 58 insertions(+), 219 deletions(-) delete mode 100644 include/net/netfilter/nft_meta.h delete mode 100644 net/bridge/netfilter/nft_meta_bridge.c (limited to 'include') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h deleted file mode 100644 index 5c69e9b09388..000000000000 --- a/include/net/netfilter/nft_meta.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFT_META_H_ -#define _NFT_META_H_ - -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - -extern const struct nla_policy nft_meta_policy[]; - -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr); - -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); - -#endif diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index f212447794bd..9a0159aebe1a 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE - -config NFT_BRIDGE_META - tristate "Netfilter nf_table bridge meta support" - depends on NFT_META - help - Add support for bridge dedicated meta key. - config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 4bc758dd4a8c..9b868861f21a 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,7 +3,6 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. 
# -obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # packet logging diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c deleted file mode 100644 index bb63c9aed55d..000000000000 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../br_private.h" - -static void nft_meta_bridge_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - const struct nft_meta *priv = nft_expr_priv(expr); - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - u32 *dest = ®s->data[priv->dreg]; - const struct net_bridge_port *p; - - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - if (in == NULL || (p = br_port_get_rcu(in)) == NULL) - goto err; - break; - case NFT_META_BRI_OIFNAME: - if (out == NULL || (p = br_port_get_rcu(out)) == NULL) - goto err; - break; - default: - goto out; - } - - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; -out: - return nft_meta_get_eval(expr, regs, pkt); -err: - regs->verdict.code = NFT_BREAK; -} - -static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_meta *priv = nft_expr_priv(expr); - unsigned int len; - - priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - case NFT_META_BRI_OIFNAME: - len = IFNAMSIZ; - break; - default: - return nft_meta_get_init(ctx, expr, tb); - } - - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); -} - -static struct nft_expr_type nft_meta_bridge_type; -static const struct nft_expr_ops nft_meta_bridge_get_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_bridge_get_eval, - .init = nft_meta_bridge_get_init, - .dump = nft_meta_get_dump, -}; - -static const struct nft_expr_ops nft_meta_bridge_set_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_set_eval, - .init = nft_meta_set_init, - .destroy = nft_meta_set_destroy, - .dump = nft_meta_set_dump, - .validate = nft_meta_set_validate, -}; - -static const struct nft_expr_ops * -nft_meta_bridge_select_ops(const struct nft_ctx *ctx, - const struct nlattr * const tb[]) -{ - if (tb[NFTA_META_KEY] == NULL) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG]) - return &nft_meta_bridge_get_ops; - - if (tb[NFTA_META_SREG]) - return &nft_meta_bridge_set_ops; - - return ERR_PTR(-EINVAL); -} - -static struct nft_expr_type nft_meta_bridge_type __read_mostly = { - .family = NFPROTO_BRIDGE, - .name = "meta", - .select_ops = nft_meta_bridge_select_ops, - .policy = nft_meta_policy, - .maxattr = NFTA_META_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_meta_bridge_module_init(void) -{ - return nft_register_expr(&nft_meta_bridge_type); -} - -static void __exit nft_meta_bridge_module_exit(void) -{ - nft_unregister_expr(&nft_meta_bridge_type); -} - 
-module_init(nft_meta_bridge_module_init); -module_exit(nft_meta_bridge_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Tomasz Bursztyka "); -MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8fb91940e2e7..6c0b82628117 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2008-2009 Patrick McHardy + * Copyright (c) 2014 Intel Corporation + * Author: Tomasz Bursztyka * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -24,21 +26,35 @@ #include /* for TCP_TIME_WAIT */ #include #include -#include #include /* NF_BR_PRE_ROUTING */ +struct nft_meta { + enum nft_meta_keys key:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +#ifdef CONFIG_NF_TABLES_BRIDGE +#include "../bridge/br_private.h" +#endif + +static void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); struct sock *sk; u32 *dest = ®s->data[priv->dreg]; +#ifdef CONFIG_NF_TABLES_BRIDGE + const struct net_bridge_port *p; +#endif switch (priv->key) { case NFT_META_LEN: @@ -214,6 +230,18 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_SECPATH: nft_reg_store8(dest, !!skb->sp); break; +#endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + if (in == NULL || (p = br_port_get_rcu(in)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; + case NFT_META_BRI_OIFNAME: + if (out == NULL || (p = br_port_get_rcu(out)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; #endif default: WARN_ON(1); @@ -224,11 +252,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, err: regs->verdict.code = NFT_BREAK; } -EXPORT_SYMBOL_GPL(nft_meta_get_eval); -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -258,18 +285,16 @@ void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } -EXPORT_SYMBOL_GPL(nft_meta_set_eval); -const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; -EXPORT_SYMBOL_GPL(nft_meta_policy); -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -317,6 +342,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_SECPATH: len = sizeof(u8); break; +#endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + if (ctx->family != NFPROTO_BRIDGE) + return -EOPNOTSUPP; 
+ len = IFNAMSIZ; + break; #endif default: return -EOPNOTSUPP; @@ -326,7 +359,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } -EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -360,9 +392,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -388,11 +420,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } -EXPORT_SYMBOL_GPL(nft_meta_set_validate); -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -424,10 +455,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return 0; } -EXPORT_SYMBOL_GPL(nft_meta_set_init); -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -440,10 +470,8 @@ int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -457,17 +485,15 @@ int nft_meta_set_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_set_dump); -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } -EXPORT_SYMBOL_GPL(nft_meta_set_destroy); static struct nft_expr_type nft_meta_type; static const struct nft_expr_ops nft_meta_get_ops = { -- cgit v1.2.3 From f0316f93897c4c4e67278b175bfbfd3a95ba650a Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 5 Dec 2015 18:41:28 +0000 Subject: drm/i2c: tda9950: add CEC driver Add a CEC driver for the TDA9950, which is a stand-alone I2C CEC device, but is also integrated into HDMI transceivers such as the TDA9989 and TDA19989. The TDA9950 contains a command processor which handles retransmissions and the low level bus protocol. The driver just has to read and write the messages, and handle error conditions. 
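As a sketch of that mailbox exchange (condensed from the driver added
below), a transmit request is one multi-byte write of {length, service
id, CEC payload} starting at REG_CDR0; the confirmation and any incoming
messages are later read back from the same registers when the interrupt
asserts:

	u8 buf[CEC_MAX_MSG_SIZE + 2];

	buf[0] = 2 + msg->len;	/* frame length, counting these two header bytes */
	buf[1] = CDR1_REQ;	/* service id: transmit request */
	memcpy(buf + 2, msg->msg, msg->len);
	tda9950_write_range(priv->client, REG_CDR0, buf, 2 + msg->len);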
Reviewed-by: Hans Verkuil Signed-off-by: Russell King --- drivers/gpu/drm/i2c/Kconfig | 5 + drivers/gpu/drm/i2c/Makefile | 1 + drivers/gpu/drm/i2c/tda9950.c | 509 ++++++++++++++++++++++++++++++++++ include/linux/platform_data/tda9950.h | 16 ++ 4 files changed, 531 insertions(+) create mode 100644 drivers/gpu/drm/i2c/tda9950.c create mode 100644 include/linux/platform_data/tda9950.h (limited to 'include') diff --git a/drivers/gpu/drm/i2c/Kconfig b/drivers/gpu/drm/i2c/Kconfig index a6c92beb410a..3a232f5ff0a1 100644 --- a/drivers/gpu/drm/i2c/Kconfig +++ b/drivers/gpu/drm/i2c/Kconfig @@ -26,4 +26,9 @@ config DRM_I2C_NXP_TDA998X help Support for NXP Semiconductors TDA998X HDMI encoders. +config DRM_I2C_NXP_TDA9950 + tristate "NXP Semiconductors TDA9950/TDA998X HDMI CEC" + select CEC_NOTIFIER + select CEC_CORE + endmenu diff --git a/drivers/gpu/drm/i2c/Makefile b/drivers/gpu/drm/i2c/Makefile index b20100c18ffb..a962f6f08568 100644 --- a/drivers/gpu/drm/i2c/Makefile +++ b/drivers/gpu/drm/i2c/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_DRM_I2C_SIL164) += sil164.o tda998x-y := tda998x_drv.o obj-$(CONFIG_DRM_I2C_NXP_TDA998X) += tda998x.o +obj-$(CONFIG_DRM_I2C_NXP_TDA9950) += tda9950.o diff --git a/drivers/gpu/drm/i2c/tda9950.c b/drivers/gpu/drm/i2c/tda9950.c new file mode 100644 index 000000000000..3f7396caad48 --- /dev/null +++ b/drivers/gpu/drm/i2c/tda9950.c @@ -0,0 +1,509 @@ +/* + * TDA9950 Consumer Electronics Control driver + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * The NXP TDA9950 implements the HDMI Consumer Electronics Control + * interface. The host interface is similar to a mailbox: the data + * registers starting at REG_CDR0 are written to send a command to the + * internal CPU, and replies are read from these registers. + * + * As the data registers represent a mailbox, they must be accessed + * as a single I2C transaction. See the TDA9950 data sheet for details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum { + REG_CSR = 0x00, + CSR_BUSY = BIT(7), + CSR_INT = BIT(6), + CSR_ERR = BIT(5), + + REG_CER = 0x01, + + REG_CVR = 0x02, + + REG_CCR = 0x03, + CCR_RESET = BIT(7), + CCR_ON = BIT(6), + + REG_ACKH = 0x04, + REG_ACKL = 0x05, + + REG_CCONR = 0x06, + CCONR_ENABLE_ERROR = BIT(4), + CCONR_RETRY_MASK = 7, + + REG_CDR0 = 0x07, + + CDR1_REQ = 0x00, + CDR1_CNF = 0x01, + CDR1_IND = 0x81, + CDR1_ERR = 0x82, + CDR1_IER = 0x83, + + CDR2_CNF_SUCCESS = 0x00, + CDR2_CNF_OFF_STATE = 0x80, + CDR2_CNF_BAD_REQ = 0x81, + CDR2_CNF_CEC_ACCESS = 0x82, + CDR2_CNF_ARB_ERROR = 0x83, + CDR2_CNF_BAD_TIMING = 0x84, + CDR2_CNF_NACK_ADDR = 0x85, + CDR2_CNF_NACK_DATA = 0x86, +}; + +struct tda9950_priv { + struct i2c_client *client; + struct device *hdmi; + struct cec_adapter *adap; + struct tda9950_glue *glue; + u16 addresses; + struct cec_msg rx_msg; + struct cec_notifier *notify; + bool open; +}; + +static int tda9950_write_range(struct i2c_client *client, u8 addr, u8 *p, int cnt) +{ + struct i2c_msg msg; + u8 buf[cnt + 1]; + int ret; + + buf[0] = addr; + memcpy(buf + 1, p, cnt); + + msg.addr = client->addr; + msg.flags = 0; + msg.len = cnt + 1; + msg.buf = buf; + + dev_dbg(&client->dev, "wr 0x%02x: %*ph\n", addr, cnt, p); + + ret = i2c_transfer(client->adapter, &msg, 1); + if (ret < 0) + dev_err(&client->dev, "Error %d writing to cec:0x%x\n", ret, addr); + return ret < 0 ? 
ret : 0; +} + +static void tda9950_write(struct i2c_client *client, u8 addr, u8 val) +{ + tda9950_write_range(client, addr, &val, 1); +} + +static int tda9950_read_range(struct i2c_client *client, u8 addr, u8 *p, int cnt) +{ + struct i2c_msg msg[2]; + int ret; + + msg[0].addr = client->addr; + msg[0].flags = 0; + msg[0].len = 1; + msg[0].buf = &addr; + msg[1].addr = client->addr; + msg[1].flags = I2C_M_RD; + msg[1].len = cnt; + msg[1].buf = p; + + ret = i2c_transfer(client->adapter, msg, 2); + if (ret < 0) + dev_err(&client->dev, "Error %d reading from cec:0x%x\n", ret, addr); + + dev_dbg(&client->dev, "rd 0x%02x: %*ph\n", addr, cnt, p); + + return ret; +} + +static u8 tda9950_read(struct i2c_client *client, u8 addr) +{ + int ret; + u8 val; + + ret = tda9950_read_range(client, addr, &val, 1); + if (ret < 0) + val = 0; + + return val; +} + +static irqreturn_t tda9950_irq(int irq, void *data) +{ + struct tda9950_priv *priv = data; + unsigned int tx_status; + u8 csr, cconr, buf[19]; + u8 arb_lost_cnt, nack_cnt, err_cnt; + + if (!priv->open) + return IRQ_NONE; + + csr = tda9950_read(priv->client, REG_CSR); + if (!(csr & CSR_INT)) + return IRQ_NONE; + + cconr = tda9950_read(priv->client, REG_CCONR) & CCONR_RETRY_MASK; + + tda9950_read_range(priv->client, REG_CDR0, buf, sizeof(buf)); + + /* + * This should never happen: the data sheet says that there will + * always be a valid message if the interrupt line is asserted. + */ + if (buf[0] == 0) { + dev_warn(&priv->client->dev, "interrupt pending, but no message?\n"); + return IRQ_NONE; + } + + switch (buf[1]) { + case CDR1_CNF: /* transmit result */ + arb_lost_cnt = nack_cnt = err_cnt = 0; + switch (buf[2]) { + case CDR2_CNF_SUCCESS: + tx_status = CEC_TX_STATUS_OK; + break; + + case CDR2_CNF_ARB_ERROR: + tx_status = CEC_TX_STATUS_ARB_LOST; + arb_lost_cnt = cconr; + break; + + case CDR2_CNF_NACK_ADDR: + tx_status = CEC_TX_STATUS_NACK; + nack_cnt = cconr; + break; + + default: /* some other error, refer to TDA9950 docs */ + dev_err(&priv->client->dev, "CNF reply error 0x%02x\n", + buf[2]); + tx_status = CEC_TX_STATUS_ERROR; + err_cnt = cconr; + break; + } + /* TDA9950 executes all retries for us */ + tx_status |= CEC_TX_STATUS_MAX_RETRIES; + cec_transmit_done(priv->adap, tx_status, arb_lost_cnt, + nack_cnt, 0, err_cnt); + break; + + case CDR1_IND: + priv->rx_msg.len = buf[0] - 2; + if (priv->rx_msg.len > CEC_MAX_MSG_SIZE) + priv->rx_msg.len = CEC_MAX_MSG_SIZE; + + memcpy(priv->rx_msg.msg, buf + 2, priv->rx_msg.len); + cec_received_msg(priv->adap, &priv->rx_msg); + break; + + default: /* unknown */ + dev_err(&priv->client->dev, "unknown service id 0x%02x\n", + buf[1]); + break; + } + + return IRQ_HANDLED; +} + +static int tda9950_cec_transmit(struct cec_adapter *adap, u8 attempts, + u32 signal_free_time, struct cec_msg *msg) +{ + struct tda9950_priv *priv = adap->priv; + u8 buf[CEC_MAX_MSG_SIZE + 2]; + + buf[0] = 2 + msg->len; + buf[1] = CDR1_REQ; + memcpy(buf + 2, msg->msg, msg->len); + + if (attempts > 5) + attempts = 5; + + tda9950_write(priv->client, REG_CCONR, attempts); + + return tda9950_write_range(priv->client, REG_CDR0, buf, 2 + msg->len); +} + +static int tda9950_cec_adap_log_addr(struct cec_adapter *adap, u8 addr) +{ + struct tda9950_priv *priv = adap->priv; + u16 addresses; + u8 buf[2]; + + if (addr == CEC_LOG_ADDR_INVALID) + addresses = priv->addresses = 0; + else + addresses = priv->addresses |= BIT(addr); + + /* TDA9950 doesn't want address 15 set */ + addresses &= 0x7fff; + buf[0] = addresses >> 8; + buf[1] = addresses; + + return 
tda9950_write_range(priv->client, REG_ACKH, buf, 2); +} + +/* + * When operating as part of the TDA998x, we need additional handling + * to initialise and shut down the TDA9950 part of the device. These + * two hooks are provided to allow the TDA998x code to perform those + * activities. + */ +static int tda9950_glue_open(struct tda9950_priv *priv) +{ + int ret = 0; + + if (priv->glue && priv->glue->open) + ret = priv->glue->open(priv->glue->data); + + priv->open = true; + + return ret; +} + +static void tda9950_glue_release(struct tda9950_priv *priv) +{ + priv->open = false; + + if (priv->glue && priv->glue->release) + priv->glue->release(priv->glue->data); +} + +static int tda9950_open(struct tda9950_priv *priv) +{ + struct i2c_client *client = priv->client; + int ret; + + ret = tda9950_glue_open(priv); + if (ret) + return ret; + + /* Reset the TDA9950, and wait 250ms for it to recover */ + tda9950_write(client, REG_CCR, CCR_RESET); + msleep(250); + + tda9950_cec_adap_log_addr(priv->adap, CEC_LOG_ADDR_INVALID); + + /* Start the command processor */ + tda9950_write(client, REG_CCR, CCR_ON); + + return 0; +} + +static void tda9950_release(struct tda9950_priv *priv) +{ + struct i2c_client *client = priv->client; + int timeout = 50; + u8 csr; + + /* Stop the command processor */ + tda9950_write(client, REG_CCR, 0); + + /* Wait up to .5s for it to signal non-busy */ + do { + csr = tda9950_read(client, REG_CSR); + if (!(csr & CSR_BUSY) || --timeout) + break; + msleep(10); + } while (1); + + /* Warn the user that their IRQ may die if it's shared. */ + if (csr & CSR_BUSY) + dev_warn(&client->dev, "command processor failed to stop, irq%d may die (csr=0x%02x)\n", + client->irq, csr); + + tda9950_glue_release(priv); +} + +static int tda9950_cec_adap_enable(struct cec_adapter *adap, bool enable) +{ + struct tda9950_priv *priv = adap->priv; + + if (!enable) { + tda9950_release(priv); + return 0; + } else { + return tda9950_open(priv); + } +} + +static const struct cec_adap_ops tda9950_cec_ops = { + .adap_enable = tda9950_cec_adap_enable, + .adap_log_addr = tda9950_cec_adap_log_addr, + .adap_transmit = tda9950_cec_transmit, +}; + +/* + * When operating as part of the TDA998x, we need to claim additional + * resources. These two hooks permit the management of those resources. + */ +static void tda9950_devm_glue_exit(void *data) +{ + struct tda9950_glue *glue = data; + + if (glue && glue->exit) + glue->exit(glue->data); +} + +static int tda9950_devm_glue_init(struct device *dev, struct tda9950_glue *glue) +{ + int ret; + + if (glue && glue->init) { + ret = glue->init(glue->data); + if (ret) + return ret; + } + + ret = devm_add_action(dev, tda9950_devm_glue_exit, glue); + if (ret) + tda9950_devm_glue_exit(glue); + + return ret; +} + +static void tda9950_cec_del(void *data) +{ + struct tda9950_priv *priv = data; + + cec_delete_adapter(priv->adap); +} + +static int tda9950_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct tda9950_glue *glue = client->dev.platform_data; + struct device *dev = &client->dev; + struct tda9950_priv *priv; + unsigned long irqflags; + int ret; + u8 cvr; + + /* + * We must have I2C functionality: our multi-byte accesses + * must be performed as a single contiguous transaction. + */ + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(&client->dev, + "adapter does not support I2C functionality\n"); + return -ENXIO; + } + + /* We must have an interrupt to be functional. 
*/ + if (client->irq <= 0) { + dev_err(&client->dev, "driver requires an interrupt\n"); + return -ENXIO; + } + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->client = client; + priv->glue = glue; + + i2c_set_clientdata(client, priv); + + /* + * If we're part of a TDA998x, we want the class devices to be + * associated with the HDMI Tx so we have a tight relationship + * between the HDMI interface and the CEC interface. + */ + priv->hdmi = dev; + if (glue && glue->parent) + priv->hdmi = glue->parent; + + priv->adap = cec_allocate_adapter(&tda9950_cec_ops, priv, "tda9950", + CEC_CAP_DEFAULTS, + CEC_MAX_LOG_ADDRS); + if (IS_ERR(priv->adap)) + return PTR_ERR(priv->adap); + + ret = devm_add_action(dev, tda9950_cec_del, priv); + if (ret) { + cec_delete_adapter(priv->adap); + return ret; + } + + ret = tda9950_devm_glue_init(dev, glue); + if (ret) + return ret; + + ret = tda9950_glue_open(priv); + if (ret) + return ret; + + cvr = tda9950_read(client, REG_CVR); + + dev_info(&client->dev, + "TDA9950 CEC interface, hardware version %u.%u\n", + cvr >> 4, cvr & 15); + + tda9950_glue_release(priv); + + irqflags = IRQF_TRIGGER_FALLING; + if (glue) + irqflags = glue->irq_flags; + + ret = devm_request_threaded_irq(dev, client->irq, NULL, tda9950_irq, + irqflags | IRQF_SHARED | IRQF_ONESHOT, + dev_name(&client->dev), priv); + if (ret < 0) + return ret; + + priv->notify = cec_notifier_get(priv->hdmi); + if (!priv->notify) + return -ENOMEM; + + ret = cec_register_adapter(priv->adap, priv->hdmi); + if (ret < 0) { + cec_notifier_put(priv->notify); + return ret; + } + + /* + * CEC documentation says we must not call cec_delete_adapter + * after a successful call to cec_register_adapter(). + */ + devm_remove_action(dev, tda9950_cec_del, priv); + + cec_register_cec_notifier(priv->adap, priv->notify); + + return 0; +} + +static int tda9950_remove(struct i2c_client *client) +{ + struct tda9950_priv *priv = i2c_get_clientdata(client); + + cec_unregister_adapter(priv->adap); + cec_notifier_put(priv->notify); + + return 0; +} + +static struct i2c_device_id tda9950_ids[] = { + { "tda9950", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(i2c, tda9950_ids); + +static struct i2c_driver tda9950_driver = { + .probe = tda9950_probe, + .remove = tda9950_remove, + .driver = { + .name = "tda9950", + }, + .id_table = tda9950_ids, +}; + +module_i2c_driver(tda9950_driver); + +MODULE_AUTHOR("Russell King "); +MODULE_DESCRIPTION("TDA9950/TDA998x Consumer Electronics Control Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/tda9950.h b/include/linux/platform_data/tda9950.h new file mode 100644 index 000000000000..c65efd461102 --- /dev/null +++ b/include/linux/platform_data/tda9950.h @@ -0,0 +1,16 @@ +#ifndef LINUX_PLATFORM_DATA_TDA9950_H +#define LINUX_PLATFORM_DATA_TDA9950_H + +struct device; + +struct tda9950_glue { + struct device *parent; + unsigned long irq_flags; + void *data; + int (*init)(void *); + void (*exit)(void *); + int (*open)(void *); + void (*release)(void *); +}; + +#endif -- cgit v1.2.3 From 4773e77cdc9b3af93ee1ae7bcf2acf94fde17166 Mon Sep 17 00:00:00 2001 From: Prashanth Prakash Date: Wed, 4 Apr 2018 12:14:50 -0600 Subject: ACPI / CPPC: Add support for CPPC v3 CPPC V3 introduces two new entries to make it easier to convert between abstract processor performance and frequency. The two new entries are lowest frequency and nominal frequency. These are the frequencies corresponding to lowest and nominal abstract performance. 
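With both anchor points available, a cpufreq driver can convert between
the two scales by linear interpolation. A hypothetical sketch (the
function name and the zero-span guard are illustrative only, not part of
this patch; units are whatever the firmware reported):

	static u64 cppc_perf_to_freq(struct cppc_perf_caps *caps, u64 perf)
	{
		u64 span_f = caps->nominal_freq - caps->lowest_freq;
		u64 span_p = caps->nominal_perf - caps->lowest_perf;

		if (!span_p)	/* degenerate table: no usable slope */
			return caps->nominal_freq;

		return caps->lowest_freq +
		       div64_u64(span_f * (perf - caps->lowest_perf), span_p);
	}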
Add support to read the new entries and populate them as part of the CPPC performance capabilities which can be used by cpufreq drivers Signed-off-by: Prashanth Prakash Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 81 ++++++++++++++++++++++++++++++++++++++---------- include/acpi/cppc_acpi.h | 14 ++++++--- 2 files changed, 75 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 735c74a4cbdb..8fa3d3a6e5b6 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -156,6 +156,9 @@ show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, highest_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, nominal_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_nonlinear_perf); +show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_freq); +show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, nominal_freq); + show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, reference_perf); show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, wraparound_time); @@ -183,6 +186,8 @@ static struct attribute *cppc_attrs[] = { &lowest_perf.attr, &lowest_nonlinear_perf.attr, &nominal_perf.attr, + &nominal_freq.attr, + &lowest_freq.attr, NULL }; @@ -613,7 +618,6 @@ bool __weak cpc_ffh_supported(void) return false; } - /** * pcc_data_alloc() - Allocate the pcc_data memory for pcc subspace * @@ -641,6 +645,34 @@ int pcc_data_alloc(int pcc_ss_id) return 0; } + +/* Check if CPPC revision + num_ent combination is supported */ +static bool is_cppc_supported(int revision, int num_ent) +{ + int expected_num_ent; + + switch (revision) { + case CPPC_V2_REV: + expected_num_ent = CPPC_V2_NUM_ENT; + break; + case CPPC_V3_REV: + expected_num_ent = CPPC_V3_NUM_ENT; + break; + default: + pr_debug("Firmware exports unsupported CPPC revision: %d\n", + revision); + return false; + } + + if (expected_num_ent != num_ent) { + pr_debug("Firmware exports %d entries. Expected: %d for CPPC rev:%d\n", + num_ent, expected_num_ent, revision); + return false; + } + + return true; +} + /* * An example CPC table looks like the following. * @@ -731,14 +763,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) cpc_obj->type); goto out_free; } - - /* Only support CPPCv2. Bail otherwise. */ - if (num_ent != CPPC_NUM_ENT) { - pr_debug("Firmware exports %d entries. Expected: %d\n", - num_ent, CPPC_NUM_ENT); - goto out_free; - } - cpc_ptr->num_entries = num_ent; /* Second entry should be revision. */ @@ -750,12 +774,10 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) cpc_obj->type); goto out_free; } + cpc_ptr->version = cpc_rev; - if (cpc_rev != CPPC_REV) { - pr_debug("Firmware exports revision:%d. Expected:%d\n", - cpc_rev, CPPC_REV); + if (!is_cppc_supported(cpc_rev, num_ent)) goto out_free; - } /* Iterate through remaining entries in _CPC */ for (i = 2; i < num_ent; i++) { @@ -808,6 +830,18 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) } } per_cpu(cpu_pcc_subspace_idx, pr->id) = pcc_subspace_id; + + /* + * Initialize the remaining cpc_regs as unsupported. 
+	 * Example: In case FW exposes CPPC v2, the below loop will initialize
+	 * LOWEST_FREQ and NOMINAL_FREQ regs as unsupported
+	 */
+	for (i = num_ent - 2; i < MAX_CPC_REG_ENT; i++) {
+		cpc_ptr->cpc_regs[i].type = ACPI_TYPE_INTEGER;
+		cpc_ptr->cpc_regs[i].cpc_entry.int_value = 0;
+	}
+
+
 	/* Store CPU Logical ID */
 	cpc_ptr->cpu_id = pr->id;
@@ -1037,8 +1071,9 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps)
 {
 	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
 	struct cpc_register_resource *highest_reg, *lowest_reg,
-		*lowest_non_linear_reg, *nominal_reg;
-	u64 high, low, nom, min_nonlinear;
+		*lowest_non_linear_reg, *nominal_reg,
+		*low_freq_reg = NULL, *nom_freq_reg = NULL;
+	u64 high, low, nom, min_nonlinear, low_f = 0, nom_f = 0;
 	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
 	struct cppc_pcc_data *pcc_ss_data;
 	int ret = 0, regs_in_pcc = 0;
@@ -1053,10 +1088,13 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps)
 	lowest_reg = &cpc_desc->cpc_regs[LOWEST_PERF];
 	lowest_non_linear_reg = &cpc_desc->cpc_regs[LOW_NON_LINEAR_PERF];
 	nominal_reg = &cpc_desc->cpc_regs[NOMINAL_PERF];
+	low_freq_reg = &cpc_desc->cpc_regs[LOWEST_FREQ];
+	nom_freq_reg = &cpc_desc->cpc_regs[NOMINAL_FREQ];

 	/* Are any of the regs PCC ?*/
 	if (CPC_IN_PCC(highest_reg) || CPC_IN_PCC(lowest_reg) ||
-	    CPC_IN_PCC(lowest_non_linear_reg) || CPC_IN_PCC(nominal_reg)) {
+	    CPC_IN_PCC(lowest_non_linear_reg) || CPC_IN_PCC(nominal_reg) ||
+	    CPC_IN_PCC(low_freq_reg) || CPC_IN_PCC(nom_freq_reg)) {
 		regs_in_pcc = 1;
 		down_write(&pcc_ss_data->pcc_lock);
 		/* Ring doorbell once to update PCC subspace */
@@ -1081,6 +1119,17 @@
 	if (!high || !low || !nom || !min_nonlinear)
 		ret = -EFAULT;

+	/* Read optional lowest and nominal frequencies if present */
+	if (CPC_SUPPORTED(low_freq_reg))
+		cpc_read(cpunum, low_freq_reg, &low_f);
+
+	if (CPC_SUPPORTED(nom_freq_reg))
+		cpc_read(cpunum, nom_freq_reg, &nom_f);
+
+	perf_caps->lowest_freq = low_f;
+	perf_caps->nominal_freq = nom_f;
+
+
 out_err:
 	if (regs_in_pcc)
 		up_write(&pcc_ss_data->pcc_lock);
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 2010c0516f27..8e0b8250a139 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -20,14 +20,16 @@
 #include
 #include

-/* Only support CPPCv2 for now. */
-#define CPPC_NUM_ENT 21
-#define CPPC_REV 2
+/* Support CPPCv2 and CPPCv3 */
+#define CPPC_V2_REV 2
+#define CPPC_V3_REV 3
+#define CPPC_V2_NUM_ENT 21
+#define CPPC_V3_NUM_ENT 23

 #define PCC_CMD_COMPLETE_MASK (1 << 0)
 #define PCC_ERROR_MASK (1 << 2)

-#define MAX_CPC_REG_ENT 19
+#define MAX_CPC_REG_ENT 21

 /* CPPC specific PCC commands. */
 #define CMD_READ 0
@@ -91,6 +93,8 @@ enum cppc_regs {
 	AUTO_ACT_WINDOW,
 	ENERGY_PERF,
 	REFERENCE_PERF,
+	LOWEST_FREQ,
+	NOMINAL_FREQ,
 };

 /*
@@ -104,6 +108,8 @@ struct cppc_perf_caps {
 	u32 nominal_perf;
 	u32 lowest_perf;
 	u32 lowest_nonlinear_perf;
+	u32 lowest_freq;
+	u32 nominal_freq;
 };

 struct cppc_perf_ctrls {
-- cgit v1.2.3


From a9c2dfc8527318a27db045cd7ea51e8ecab8c884 Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Mon, 23 Apr 2018 17:24:56 +0200
Subject: ALSA: hda - Use a macro for snd_array iteration loops

Introduce a new helper macro, snd_array_for_each(), to iterate over the
elements of a snd_array. It reads slightly better than the lengthy
open-coded loops at each call site. Along with it, add const prefixes in
some obvious places.

There should be no functional change from this.
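For reference, given the definition added below, a loop written as

	snd_array_for_each(&spec->paths, i, path)

expands to the same open-coded iteration it replaces:

	for (i = 0, path = (&spec->paths)->list;
	     i < (&spec->paths)->used;
	     path = snd_array_elem(&spec->paths, ++i))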
Signed-off-by: Takashi Iwai
---
 include/sound/hdaudio.h         |  5 +++++
 sound/hda/hdac_regmap.c         |  4 ++--
 sound/pci/hda/hda_auto_parser.c | 10 +++++-----
 sound/pci/hda/hda_codec.c       | 36 ++++++++++++++++++------------------
 sound/pci/hda/hda_generic.c     | 27 +++++++++++++--------------
 sound/pci/hda/hda_sysfs.c       | 20 ++++++++++----------
 sound/pci/hda/patch_conexant.c  |  5 ++---
 sound/pci/hda/patch_realtek.c   |  4 ++--
 8 files changed, 57 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h
index 06536e01ed94..c052afc27547 100644
--- a/include/sound/hdaudio.h
+++ b/include/sound/hdaudio.h
@@ -571,4 +571,9 @@ static inline unsigned int snd_array_index(struct snd_array *array, void *ptr)
 	return (unsigned long)(ptr - array->list) / array->elem_size;
 }
 
+/* a helper macro to iterate for each snd_array element */
+#define snd_array_for_each(array, idx, ptr) \
+	for ((idx) = 0, (ptr) = (array)->list; (idx) < (array)->used; \
+	     (ptr) = snd_array_elem(array, ++(idx)))
+
 #endif /* __SOUND_HDAUDIO_H */
diff --git a/sound/hda/hdac_regmap.c b/sound/hda/hdac_regmap.c
index 47a358fab132..419e285e0226 100644
--- a/sound/hda/hdac_regmap.c
+++ b/sound/hda/hdac_regmap.c
@@ -65,10 +65,10 @@ static bool hda_writeable_reg(struct device *dev, unsigned int reg)
 {
 	struct hdac_device *codec = dev_to_hdac_dev(dev);
 	unsigned int verb = get_verb(reg);
+	const unsigned int *v;
 	int i;
 
-	for (i = 0; i < codec->vendor_verbs.used; i++) {
-		unsigned int *v = snd_array_elem(&codec->vendor_verbs, i);
+	snd_array_for_each(&codec->vendor_verbs, i, v) {
 		if (verb == *v)
 			return true;
 	}
diff --git a/sound/pci/hda/hda_auto_parser.c b/sound/pci/hda/hda_auto_parser.c
index d3ea73171a3d..b9a6b66aeb0e 100644
--- a/sound/pci/hda/hda_auto_parser.c
+++ b/sound/pci/hda/hda_auto_parser.c
@@ -793,11 +793,11 @@ EXPORT_SYMBOL_GPL(snd_hda_add_verbs);
  */
 void snd_hda_apply_verbs(struct hda_codec *codec)
 {
+	const struct hda_verb **v;
 	int i;
-	for (i = 0; i < codec->verbs.used; i++) {
-		struct hda_verb **v = snd_array_elem(&codec->verbs, i);
+
+	snd_array_for_each(&codec->verbs, i, v)
 		snd_hda_sequence_write(codec, *v);
-	}
 }
 EXPORT_SYMBOL_GPL(snd_hda_apply_verbs);
 
@@ -890,10 +890,10 @@ EXPORT_SYMBOL_GPL(snd_hda_apply_fixup);
 static bool pin_config_match(struct hda_codec *codec,
 			     const struct hda_pintbl *pins)
 {
+	const struct hda_pincfg *pin;
 	int i;
 
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		hda_nid_t nid = pin->nid;
 		u32 cfg = pin->cfg;
 		const struct hda_pintbl *t_pins;
diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index 5bc3a7468e17..0aa923d129f5 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -481,9 +481,10 @@ static struct hda_pincfg *look_up_pincfg(struct hda_codec *codec,
 					 struct snd_array *array,
 					 hda_nid_t nid)
 {
+	struct hda_pincfg *pin;
 	int i;
-	for (i = 0; i < array->used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(array, i);
+
+	snd_array_for_each(array, i, pin) {
 		if (pin->nid == nid)
 			return pin;
 	}
@@ -618,14 +619,15 @@ EXPORT_SYMBOL_GPL(snd_hda_codec_get_pin_target);
  */
 void snd_hda_shutup_pins(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
+
 	/* don't shut up pins when unloading the driver; otherwise it breaks
 	 * the default pin setup at the next load of the driver
 	 */
 	if (codec->bus->shutdown)
 		return;
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		/* use read here for syncing after issuing each verb */
 		snd_hda_codec_read(codec, pin->nid, 0,
 				   AC_VERB_SET_PIN_WIDGET_CONTROL, 0);
@@ -638,13 +640,14 @@ EXPORT_SYMBOL_GPL(snd_hda_shutup_pins);
 /* Restore the pin controls cleared previously via snd_hda_shutup_pins() */
 static void restore_shutup_pins(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
+
 	if (!codec->pins_shutup)
 		return;
 	if (codec->bus->shutdown)
 		return;
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		snd_hda_codec_write(codec, pin->nid, 0,
 				    AC_VERB_SET_PIN_WIDGET_CONTROL,
 				    pin->ctrl);
@@ -697,8 +700,7 @@ get_hda_cvt_setup(struct hda_codec *codec, hda_nid_t nid)
 	struct hda_cvt_setup *p;
 	int i;
 
-	for (i = 0; i < codec->cvt_setups.used; i++) {
-		p = snd_array_elem(&codec->cvt_setups, i);
+	snd_array_for_each(&codec->cvt_setups, i, p) {
 		if (p->nid == nid)
 			return p;
 	}
@@ -1076,8 +1078,7 @@ void snd_hda_codec_setup_stream(struct hda_codec *codec, hda_nid_t nid,
 	/* make other inactive cvts with the same stream-tag dirty */
 	type = get_wcaps_type(get_wcaps(codec, nid));
 	list_for_each_codec(c, codec->bus) {
-		for (i = 0; i < c->cvt_setups.used; i++) {
-			p = snd_array_elem(&c->cvt_setups, i);
+		snd_array_for_each(&c->cvt_setups, i, p) {
 			if (!p->active && p->stream_tag == stream_tag &&
 			    get_wcaps_type(get_wcaps(c, p->nid)) == type)
 				p->dirty = 1;
@@ -1140,12 +1141,11 @@ static void really_cleanup_stream(struct hda_codec *codec,
 static void purify_inactive_streams(struct hda_codec *codec)
 {
 	struct hda_codec *c;
+	struct hda_cvt_setup *p;
 	int i;
 
 	list_for_each_codec(c, codec->bus) {
-		for (i = 0; i < c->cvt_setups.used; i++) {
-			struct hda_cvt_setup *p;
-			p = snd_array_elem(&c->cvt_setups, i);
+		snd_array_for_each(&c->cvt_setups, i, p) {
 			if (p->dirty)
 				really_cleanup_stream(c, p);
 		}
@@ -1156,10 +1156,10 @@ static void purify_inactive_streams(struct hda_codec *codec)
 /* clean up all streams; called from suspend */
 static void hda_cleanup_all_streams(struct hda_codec *codec)
 {
+	struct hda_cvt_setup *p;
 	int i;
 
-	for (i = 0; i < codec->cvt_setups.used; i++) {
-		struct hda_cvt_setup *p = snd_array_elem(&codec->cvt_setups, i);
+	snd_array_for_each(&codec->cvt_setups, i, p) {
 		if (p->stream_tag)
 			really_cleanup_stream(codec, p);
 	}
@@ -2461,10 +2461,10 @@ EXPORT_SYMBOL_GPL(snd_hda_create_dig_out_ctls);
 struct hda_spdif_out *snd_hda_spdif_out_of_nid(struct hda_codec *codec,
 					       hda_nid_t nid)
 {
+	struct hda_spdif_out *spdif;
 	int i;
-	for (i = 0; i < codec->spdif_out.used; i++) {
-		struct hda_spdif_out *spdif =
-			snd_array_elem(&codec->spdif_out, i);
+
+	snd_array_for_each(&codec->spdif_out, i, spdif) {
 		if (spdif->nid == nid)
 			return spdif;
 	}
diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c
index 5cc65093d941..51030f040745 100644
--- a/sound/pci/hda/hda_generic.c
+++ b/sound/pci/hda/hda_generic.c
@@ -264,10 +264,10 @@ static struct nid_path *get_nid_path(struct hda_codec *codec,
 				     int anchor_nid)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	struct nid_path *path;
 	int i;
 
-	for (i = 0; i < spec->paths.used; i++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, i);
+	snd_array_for_each(&spec->paths, i, path) {
 		if (path->depth <= 0)
 			continue;
 		if ((!from_nid || path->path[0] == from_nid) &&
@@ -325,10 +325,10 @@ EXPORT_SYMBOL_GPL(snd_hda_get_path_from_idx);
 static bool is_dac_already_used(struct hda_codec *codec, hda_nid_t nid)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	const struct nid_path *path;
 	int i;
 
-	for (i = 0; i < spec->paths.used; i++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, i);
+	snd_array_for_each(&spec->paths, i, path) {
 		if (path->path[0] == nid)
 			return true;
 	}
@@ -351,11 +351,11 @@ static bool is_reachable_path(struct hda_codec *codec,
 static bool is_ctl_used(struct hda_codec *codec, unsigned int val, int type)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	const struct nid_path *path;
 	int i;
 
 	val &= AMP_VAL_COMPARE_MASK;
-	for (i = 0; i < spec->paths.used; i++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, i);
+	snd_array_for_each(&spec->paths, i, path) {
 		if ((path->ctls[type] & AMP_VAL_COMPARE_MASK) == val)
 			return true;
 	}
@@ -638,13 +638,13 @@ static bool is_active_nid(struct hda_codec *codec, hda_nid_t nid,
 {
 	struct hda_gen_spec *spec = codec->spec;
 	int type = get_wcaps_type(get_wcaps(codec, nid));
+	const struct nid_path *path;
 	int i, n;
 
 	if (nid == codec->core.afg)
 		return true;
 
-	for (n = 0; n < spec->paths.used; n++) {
-		struct nid_path *path = snd_array_elem(&spec->paths, n);
+	snd_array_for_each(&spec->paths, n, path) {
 		if (!path->active)
 			continue;
 		if (codec->power_save_node) {
@@ -2696,10 +2696,10 @@ static const struct snd_kcontrol_new out_jack_mode_enum = {
 static bool find_kctl_name(struct hda_codec *codec, const char *name, int idx)
 {
 	struct hda_gen_spec *spec = codec->spec;
+	const struct snd_kcontrol_new *kctl;
 	int i;
 
-	for (i = 0; i < spec->kctls.used; i++) {
-		struct snd_kcontrol_new *kctl = snd_array_elem(&spec->kctls, i);
+	snd_array_for_each(&spec->kctls, i, kctl) {
 		if (!strcmp(kctl->name, name) && kctl->index == idx)
 			return true;
 	}
@@ -4021,8 +4021,7 @@ static hda_nid_t set_path_power(struct hda_codec *codec, hda_nid_t nid,
 	struct nid_path *path;
 	int n;
 
-	for (n = 0; n < spec->paths.used; n++) {
-		path = snd_array_elem(&spec->paths, n);
+	snd_array_for_each(&spec->paths, n, path) {
 		if (!path->depth)
 			continue;
 		if (path->path[0] == nid ||
@@ -5831,10 +5830,10 @@ static void init_digital(struct hda_codec *codec)
  */
 static void clear_unsol_on_unused_pins(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
 
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		hda_nid_t nid = pin->nid;
 		if (is_jack_detectable(codec, nid) &&
 		    !snd_hda_jack_tbl_get(codec, nid))
diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c
index 9b7efece4484..6ec79c58d48d 100644
--- a/sound/pci/hda/hda_sysfs.c
+++ b/sound/pci/hda/hda_sysfs.c
@@ -80,10 +80,10 @@ static ssize_t pin_configs_show(struct hda_codec *codec,
 				struct snd_array *list,
 				char *buf)
 {
+	const struct hda_pincfg *pin;
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
-	for (i = 0; i < list->used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(list, i);
+	snd_array_for_each(list, i, pin) {
 		len += sprintf(buf + len, "0x%02x 0x%08x\n",
 			       pin->nid, pin->cfg);
 	}
@@ -217,10 +217,10 @@ static ssize_t init_verbs_show(struct device *dev,
			       char *buf)
 {
 	struct hda_codec *codec = dev_get_drvdata(dev);
+	const struct hda_verb *v;
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
-	for (i = 0; i < codec->init_verbs.used; i++) {
-		struct hda_verb *v = snd_array_elem(&codec->init_verbs, i);
+	snd_array_for_each(&codec->init_verbs, i, v) {
 		len += snprintf(buf + len, PAGE_SIZE - len,
 				"0x%02x 0x%03x 0x%04x\n",
 				v->nid, v->verb, v->param);
@@ -267,10 +267,10 @@ static ssize_t hints_show(struct device *dev,
			  char *buf)
 {
 	struct hda_codec *codec = dev_get_drvdata(dev);
+	const struct hda_hint *hint;
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
-	for (i = 0; i < codec->hints.used; i++) {
-		struct hda_hint *hint = snd_array_elem(&codec->hints, i);
+	snd_array_for_each(&codec->hints, i, hint) {
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s = %s\n",
 				hint->key, hint->val);
 	}
@@ -280,10 +280,10 @@ static ssize_t hints_show(struct device *dev,
 static struct hda_hint *get_hint(struct hda_codec *codec, const char *key)
 {
+	struct hda_hint *hint;
 	int i;
 
-	for (i = 0; i < codec->hints.used; i++) {
-		struct hda_hint *hint = snd_array_elem(&codec->hints, i);
+	snd_array_for_each(&codec->hints, i, hint) {
 		if (!strcmp(hint->key, key))
 			return hint;
 	}
@@ -783,13 +783,13 @@ void snd_hda_sysfs_init(struct hda_codec *codec)
 void snd_hda_sysfs_clear(struct hda_codec *codec)
 {
 #ifdef CONFIG_SND_HDA_RECONFIG
+	struct hda_hint *hint;
 	int i;
 
 	/* clear init verbs */
 	snd_array_free(&codec->init_verbs);
 	/* clear hints */
-	for (i = 0; i < codec->hints.used; i++) {
-		struct hda_hint *hint = snd_array_elem(&codec->hints, i);
+	snd_array_for_each(&codec->hints, i, hint) {
 		kfree(hint->key); /* we don't need to free hint->val */
 	}
 	snd_array_free(&codec->hints);
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 5b4dbcec6de8..093d2a9ece85 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -588,6 +588,7 @@ static void cxt_fixup_olpc_xo(struct hda_codec *codec,
				    const struct hda_fixup *fix, int action)
 {
 	struct conexant_spec *spec = codec->spec;
+	struct snd_kcontrol_new *kctl;
 	int i;
 
 	if (action != HDA_FIXUP_ACT_PROBE)
@@ -606,9 +607,7 @@ static void cxt_fixup_olpc_xo(struct hda_codec *codec,
 	snd_hda_codec_set_pin_target(codec, 0x1a, PIN_VREF50);
 
 	/* override mic boost control */
-	for (i = 0; i < spec->gen.kctls.used; i++) {
-		struct snd_kcontrol_new *kctl =
-			snd_array_elem(&spec->gen.kctls, i);
+	snd_array_for_each(&spec->gen.kctls, i, kctl) {
 		if (!strcmp(kctl->name, "Mic Boost Volume")) {
 			kctl->put = olpc_xo_mic_boost_put;
 			break;
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index aef1f52db7d9..7f2d5b157b75 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -2828,6 +2828,7 @@ static int find_ext_mic_pin(struct hda_codec *codec);
 
 static void alc286_shutup(struct hda_codec *codec)
 {
+	const struct hda_pincfg *pin;
 	int i;
 	int mic_pin = find_ext_mic_pin(codec);
 	/* don't shut up pins when unloading the driver; otherwise it breaks
@@ -2835,8 +2836,7 @@ static void alc286_shutup(struct hda_codec *codec)
 	 */
 	if (codec->bus->shutdown)
 		return;
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
+	snd_array_for_each(&codec->init_pins, i, pin) {
 		/* use read here for syncing after issuing each verb */
 		if (pin->nid != mic_pin)
 			snd_hda_codec_read(codec, pin->nid, 0,
--
cgit v1.2.3
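The conversion in this patch is mechanical: hoist the element pointer out of the loop body, then let snd_array_for_each() drive both the index and the pointer. A minimal user-space sketch of the idiom, for illustration only — struct snd_array_demo, its fields and main() are invented stand-ins; only the macro body mirrors the hdaudio.h hunk above:

#include <stdio.h>

/* invented stand-in for struct snd_array: a flat buffer of fixed-size elements */
struct snd_array_demo {
	unsigned int used;	/* number of elements currently in use */
	unsigned int elem_size;	/* size of one element in bytes */
	void *list;		/* the element buffer */
};

/* same role as snd_array_elem() in hdaudio.h: address of the idx-th element */
static void *snd_array_elem(struct snd_array_demo *array, unsigned int idx)
{
	return (char *)array->list + idx * array->elem_size;
}

/* the helper macro, as added to include/sound/hdaudio.h above */
#define snd_array_for_each(array, idx, ptr) \
	for ((idx) = 0, (ptr) = (array)->list; (idx) < (array)->used; \
	     (ptr) = snd_array_elem(array, ++(idx)))

int main(void)
{
	int vals[] = { 10, 20, 30 };
	struct snd_array_demo arr = {
		.used = 3, .elem_size = sizeof(int), .list = vals,
	};
	const int *v;
	int i;

	/* replaces the old open-coded "for (i = 0; i < arr.used; i++)" loop */
	snd_array_for_each(&arr, i, v)
		printf("elem %d = %d\n", i, *v);
	return 0;
}

Note that the macro leaves ptr pointing one past the last element when the loop exits; every converted call site above only dereferences it inside the loop body, where it is always valid.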
From ccc3b2b3482c2c05d05fd2cfbf0c28d644b4b0c2 Mon Sep 17 00:00:00 2001
From: Daniel Vetter
Date: Thu, 5 Apr 2018 17:44:42 +0200
Subject: drm: Move simple_display_pipe prepare_fb helper into gem fb helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's nothing tinydrm-specific to this, and there are a few more
copies of the same in various other drivers.

Signed-off-by: Daniel Vetter
Cc: Gustavo Padovan
Cc: Maarten Lankhorst
Cc: Sean Paul
Cc: David Airlie
Cc: David Lechner
Cc: "Noralf Trønnes"
Cc: Daniel Vetter
Cc: Shawn Guo
Cc: Neil Armstrong
Cc: Daniel Stone
Cc: Haneen Mohammed
Cc: Ben Widawsky
Cc: "Ville Syrjälä"
Reviewed-by: Oleksandr Andrushchenko
Acked-by: David Lechner
Reviewed-by: Noralf Trønnes
Link: https://patchwork.freedesktop.org/patch/msgid/20180405154449.23038-3-daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_gem_framebuffer_helper.c | 19 +++++++++++++++++++
 drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c  | 17 -----------------
 drivers/gpu/drm/tinydrm/ili9225.c            |  2 +-
 drivers/gpu/drm/tinydrm/mi0283qt.c           |  3 ++-
 drivers/gpu/drm/tinydrm/repaper.c            |  2 +-
 drivers/gpu/drm/tinydrm/st7586.c             |  2 +-
 drivers/gpu/drm/tinydrm/st7735r.c            |  2 +-
 include/drm/drm_gem_framebuffer_helper.h     |  3 +++
 include/drm/drm_simple_kms_helper.h          |  3 +++
 include/drm/tinydrm/tinydrm.h                |  2 --
 10 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
index 4d682a6e8bcb..acfbc0641a06 100644
--- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c
+++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 /**
  * DOC: overview
@@ -265,6 +266,24 @@ int drm_gem_fb_prepare_fb(struct drm_plane *plane,
 }
 EXPORT_SYMBOL_GPL(drm_gem_fb_prepare_fb);
 
+/**
+ * drm_gem_fb_simple_display_pipe_prepare_fb - prepare_fb helper for
+ *     &drm_simple_display_pipe
+ * @pipe: Simple display pipe
+ * @plane_state: Plane state
+ *
+ * This function uses drm_gem_fb_prepare_fb() to check if the plane FB has a
+ * &dma_buf attached, extracts the exclusive fence and attaches it to plane
+ * state for the atomic helper to wait on. Drivers can use this as their
+ * &drm_simple_display_pipe_funcs.prepare_fb callback.
+ */
+int drm_gem_fb_simple_display_pipe_prepare_fb(struct drm_simple_display_pipe *pipe,
+					      struct drm_plane_state *plane_state)
+{
+	return drm_gem_fb_prepare_fb(&pipe->plane, plane_state);
+}
+EXPORT_SYMBOL(drm_gem_fb_simple_display_pipe_prepare_fb);
+
 /**
  * drm_gem_fbdev_fb_create - Create a GEM backed &drm_framebuffer for fbdev
  * emulation
diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c
index e68b528ae64d..7e8e24d0b7a7 100644
--- a/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c
+++ b/drivers/gpu/drm/tinydrm/core/tinydrm-pipe.c
@@ -138,23 +138,6 @@ void tinydrm_display_pipe_update(struct drm_simple_display_pipe *pipe,
 }
 EXPORT_SYMBOL(tinydrm_display_pipe_update);
 
-/**
- * tinydrm_display_pipe_prepare_fb - Display pipe prepare_fb helper
- * @pipe: Simple display pipe
- * @plane_state: Plane state
- *
- * This function uses drm_gem_fb_prepare_fb() to check if the plane FB has an
- * dma-buf attached, extracts the exclusive fence and attaches it to plane
- * state for the atomic helper to wait on. Drivers can use this as their
- * &drm_simple_display_pipe_funcs->prepare_fb callback.
- */
-int tinydrm_display_pipe_prepare_fb(struct drm_simple_display_pipe *pipe,
-				    struct drm_plane_state *plane_state)
-{
-	return drm_gem_fb_prepare_fb(&pipe->plane, plane_state);
-}
-EXPORT_SYMBOL(tinydrm_display_pipe_prepare_fb);
-
 static int tinydrm_rotate_mode(struct drm_display_mode *mode,
 			       unsigned int rotation)
 {
diff --git a/drivers/gpu/drm/tinydrm/ili9225.c b/drivers/gpu/drm/tinydrm/ili9225.c
index 0874e877b111..841c69aba059 100644
--- a/drivers/gpu/drm/tinydrm/ili9225.c
+++ b/drivers/gpu/drm/tinydrm/ili9225.c
@@ -354,7 +354,7 @@ static const struct drm_simple_display_pipe_funcs ili9225_pipe_funcs = {
 	.enable = ili9225_pipe_enable,
 	.disable = ili9225_pipe_disable,
 	.update = tinydrm_display_pipe_update,
-	.prepare_fb = tinydrm_display_pipe_prepare_fb,
+	.prepare_fb = drm_gem_fb_simple_display_pipe_prepare_fb,
 };
 
 static const struct drm_display_mode ili9225_mode = {
diff --git a/drivers/gpu/drm/tinydrm/mi0283qt.c b/drivers/gpu/drm/tinydrm/mi0283qt.c
index 4e6d2ee94e55..d5ef65179c16 100644
--- a/drivers/gpu/drm/tinydrm/mi0283qt.c
+++ b/drivers/gpu/drm/tinydrm/mi0283qt.c
@@ -19,6 +19,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
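With the helper moved into the core, a simple-pipe driver wires its prepare_fb hook straight to it instead of carrying a private copy. A hedged sketch for a hypothetical driver — the foo_* names are invented for illustration; only drm_simple_display_pipe_funcs and drm_gem_fb_simple_display_pipe_prepare_fb come from this patch:

#include <drm/drm_gem_framebuffer_helper.h>
#include <drm/drm_simple_kms_helper.h>

/* invented example hook: flush the updated plane contents to the device */
static void foo_pipe_update(struct drm_simple_display_pipe *pipe,
			    struct drm_plane_state *old_state)
{
	/* driver-specific damage handling would go here */
}

static const struct drm_simple_display_pipe_funcs foo_pipe_funcs = {
	.update     = foo_pipe_update,
	/* shared helper: attaches the FB's exclusive dma-buf fence, if any,
	 * to the plane state so the atomic helpers wait on it
	 */
	.prepare_fb = drm_gem_fb_simple_display_pipe_prepare_fb,
};

Keeping the fence plumbing in drm_gem_framebuffer_helper.c means the tinydrm drivers touched above (ili9225, mi0283qt, repaper, st7586, st7735r) all share one implementation rather than tinydrm re-exporting its own wrapper.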