From 0e4a4a08cd78efcaddbc2e4c5ed86b5a5cb8a15e Mon Sep 17 00:00:00 2001 From: Michal Vokáč Date: Tue, 13 Apr 2021 16:45:57 +0200 Subject: ARM: dts: imx6dl-yapp4: Fix RGMII connection to QCA8334 switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FEC does not have a PHY so it should not have a phy-handle. It is connected to the switch at RGMII level so we need a fixed-link sub-node on both ends. This was not a problem until the qca8k.c driver was converted to PHYLINK by commit b3591c2a3661 ("net: dsa: qca8k: Switch to PHYLINK instead of PHYLIB"). That commit revealed the FEC configuration was not correct. Fixes: 87489ec3a77f ("ARM: dts: imx: Add Y Soft IOTA Draco, Hydra and Ursa boards") Cc: stable@vger.kernel.org Signed-off-by: Michal Vokáč Reviewed-by: Andrew Lunn Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6dl-yapp4-common.dtsi | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi b/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi index 7d2c72562c73..9148a01ed6d9 100644 --- a/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi +++ b/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi @@ -105,9 +105,13 @@ phy-reset-gpios = <&gpio1 25 GPIO_ACTIVE_LOW>; phy-reset-duration = <20>; phy-supply = <&sw2_reg>; - phy-handle = <ðphy0>; status = "okay"; + fixed-link { + speed = <1000>; + full-duplex; + }; + mdio { #address-cells = <1>; #size-cells = <0>; -- cgit v1.2.3 From 8967b27a6c1c19251989c7ab33c058d16e4a5f53 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 26 Apr 2021 12:23:21 +0200 Subject: ARM: dts: imx6q-dhcom: Add PU,VDD1P1,VDD2P5 regulators Per schematic, both PU and SOC regulator are supplied from LTC3676 SW1 via VDDSOC_IN rail, add the PU input. Both VDD1P1, VDD2P5 are supplied from LTC3676 SW2 via VDDHIGH_IN rail, add both inputs. While no instability or problems are currently observed, the regulators should be fully described in DT and that description should fully match the hardware, else this might lead to unforseen issues later. Fix this. Fixes: 52c7a088badd ("ARM: dts: imx6q: Add support for the DHCOM iMX6 SoM and PDK2") Reviewed-by: Fabio Estevam Signed-off-by: Marek Vasut Cc: Christoph Niedermaier Cc: Fabio Estevam Cc: Ludwig Zenz Cc: NXP Linux Team Cc: Shawn Guo Cc: stable@vger.kernel.org Reviewed-by: Christoph Niedermaier Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6q-dhcom-som.dtsi | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx6q-dhcom-som.dtsi b/arch/arm/boot/dts/imx6q-dhcom-som.dtsi index 236fc205c389..d0768ae429fa 100644 --- a/arch/arm/boot/dts/imx6q-dhcom-som.dtsi +++ b/arch/arm/boot/dts/imx6q-dhcom-som.dtsi @@ -406,6 +406,18 @@ vin-supply = <&sw1_reg>; }; +®_pu { + vin-supply = <&sw1_reg>; +}; + +®_vdd1p1 { + vin-supply = <&sw2_reg>; +}; + +®_vdd2p5 { + vin-supply = <&sw2_reg>; +}; + &uart1 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_uart1>; -- cgit v1.2.3 From df61cd9393845383adc4ea2410f2a91e1d1972b6 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Fri, 23 Apr 2021 11:31:20 +0300 Subject: arm64: dts: ti: k3-am654-base-board: remove ov5640 AM654 EVM boards are not shipped with OV5640 sensor module, it is a separate purchase. OV5640 module is also just one of the possible sensors or capture boards you can connect. However, for some reason, OV5640 has been added to the board dts file, making it cumbersome to use other sensors. Remove the OV5640 from the dts file so that it is easy to use other sensors via DT overlays. Signed-off-by: Tomi Valkeinen Acked-by: Pratyush Yadav Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20210423083120.73476-1-tomi.valkeinen@ideasonboard.com --- arch/arm64/boot/dts/ti/k3-am654-base-board.dts | 31 -------------------------- 1 file changed, 31 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-am654-base-board.dts b/arch/arm64/boot/dts/ti/k3-am654-base-board.dts index 9e87fb313a54..eddb2ffb93ca 100644 --- a/arch/arm64/boot/dts/ti/k3-am654-base-board.dts +++ b/arch/arm64/boot/dts/ti/k3-am654-base-board.dts @@ -85,12 +85,6 @@ gpios = <&wkup_gpio0 27 GPIO_ACTIVE_LOW>; }; }; - - clk_ov5640_fixed: clock { - compatible = "fixed-clock"; - #clock-cells = <0>; - clock-frequency = <24000000>; - }; }; &wkup_pmx0 { @@ -287,23 +281,6 @@ pinctrl-names = "default"; pinctrl-0 = <&main_i2c1_pins_default>; clock-frequency = <400000>; - - ov5640: camera@3c { - compatible = "ovti,ov5640"; - reg = <0x3c>; - - clocks = <&clk_ov5640_fixed>; - clock-names = "xclk"; - - port { - csi2_cam0: endpoint { - remote-endpoint = <&csi2_phy0>; - clock-lanes = <0>; - data-lanes = <1 2>; - }; - }; - }; - }; &main_i2c2 { @@ -496,14 +473,6 @@ }; }; -&csi2_0 { - csi2_phy0: endpoint { - remote-endpoint = <&csi2_cam0>; - clock-lanes = <0>; - data-lanes = <1 2>; - }; -}; - &mcu_cpsw { pinctrl-names = "default"; pinctrl-0 = <&mcu_cpsw_pins_default &mcu_mdio_pins_default>; -- cgit v1.2.3 From 52ae30f55a2a40cff549fac95de82f25403bd387 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Mon, 10 May 2021 23:36:01 +0530 Subject: arm64: dts: ti: j7200-main: Mark Main NAVSS as dma-coherent Traffic through main NAVSS interconnect is coherent wrt ARM caches on J7200 SoC. Add missing dma-coherent property to main_navss node. Also add dma-ranges to be consistent with mcu_navss node and with AM65/J721e main_navss and mcu_navss nodes. Fixes: d361ed88455fe ("arm64: dts: ti: Add support for J7200 SoC") Signed-off-by: Vignesh Raghavendra Reviewed-by: Peter Ujfalusi Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20210510180601.19458-1-vigneshr@ti.com --- arch/arm64/boot/dts/ti/k3-j7200-main.dtsi | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi index f86c493a44f1..a6826f1888ef 100644 --- a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi @@ -85,6 +85,8 @@ #size-cells = <2>; ranges = <0x00 0x30000000 0x00 0x30000000 0x00 0x0c400000>; ti,sci-dev-id = <199>; + dma-coherent; + dma-ranges; main_navss_intr: interrupt-controller1 { compatible = "ti,sci-intr"; -- cgit v1.2.3 From a0812885fa7a1074c8003484b8176ffe28d5df68 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 10 May 2021 09:50:30 -0500 Subject: arm64: dts: ti: k3-*: Rename the TI-SCI clocks node name We currently use clocks as the node name for the node representing TI-SCI clock nodes. This is better renamed to being clock-controller as that is a better representative of the system controller function as a clock controller for the SoC. Signed-off-by: Nishanth Menon Reviewed-by: Tero Kristo Link: https://lore.kernel.org/r/20210510145033.7426-2-nm@ti.com --- arch/arm64/boot/dts/ti/k3-am64-main.dtsi | 2 +- arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi | 2 +- arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi | 2 +- arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi index b2bcbf23eefd..e1216073e3df 100644 --- a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi @@ -148,7 +148,7 @@ #power-domain-cells = <2>; }; - k3_clks: clocks { + k3_clks: clock-controller { compatible = "ti,k2g-sci-clk"; #clock-cells = <2>; }; diff --git a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi index ed42f13e7663..2ae1f9214b8a 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi @@ -23,7 +23,7 @@ #power-domain-cells = <2>; }; - k3_clks: clocks { + k3_clks: clock-controller { compatible = "ti,k2g-sci-clk"; #clock-cells = <2>; }; diff --git a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi index 5e74e43822c3..9dba2df3569f 100644 --- a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi @@ -23,7 +23,7 @@ #power-domain-cells = <2>; }; - k3_clks: clocks { + k3_clks: clock-controller { compatible = "ti,k2g-sci-clk"; #clock-cells = <2>; }; diff --git a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi index d56e3475aee7..b83801feeb10 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi @@ -23,7 +23,7 @@ #power-domain-cells = <2>; }; - k3_clks: clocks { + k3_clks: clock-controller { compatible = "ti,k2g-sci-clk"; #clock-cells = <2>; }; -- cgit v1.2.3 From 830454bbd628330c3779c3de637b709dae790da0 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 10 May 2021 09:50:31 -0500 Subject: arm64: dts: ti: k3-am65-wakeup: Add debug region to TI-SCI node Lets add the TISCI debug region to TI-SCI region in line with TI-SCI documentation[1]. While at it, lets rename the node to indicate the address usage. [1] http://downloads.ti.com/tisci/esd/latest/4_trace/trace.html Signed-off-by: Nishanth Menon Reviewed-by: Tero Kristo Link: https://lore.kernel.org/r/20210510145033.7426-3-nm@ti.com --- arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi index 2ae1f9214b8a..444842a2d556 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi @@ -6,7 +6,7 @@ */ &cbass_wakeup { - dmsc: dmsc { + dmsc: dmsc@44083000 { compatible = "ti,am654-sci"; ti,host-id = <12>; #address-cells = <1>; @@ -18,6 +18,9 @@ mboxes= <&secure_proxy_main 11>, <&secure_proxy_main 13>; + reg-names = "debug_messages"; + reg = <0x44083000 0x1000>; + k3_pds: power-controller { compatible = "ti,sci-pm-domain"; #power-domain-cells = <2>; -- cgit v1.2.3 From 421c06b8761abd7d953148f5b955b4149df9846e Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 10 May 2021 09:50:32 -0500 Subject: arm64: dts: ti: k3-am65-wakeup: Drop un-necessary properties from dmsc node The DMSC node does'nt require any of "#address-cells", "#size-cells" or "ranges" property as the child nodes are representations of SoC's system controller itself, so align it with the bindings. Signed-off-by: Nishanth Menon Reviewed-by: Tero Kristo Link: https://lore.kernel.org/r/20210510145033.7426-4-nm@ti.com --- arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi index 444842a2d556..80d4df775f43 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi @@ -9,9 +9,6 @@ dmsc: dmsc@44083000 { compatible = "ti,am654-sci"; ti,host-id = <12>; - #address-cells = <1>; - #size-cells = <1>; - ranges; mbox-names = "rx", "tx"; -- cgit v1.2.3 From 9d3c9378f96a95f15881ee3373d2c2f773273fc2 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 10 May 2021 09:50:33 -0500 Subject: arm64: dts: ti: k3-*: Rename the TI-SCI node Lets rename the node name of TI-SCI node to be system-controller as it is a better standardized name for the function that TI-SCI plays in the SoC. Signed-off-by: Nishanth Menon Reviewed-by: Tero Kristo Link: https://lore.kernel.org/r/20210510145033.7426-5-nm@ti.com --- arch/arm64/boot/dts/ti/k3-am64-main.dtsi | 2 +- arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi | 2 +- arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi | 2 +- arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi index e1216073e3df..dc52178d9b64 100644 --- a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi @@ -134,7 +134,7 @@ }; }; - dmsc: dmsc@44043000 { + dmsc: system-controller@44043000 { compatible = "ti,k2g-sci"; ti,host-id = <12>; mbox-names = "rx", "tx"; diff --git a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi index 80d4df775f43..822f4cff1db4 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi @@ -6,7 +6,7 @@ */ &cbass_wakeup { - dmsc: dmsc@44083000 { + dmsc: system-controller@44083000 { compatible = "ti,am654-sci"; ti,host-id = <12>; diff --git a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi index 9dba2df3569f..65f3fabda114 100644 --- a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi @@ -6,7 +6,7 @@ */ &cbass_mcu_wakeup { - dmsc: dmsc@44083000 { + dmsc: system-controller@44083000 { compatible = "ti,k2g-sci"; ti,host-id = <12>; diff --git a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi index b83801feeb10..94e821467eaa 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi @@ -6,7 +6,7 @@ */ &cbass_mcu_wakeup { - dmsc: dmsc@44083000 { + dmsc: system-controller@44083000 { compatible = "ti,k2g-sci"; ti,host-id = <12>; -- cgit v1.2.3 From 9ecdb6d6b11434494af4bad11b03f0dcda1eebbd Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 10 May 2021 09:54:29 -0500 Subject: arm64: dts: ti: k3-am65|j721e|am64: Map the dma / navigator subsystem via explicit ranges Instead of using empty ranges property, lets map explicitly the address range that is mapped onto the dma / navigator subsystems (navss/dmss). This is also exposed via the dtbs_check with dt-schema newer than 2021.03 version by throwing out following: arch/arm64/boot/dts/ti/k3-am654-base-board.dt.yaml: bus@100000: main-navss: {'type': 'object'} is not allowed for {'compatible': ['simple-mfd'], '#address-cells': [[2]], ..... This has already been correctly done for J7200, however was missed for other k3 SoCs. Fix that oversight. Signed-off-by: Nishanth Menon Reviewed-by: Tero Kristo Acked-by: Vignesh Raghavendra Link: https://lore.kernel.org/r/20210510145429.8752-1-nm@ti.com --- arch/arm64/boot/dts/ti/k3-am64-main.dtsi | 4 ++-- arch/arm64/boot/dts/ti/k3-am65-main.dtsi | 4 ++-- arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi | 4 ++-- arch/arm64/boot/dts/ti/k3-j721e-main.dtsi | 4 ++-- arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi index dc52178d9b64..d5dc05d4cba6 100644 --- a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi @@ -42,12 +42,12 @@ }; }; - dmss: dmss { + dmss: bus@48000000 { compatible = "simple-mfd"; #address-cells = <2>; #size-cells = <2>; dma-ranges; - ranges; + ranges = <0x00 0x48000000 0x00 0x48000000 0x00 0x06400000>; ti,sci-dev-id = <25>; diff --git a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi index cb340d1b401f..e160f22a1518 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi @@ -444,11 +444,11 @@ ti,interrupt-ranges = <0 392 32>; }; - main-navss { + main_navss: bus@30800000 { compatible = "simple-mfd"; #address-cells = <2>; #size-cells = <2>; - ranges; + ranges = <0x0 0x30800000 0x0 0x30800000 0x0 0xbc00000>; dma-coherent; dma-ranges; diff --git a/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi b/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi index 0388c02c2203..f5b8ef2f5f77 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi @@ -116,11 +116,11 @@ }; }; - mcu-navss { + mcu_navss: bus@28380000 { compatible = "simple-mfd"; #address-cells = <2>; #size-cells = <2>; - ranges; + ranges = <0x00 0x28380000 0x00 0x28380000 0x00 0x03880000>; dma-coherent; dma-ranges; diff --git a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi index c2aa45a3ac79..1a7b1cf7f794 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi @@ -87,11 +87,11 @@ ti,interrupt-ranges = <8 392 56>; }; - main-navss { + main_navss: bus@30000000 { compatible = "simple-mfd"; #address-cells = <2>; #size-cells = <2>; - ranges; + ranges = <0x00 0x30000000 0x00 0x30000000 0x00 0x0c400000>; dma-coherent; dma-ranges; diff --git a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi index 94e821467eaa..c85f98b81987 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi @@ -249,11 +249,11 @@ }; }; - mcu-navss { + mcu_navss: bus@28380000 { compatible = "simple-mfd"; #address-cells = <2>; #size-cells = <2>; - ranges; + ranges = <0x00 0x28380000 0x00 0x28380000 0x00 0x03880000>; dma-coherent; dma-ranges; -- cgit v1.2.3 From cab12badfc99f93c1dccf192dd150f94b687a27c Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Tue, 11 May 2021 14:48:21 -0500 Subject: arm64: dts: ti: k3*: Introduce reg definition for interrupt routers Interrupt routers are memory mapped peripherals, that are organized in our dts bus hierarchy to closely represents the actual hardware behavior. However, without explicitly calling out the reg property, using 2021.03+ dt-schema package, this exposes the following problem with dtbs_check: /arch/arm64/boot/dts/ti/k3-am654-base-board.dt.yaml: bus@100000: interrupt-controller0: {'type': 'object'} is not allowed for {'compatible': ['ti,sci-intr'], ..... Even though we don't use interrupt router directly via memory mapped registers and have to use it via the system controller, the hardware block is memory mapped, so describe the base address in device tree. This is a valid, comprehensive description of hardware and permitted by the existing ti,sci-intr schema. Reviewed-by: Tero Kristo Reviewed-by: Lokesh Vutla Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20210511194821.13919-1-nm@ti.com --- arch/arm64/boot/dts/ti/k3-am64-main.dtsi | 3 ++- arch/arm64/boot/dts/ti/k3-am64-mcu.dtsi | 3 ++- arch/arm64/boot/dts/ti/k3-am65-main.dtsi | 6 ++++-- arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi | 3 ++- arch/arm64/boot/dts/ti/k3-j7200-main.dtsi | 6 ++++-- arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi | 3 ++- arch/arm64/boot/dts/ti/k3-j721e-main.dtsi | 6 ++++-- arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi | 3 ++- 8 files changed, 22 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi index d5dc05d4cba6..ca59d1f711f8 100644 --- a/arch/arm64/boot/dts/ti/k3-am64-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am64-main.dtsi @@ -373,8 +373,9 @@ clocks = <&k3_clks 145 0>; }; - main_gpio_intr: interrupt-controller0 { + main_gpio_intr: interrupt-controller@a00000 { compatible = "ti,sci-intr"; + reg = <0x00 0x00a00000 0x00 0x800>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; diff --git a/arch/arm64/boot/dts/ti/k3-am64-mcu.dtsi b/arch/arm64/boot/dts/ti/k3-am64-mcu.dtsi index 99e94dee1bd4..deb19ae5e168 100644 --- a/arch/arm64/boot/dts/ti/k3-am64-mcu.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am64-mcu.dtsi @@ -74,8 +74,9 @@ clocks = <&k3_clks 148 0>; }; - mcu_gpio_intr: interrupt-controller1 { + mcu_gpio_intr: interrupt-controller@4210000 { compatible = "ti,sci-intr"; + reg = <0x00 0x04210000 0x00 0x200>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; diff --git a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi index e160f22a1518..6cd3131eb9ff 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi @@ -433,8 +433,9 @@ #phy-cells = <0>; }; - intr_main_gpio: interrupt-controller0 { + intr_main_gpio: interrupt-controller@a00000 { compatible = "ti,sci-intr"; + reg = <0x0 0x00a00000 0x0 0x400>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; @@ -454,8 +455,9 @@ ti,sci-dev-id = <118>; - intr_main_navss: interrupt-controller1 { + intr_main_navss: interrupt-controller@310e0000 { compatible = "ti,sci-intr"; + reg = <0x0 0x310e0000 0x0 0x2000>; ti,intr-trigger-type = <4>; interrupt-controller; interrupt-parent = <&gic500>; diff --git a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi index 822f4cff1db4..7cb864b4d74a 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi @@ -69,8 +69,9 @@ power-domains = <&k3_pds 115 TI_SCI_PD_EXCLUSIVE>; }; - intr_wkup_gpio: interrupt-controller2 { + intr_wkup_gpio: interrupt-controller@42200000 { compatible = "ti,sci-intr"; + reg = <0x42200000 0x200>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; diff --git a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi index a6826f1888ef..19fea8adbcff 100644 --- a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi @@ -68,8 +68,9 @@ }; }; - main_gpio_intr: interrupt-controller0 { + main_gpio_intr: interrupt-controller@a00000 { compatible = "ti,sci-intr"; + reg = <0x00 0x00a00000 0x00 0x800>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; @@ -88,8 +89,9 @@ dma-coherent; dma-ranges; - main_navss_intr: interrupt-controller1 { + main_navss_intr: interrupt-controller@310e0000 { compatible = "ti,sci-intr"; + reg = <0x00 0x310e0000 0x00 0x4000>; ti,intr-trigger-type = <4>; interrupt-controller; interrupt-parent = <&gic500>; diff --git a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi index 65f3fabda114..5663fe3ea466 100644 --- a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi @@ -96,8 +96,9 @@ clock-names = "fclk"; }; - wkup_gpio_intr: interrupt-controller2 { + wkup_gpio_intr: interrupt-controller@42200000 { compatible = "ti,sci-intr"; + reg = <0x00 0x42200000 0x00 0x400>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; diff --git a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi index 1a7b1cf7f794..3bcafe4c1742 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi @@ -76,8 +76,9 @@ }; }; - main_gpio_intr: interrupt-controller0 { + main_gpio_intr: interrupt-controller@a00000 { compatible = "ti,sci-intr"; + reg = <0x00 0x00a00000 0x00 0x800>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; @@ -97,8 +98,9 @@ ti,sci-dev-id = <199>; - main_navss_intr: interrupt-controller1 { + main_navss_intr: interrupt-controller@310e0000 { compatible = "ti,sci-intr"; + reg = <0x0 0x310e0000 0x0 0x4000>; ti,intr-trigger-type = <4>; interrupt-controller; interrupt-parent = <&gic500>; diff --git a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi index c85f98b81987..5e825e4d0306 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi @@ -96,8 +96,9 @@ clock-names = "fclk"; }; - wkup_gpio_intr: interrupt-controller2 { + wkup_gpio_intr: interrupt-controller@42200000 { compatible = "ti,sci-intr"; + reg = <0x00 0x42200000 0x00 0x400>; ti,intr-trigger-type = <1>; interrupt-controller; interrupt-parent = <&gic500>; -- cgit v1.2.3 From fcb8283920b135bca2916133e2383a501ad57eaa Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Tue, 27 Apr 2021 06:33:57 +0800 Subject: KVM: arm64: Fix boolreturn.cocci warnings arch/arm64/kvm/mmu.c:1114:9-10: WARNING: return of 0/1 in function 'kvm_age_gfn' with return type bool arch/arm64/kvm/mmu.c:1084:9-10: WARNING: return of 0/1 in function 'kvm_set_spte_gfn' with return type bool arch/arm64/kvm/mmu.c:1127:9-10: WARNING: return of 0/1 in function 'kvm_test_age_gfn' with return type bool arch/arm64/kvm/mmu.c:1070:9-10: WARNING: return of 0/1 in function 'kvm_unmap_gfn_range' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Fixes: cd4c71835228 ("KVM: arm64: Convert to the gfn-based MMU notifier callbacks") Reported-by: kernel test robot Signed-off-by: kernel test robot Reviewed-by: Sean Christopherson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210426223357.GA45871@cd4295a34ed8 --- arch/arm64/kvm/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c5d1f3c87dbd..c10207fed2f3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1156,13 +1156,13 @@ out_unlock: bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) - return 0; + return false; __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, range->may_block); - return 0; + return false; } bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1170,7 +1170,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) kvm_pfn_t pfn = pte_pfn(range->pte); if (!kvm->arch.mmu.pgt) - return 0; + return false; WARN_ON(range->end - range->start != 1); @@ -1190,7 +1190,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) PAGE_SIZE, __pfn_to_phys(pfn), KVM_PGTABLE_PROT_R, NULL); - return 0; + return false; } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1200,7 +1200,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t pte; if (!kvm->arch.mmu.pgt) - return 0; + return false; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); @@ -1213,7 +1213,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) - return 0; + return false; return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT); -- cgit v1.2.3 From eaa9b88dae64254a87d3d83b77afa71ee992f502 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 14 May 2021 08:56:39 +0000 Subject: KVM: arm64: Mark pkvm_pgtable_mm_ops static It is not used outside of setup.c, mark it static. Fixes:f320bc742bc2 ("KVM: arm64: Prepare the creation of s1 mappings at EL2") Reported-by: kernel test robot Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210514085640.3917886-2-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 7488f53b0aa2..a3d3a275344e 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -17,7 +17,6 @@ #include struct hyp_pool hpool; -struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; unsigned long hyp_nr_cpus; #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ @@ -27,6 +26,7 @@ static void *vmemmap_base; static void *hyp_pgt_base; static void *host_s2_mem_pgt_base; static void *host_s2_dev_pgt_base; +static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static int divide_memory_pool(void *virt, unsigned long size) { -- cgit v1.2.3 From 3fdc15fe8c6445175d61f0fac111d2ee9354e385 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 14 May 2021 08:56:40 +0000 Subject: KVM: arm64: Mark the host stage-2 memory pools static The host stage-2 memory pools are not used outside of mem_protect.c, mark them static. Fixes: 1025c8c0c6ac ("KVM: arm64: Wrap the host with a stage 2") Reported-by: kernel test robot Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210514085640.3917886-3-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index e342f7f4f4fb..4b60c0056c04 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -23,8 +23,8 @@ extern unsigned long hyp_nr_cpus; struct host_kvm host_kvm; -struct hyp_pool host_s2_mem; -struct hyp_pool host_s2_dev; +static struct hyp_pool host_s2_mem; +static struct hyp_pool host_s2_dev; /* * Copies of the host's CPU features registers holding sanitized values. -- cgit v1.2.3 From f5e30680616ab09e690b153b7a68ff7dd13e6579 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 6 May 2021 14:31:42 +0100 Subject: KVM: arm64: Move __adjust_pc out of line In order to make it easy to call __adjust_pc() from the EL1 code (in the case of nVHE), rename it to __kvm_adjust_pc() and move it out of line. No expected functional change. Reviewed-by: Alexandru Elisei Reviewed-by: Zenghui Yu Tested-by: Zenghui Yu Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org # 5.11 --- arch/arm64/include/asm/kvm_asm.h | 2 ++ arch/arm64/kvm/hyp/exception.c | 18 +++++++++++++++++- arch/arm64/kvm/hyp/include/hyp/adjust_pc.h | 18 ------------------ arch/arm64/kvm/hyp/nvhe/switch.c | 3 +-- arch/arm64/kvm/hyp/vhe/switch.c | 3 +-- 5 files changed, 21 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index cf8df032b9c3..d5b11037401d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -201,6 +201,8 @@ extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); +extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); + extern u64 __vgic_v3_get_gic_config(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 73629094f903..0812a496725f 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -296,7 +296,7 @@ static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) *vcpu_pc(vcpu) = vect_offset; } -void kvm_inject_exception(struct kvm_vcpu *vcpu) +static void kvm_inject_exception(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) { switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { @@ -329,3 +329,19 @@ void kvm_inject_exception(struct kvm_vcpu *vcpu) } } } + +/* + * Adjust the guest PC on entry, depending on flags provided by EL1 + * for the purpose of emulation (MMIO, sysreg) or exception injection. + */ +void __kvm_adjust_pc(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { + kvm_inject_exception(vcpu); + vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | + KVM_ARM64_EXCEPT_MASK); + } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + kvm_skip_instr(vcpu); + vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; + } +} diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h index 61716359035d..4fdfeabefeb4 100644 --- a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h +++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h @@ -13,8 +13,6 @@ #include #include -void kvm_inject_exception(struct kvm_vcpu *vcpu); - static inline void kvm_skip_instr(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { @@ -43,22 +41,6 @@ static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); } -/* - * Adjust the guest PC on entry, depending on flags provided by EL1 - * for the purpose of emulation (MMIO, sysreg) or exception injection. - */ -static inline void __adjust_pc(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { - kvm_inject_exception(vcpu); - vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_EXCEPT_MASK); - } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { - kvm_skip_instr(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; - } -} - /* * Skip an instruction while host sysregs are live. * Assumes host is always 64-bit. diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index e9f6ea704d07..f7af9688c1f7 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -4,7 +4,6 @@ * Author: Marc Zyngier */ -#include #include #include @@ -201,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ __debug_save_host_buffers_nvhe(vcpu); - __adjust_pc(vcpu); + __kvm_adjust_pc(vcpu); /* * We must restore the 32-bit state before the sysregs, thanks diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 7b8f7db5c1ed..b3229924d243 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -4,7 +4,6 @@ * Author: Marc Zyngier */ -#include #include #include @@ -132,7 +131,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __load_guest_stage2(vcpu->arch.hw_mmu); __activate_traps(vcpu); - __adjust_pc(vcpu); + __kvm_adjust_pc(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); -- cgit v1.2.3 From 26778aaa134a9aefdf5dbaad904054d7be9d656d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 6 May 2021 15:20:12 +0100 Subject: KVM: arm64: Commit pending PC adjustemnts before returning to userspace KVM currently updates PC (and the corresponding exception state) using a two phase approach: first by setting a set of flags, then by converting these flags into a state update when the vcpu is about to enter the guest. However, this creates a disconnect with userspace if the vcpu thread returns there with any exception/PC flag set. In this case, the exposed context is wrong, as userspace doesn't have access to these flags (they aren't architectural). It also means that these flags are preserved across a reset, which isn't expected. To solve this problem, force an explicit synchronisation of the exception state on vcpu exit to userspace. As an optimisation for nVHE systems, only perform this when there is something pending. Reported-by: Zenghui Yu Reviewed-by: Alexandru Elisei Reviewed-by: Zenghui Yu Tested-by: Zenghui Yu Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org # 5.11 --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/arm.c | 11 +++++++++++ arch/arm64/kvm/hyp/exception.c | 4 ++-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 ++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index d5b11037401d..5e9b33cbac51 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -63,6 +63,7 @@ #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 #define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 +#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1cb39c0803a4..1126eae27400 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -897,6 +897,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_sigset_deactivate(vcpu); + /* + * In the unlikely event that we are returning to userspace + * with pending exceptions or PC adjustment, commit these + * adjustments in order to give userspace a consistent view of + * the vcpu state. Note that this relies on __kvm_adjust_pc() + * being preempt-safe on VHE. + */ + if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION | + KVM_ARM64_INCREMENT_PC))) + kvm_call_hyp(__kvm_adjust_pc, vcpu); + vcpu_put(vcpu); return ret; } diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 0812a496725f..11541b94b328 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -331,8 +331,8 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) } /* - * Adjust the guest PC on entry, depending on flags provided by EL1 - * for the purpose of emulation (MMIO, sysreg) or exception injection. + * Adjust the guest PC (and potentially exception state) depending on + * flags provided by the emulation code. */ void __kvm_adjust_pc(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f36420a80474..1632f001f4ed 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -28,6 +28,13 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); } +static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + __kvm_adjust_pc(kern_hyp_va(vcpu)); +} + static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) { __kvm_flush_vm_context(); @@ -170,6 +177,7 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_vcpu_run), + HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), -- cgit v1.2.3 From cb853ded1d25e5b026ce115dbcde69e3d7e2e831 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 14 May 2021 09:05:41 +0100 Subject: KVM: arm64: Fix debug register indexing Commit 03fdfb2690099 ("KVM: arm64: Don't write junk to sysregs on reset") flipped the register number to 0 for all the debug registers in the sysreg table, hereby indicating that these registers live in a separate shadow structure. However, the author of this patch failed to realise that all the accessors are using that particular index instead of the register encoding, resulting in all the registers hitting index 0. Not quite a valid implementation of the architecture... Address the issue by fixing all the accessors to use the CRm field of the encoding, which contains the debug register index. Fixes: 03fdfb2690099 ("KVM: arm64: Don't write junk to sysregs on reset") Reported-by: Ricardo Koller Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org --- arch/arm64/kvm/sys_regs.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 76ea2800c33e..1a7968ad078c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -399,14 +399,14 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -414,7 +414,7 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -424,7 +424,7 @@ static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -434,21 +434,21 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; } static bool trap_bcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -456,7 +456,7 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -467,7 +467,7 @@ static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -477,22 +477,22 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; } static bool trap_wvr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]); + trace_trap_reg(__func__, rd->CRm, p->is_write, + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); return true; } @@ -500,7 +500,7 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -510,7 +510,7 @@ static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -520,21 +520,21 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; } static bool trap_wcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -542,7 +542,7 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -552,7 +552,7 @@ static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -562,7 +562,7 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; } static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) -- cgit v1.2.3 From 3c4e0147c269738a19c7d70cd32395600bcc0714 Mon Sep 17 00:00:00 2001 From: Maciej Falkowski Date: Thu, 1 Apr 2021 18:11:27 +0200 Subject: ARM: OMAP1: Fix use of possibly uninitialized irq variable The current control flow of IRQ number assignment to `irq` variable allows a request of IRQ of unspecified value, generating a warning under Clang compilation with omap1_defconfig on linux-next: arch/arm/mach-omap1/pm.c:656:11: warning: variable 'irq' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] else if (cpu_is_omap16xx()) ^~~~~~~~~~~~~~~~~ ./arch/arm/mach-omap1/include/mach/soc.h:123:30: note: expanded from macro 'cpu_is_omap16xx' ^~~~~~~~~~~~~ arch/arm/mach-omap1/pm.c:658:18: note: uninitialized use occurs here if (request_irq(irq, omap_wakeup_interrupt, 0, "peripheral wakeup", ^~~ arch/arm/mach-omap1/pm.c:656:7: note: remove the 'if' if its condition is always true else if (cpu_is_omap16xx()) ^~~~~~~~~~~~~~~~~~~~~~ arch/arm/mach-omap1/pm.c:611:9: note: initialize the variable 'irq' to silence this warning int irq; ^ = 0 1 warning generated. The patch provides a default value to the `irq` variable along with a validity check. Signed-off-by: Maciej Falkowski Link: https://github.com/ClangBuiltLinux/linux/issues/1324 Signed-off-by: Tony Lindgren --- arch/arm/mach-omap1/pm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index 2c1e2b32b9b3..a745d64d4699 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -655,9 +655,13 @@ static int __init omap_pm_init(void) irq = INT_7XX_WAKE_UP_REQ; else if (cpu_is_omap16xx()) irq = INT_1610_WAKE_UP_REQ; - if (request_irq(irq, omap_wakeup_interrupt, 0, "peripheral wakeup", - NULL)) - pr_err("Failed to request irq %d (peripheral wakeup)\n", irq); + else + irq = -1; + + if (irq >= 0) { + if (request_irq(irq, omap_wakeup_interrupt, 0, "peripheral wakeup", NULL)) + pr_err("Failed to request irq %d (peripheral wakeup)\n", irq); + } /* Program new power ramp-up time * (0 for most boards since we don't lower voltage when in deep sleep) -- cgit v1.2.3 From 7c302314f37b44595f180198fca5ca646bce4a5f Mon Sep 17 00:00:00 2001 From: Maciej Falkowski Date: Thu, 1 Apr 2021 18:20:32 +0200 Subject: ARM: OMAP1: isp1301-omap: Add missing gpiod_add_lookup_table function The gpiod table was added without any usage making it unused as reported by Clang compilation from omap1_defconfig on linux-next: arch/arm/mach-omap1/board-h2.c:347:34: warning: unused variable 'isp1301_gpiod_table' [-Wunused-variable] static struct gpiod_lookup_table isp1301_gpiod_table = { ^ 1 warning generated. The patch adds the missing gpiod_add_lookup_table() function. Signed-off-by: Maciej Falkowski Fixes: f3ef38160e3d ("usb: isp1301-omap: Convert to use GPIO descriptors") Link: https://github.com/ClangBuiltLinux/linux/issues/1325 Signed-off-by: Tony Lindgren --- arch/arm/mach-omap1/board-h2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mach-omap1/board-h2.c b/arch/arm/mach-omap1/board-h2.c index c40cf5ef8607..977b0b744c22 100644 --- a/arch/arm/mach-omap1/board-h2.c +++ b/arch/arm/mach-omap1/board-h2.c @@ -320,7 +320,7 @@ static int tps_setup(struct i2c_client *client, void *context) { if (!IS_BUILTIN(CONFIG_TPS65010)) return -ENOSYS; - + tps65010_config_vregs1(TPS_LDO2_ENABLE | TPS_VLDO2_3_0V | TPS_LDO1_ENABLE | TPS_VLDO1_3_0V); @@ -394,6 +394,8 @@ static void __init h2_init(void) BUG_ON(gpio_request(H2_NAND_RB_GPIO_PIN, "NAND ready") < 0); gpio_direction_input(H2_NAND_RB_GPIO_PIN); + gpiod_add_lookup_table(&isp1301_gpiod_table); + omap_cfg_reg(L3_1610_FLASH_CS2B_OE); omap_cfg_reg(M8_1610_FLASH_CS2B_WE); -- cgit v1.2.3 From 040ab72ee10ea88e1883ad143b3e2b77596abc31 Mon Sep 17 00:00:00 2001 From: Yongqiang Liu Date: Thu, 1 Apr 2021 13:15:33 +0000 Subject: ARM: OMAP2+: Fix build warning when mmc_omap is not built GCC reports the following warning with W=1: arch/arm/mach-omap2/board-n8x0.c:325:19: warning: variable 'index' set but not used [-Wunused-but-set-variable] 325 | int bit, *openp, index; | ^~~~~ Fix this by moving CONFIG_MMC_OMAP to cover the rest codes in the n8x0_mmc_callback(). Signed-off-by: Yongqiang Liu Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 418a61ecb827..5e86145db0e2 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -322,6 +322,7 @@ static int n8x0_mmc_get_cover_state(struct device *dev, int slot) static void n8x0_mmc_callback(void *data, u8 card_mask) { +#ifdef CONFIG_MMC_OMAP int bit, *openp, index; if (board_is_n800()) { @@ -339,7 +340,6 @@ static void n8x0_mmc_callback(void *data, u8 card_mask) else *openp = 0; -#ifdef CONFIG_MMC_OMAP omap_mmc_notify_cover_event(mmc_device, index, *openp); #else pr_warn("MMC: notify cover event not available\n"); -- cgit v1.2.3 From 76d0fc5e9bc650766a90cc3ffd2a29248df0f020 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 18 May 2021 10:08:37 +0100 Subject: arm64: Fix stale link in the arch_counter_enforce_ordering() comment With infradead.org archives gone, update the link to lore.kernel.org as these links are deemed stable. Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/barrier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 2175ec0004ed..451e11e5fd23 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -74,7 +74,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long idx, * This insanity brought to you by speculative system register reads, * out-of-order memory accesses, sequence locks and Thomas Gleixner. * - * http://lists.infradead.org/pipermail/linux-arm-kernel/2019-February/631195.html + * https://lore.kernel.org/r/alpine.DEB.2.21.1902081950260.1662@nanos.tec.linutronix.de/ */ #define arch_counter_enforce_ordering(val) do { \ u64 tmp, _val = (val); \ -- cgit v1.2.3 From dabea675faf16e8682aa478ff3ce65dd775620bc Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 8 Apr 2021 13:02:18 +0200 Subject: arm64: dts: ls1028a: fix memory node While enabling EDAC support for the LS1028A it was discovered that the memory node has a wrong endianness setting as well as a wrong interrupt assignment. Fix both. This was tested on a sl28 board. To force ECC errors, you can use the error injection supported by the controller in hardware (with CONFIG_EDAC_DEBUG enabled): # enable error injection $ echo 0x100 > /sys/devices/system/edac/mc/mc0/inject_ctrl # flip lowest bit of the data $ echo 0x1 > /sys/devices/system/edac/mc/mc0/inject_data_lo Fixes: 8897f3255c9c ("arm64: dts: Add support for NXP LS1028A SoC") Signed-off-by: Michael Walle Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index eca06a0c3cf8..a30249ebffa8 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -197,8 +197,8 @@ ddr: memory-controller@1080000 { compatible = "fsl,qoriq-memory-controller"; reg = <0x0 0x1080000 0x0 0x1000>; - interrupts = ; - big-endian; + interrupts = ; + little-endian; }; dcfg: syscon@1e00000 { -- cgit v1.2.3 From eac2f3059e02382d91f8c887462083841d6ea2a3 Mon Sep 17 00:00:00 2001 From: Chen Huang Date: Thu, 29 Apr 2021 07:03:48 +0000 Subject: riscv: stacktrace: fix the riscv stacktrace when CONFIG_FRAME_POINTER enabled As [1] and [2] said, the arch_stack_walk should not to trace itself, or it will leave the trace unexpectedly when called. The example is when we do "cat /sys/kernel/debug/page_owner", all pages' stack is the same. arch_stack_walk+0x18/0x20 stack_trace_save+0x40/0x60 register_dummy_stack+0x24/0x5e init_page_owner+0x2e So we use __builtin_frame_address(1) as the first frame to be walked. And mark the arch_stack_walk() noinline. We found that pr_cont will affact pages' stack whose task state is RUNNING when testing "echo t > /proc/sysrq-trigger". So move the place of pr_cont and mark the function dump_backtrace() noinline. Also we move the case when task == NULL into else branch, and test for it in "echo c > /proc/sysrq-trigger". [1] https://lore.kernel.org/lkml/20210319184106.5688-1-mark.rutland@arm.com/ [2] https://lore.kernel.org/lkml/20210317142050.57712-1-chenjun102@huawei.com/ Signed-off-by: Chen Huang Fixes: 5d8544e2d007 ("RISC-V: Generic library routines and assembly") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/stacktrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 2b3e0cb90d78..bde85fc53357 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -27,10 +27,10 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = frame_pointer(regs); sp = user_stack_pointer(regs); pc = instruction_pointer(regs); - } else if (task == NULL || task == current) { - fp = (unsigned long)__builtin_frame_address(0); - sp = sp_in_global; - pc = (unsigned long)walk_stackframe; + } else if (task == current) { + fp = (unsigned long)__builtin_frame_address(1); + sp = (unsigned long)__builtin_frame_address(0); + pc = (unsigned long)__builtin_return_address(0); } else { /* task blocked in __switch_to */ fp = task->thread.s[0]; @@ -106,15 +106,15 @@ static bool print_trace_address(void *arg, unsigned long pc) return true; } -void dump_backtrace(struct pt_regs *regs, struct task_struct *task, +noinline void dump_backtrace(struct pt_regs *regs, struct task_struct *task, const char *loglvl) { - pr_cont("%sCall Trace:\n", loglvl); walk_stackframe(task, regs, print_trace_address, (void *)loglvl); } void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { + pr_cont("%sCall Trace:\n", loglvl); dump_backtrace(NULL, task, loglvl); } @@ -139,7 +139,7 @@ unsigned long get_wchan(struct task_struct *task) #ifdef CONFIG_STACKTRACE -void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, +noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { walk_stackframe(task, regs, consume_entry, cookie); -- cgit v1.2.3 From 97a031082320897ee5b06352d0ab3d7cf47321d3 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 7 May 2021 17:47:15 +0800 Subject: riscv: Select ARCH_USE_MEMTEST As of commit dce44566192e ("mm/memtest: add ARCH_USE_MEMTEST"), architectures must select ARCH_USE_MEMTESET to enable CONFIG_MEMTEST. Signed-off-by: Kefeng Wang Fixes: f6e5aedf470b ("riscv: Add support for memtest") Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a8ad8eb76120..c5914e70a0fd 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -34,6 +34,7 @@ config RISCV select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_SUPPORTS_HUGETLBFS if MMU + select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if 64BIT -- cgit v1.2.3 From e98d98028989e023e0cbff539dc616c4e5036839 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 7 May 2021 21:44:39 +0200 Subject: arm64: dts: zii-ultra: remove second GEN_3V3 regulator instance When adding the sound support a second instance of the GEN_3V3 regulator was added by accident. Remove it and point the consumers to the first instance. Fixes: 663a5b5efa51 ("arm64: dts: zii-ultra: add sound support") Signed-off-by: Lucas Stach Signed-off-by: Shawn Guo --- .../boot/dts/freescale/imx8mq-zii-ultra-rmb3.dts | 10 +++++----- arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi | 19 +++++-------------- 2 files changed, 10 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra-rmb3.dts b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra-rmb3.dts index 631e01c1b9fd..be1e7d6f0ecb 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra-rmb3.dts +++ b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra-rmb3.dts @@ -88,11 +88,11 @@ pinctrl-0 = <&pinctrl_codec2>; reg = <0x18>; #sound-dai-cells = <0>; - HPVDD-supply = <®_3p3v>; - SPRVDD-supply = <®_3p3v>; - SPLVDD-supply = <®_3p3v>; - AVDD-supply = <®_3p3v>; - IOVDD-supply = <®_3p3v>; + HPVDD-supply = <®_gen_3p3>; + SPRVDD-supply = <®_gen_3p3>; + SPLVDD-supply = <®_gen_3p3>; + AVDD-supply = <®_gen_3p3>; + IOVDD-supply = <®_gen_3p3>; DVDD-supply = <&vgen4_reg>; reset-gpios = <&gpio3 4 GPIO_ACTIVE_HIGH>; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi index 4dc8383478ee..1e5d34e81ab7 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi @@ -77,15 +77,6 @@ regulator-always-on; }; - reg_3p3v: regulator-3p3v { - compatible = "regulator-fixed"; - vin-supply = <®_3p3_main>; - regulator-name = "GEN_3V3"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - regulator-always-on; - }; - reg_usdhc2_vmmc: regulator-vsd-3v3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_reg_usdhc2>; @@ -415,11 +406,11 @@ pinctrl-0 = <&pinctrl_codec1>; reg = <0x18>; #sound-dai-cells = <0>; - HPVDD-supply = <®_3p3v>; - SPRVDD-supply = <®_3p3v>; - SPLVDD-supply = <®_3p3v>; - AVDD-supply = <®_3p3v>; - IOVDD-supply = <®_3p3v>; + HPVDD-supply = <®_gen_3p3>; + SPRVDD-supply = <®_gen_3p3>; + SPLVDD-supply = <®_gen_3p3>; + AVDD-supply = <®_gen_3p3>; + IOVDD-supply = <®_gen_3p3>; DVDD-supply = <&vgen4_reg>; reset-gpios = <&gpio3 3 GPIO_ACTIVE_LOW>; }; -- cgit v1.2.3 From ac0cbf9d13dccfd09bebc2f8f5697b6d3ffe27c4 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 7 May 2021 21:44:40 +0200 Subject: arm64: dts: zii-ultra: fix 12V_MAIN voltage As this is a fixed regulator on the board there was no harm in the wrong voltage being specified, apart from a confusing reporting to userspace. Fixes: 4a13b3bec3b4 ("arm64: dts: imx: add Zii Ultra board support") Signed-off-by: Lucas Stach Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi index 1e5d34e81ab7..a08a568c31d9 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi @@ -45,8 +45,8 @@ reg_12p0_main: regulator-12p0-main { compatible = "regulator-fixed"; regulator-name = "12V_MAIN"; - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; + regulator-min-microvolt = <12000000>; + regulator-max-microvolt = <12000000>; regulator-always-on; }; -- cgit v1.2.3 From 779b56bb679767712761a79232331f8519402e75 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 8 May 2021 13:03:19 -0300 Subject: ARM: imx: pm-imx27: Include "common.h" Since commit 879c0e5e0ac7 ("ARM: imx: Remove i.MX27 board files") the following W=1 build warning is seen: arch/arm/mach-imx/pm-imx27.c:40:13: warning: no previous prototype for 'imx27_pm_init' [-Wmissing-prototypes] Fix it by including the "common.h" header file, which contains the prototype for imx27_pm_init(). Fixes: 879c0e5e0ac7 ("ARM: imx: Remove i.MX27 board files") Reported-by: kernel test robot Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/mach-imx/pm-imx27.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/mach-imx/pm-imx27.c b/arch/arm/mach-imx/pm-imx27.c index 020e6deb67c8..237e8aa9fe83 100644 --- a/arch/arm/mach-imx/pm-imx27.c +++ b/arch/arm/mach-imx/pm-imx27.c @@ -12,6 +12,7 @@ #include #include +#include "common.h" #include "hardware.h" static int mx27_suspend_enter(suspend_state_t state) -- cgit v1.2.3 From 02ccdeed1817a587161ad091887e11ac8a2586b2 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sat, 8 May 2021 23:43:47 +0800 Subject: riscv: kprobes: Fix build error when MMU=n lkp reported a randconfig failure: arch/riscv/kernel/probes/kprobes.c:90:22: error: use of undeclared identifier 'PAGE_KERNEL_READ_EXEC' We implemented the alloc_insn_page() to allocate PAGE_KERNEL_READ_EXEC page for kprobes insn page for STRICT_MODULE_RWX. But if MMU=n, we should fall back to the generic weak alloc_insn_page() by generic kprobe subsystem. Fixes: cdd1b2bd358f ("riscv: kprobes: Implement alloc_insn_page()") Signed-off-by: Jisheng Zhang Reported-by: kernel test robot Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/kprobes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 10b965c34536..15cc65ac7ca6 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -84,6 +84,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } +#ifdef CONFIG_MMU void *alloc_insn_page(void) { return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, @@ -91,6 +92,7 @@ void *alloc_insn_page(void) VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, __builtin_return_address(0)); } +#endif /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) -- cgit v1.2.3 From bab0d47c0ebb50ae0bcfa4e84986a60113bf7d6b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 9 May 2021 00:44:43 +0800 Subject: riscv: kexec: Fix W=1 build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 build warning(s): In file included from include/linux/kexec.h:28, from arch/riscv/kernel/machine_kexec.c:7: arch/riscv/include/asm/kexec.h:45:1: warning: ‘extern’ is not at beginning of declaration [-Wold-style-declaration] 45 | const extern unsigned char riscv_kexec_relocate[]; | ^~~~~ arch/riscv/include/asm/kexec.h:46:1: warning: ‘extern’ is not at beginning of declaration [-Wold-style-declaration] 46 | const extern unsigned int riscv_kexec_relocate_size; | ^~~~~ arch/riscv/kernel/machine_kexec.c:125:6: warning: no previous prototype for ‘machine_shutdown’ [-Wmissing-prototypes] 125 | void machine_shutdown(void) | ^~~~~~~~~~~~~~~~ arch/riscv/kernel/machine_kexec.c:147:1: warning: no previous prototype for ‘machine_crash_shutdown’ [-Wmissing-prototypes] 147 | machine_crash_shutdown(struct pt_regs *regs) | ^~~~~~~~~~~~~~~~~~~~~~ arch/riscv/kernel/machine_kexec.c:23: warning: Function parameter or member 'image' not described in 'kexec_image_info' arch/riscv/kernel/machine_kexec.c:53: warning: Function parameter or member 'image' not described in 'machine_kexec_prepare' arch/riscv/kernel/machine_kexec.c:114: warning: Function parameter or member 'image' not described in 'machine_kexec_cleanup' arch/riscv/kernel/machine_kexec.c:148: warning: Function parameter or member 'regs' not described in 'machine_crash_shutdown' arch/riscv/kernel/machine_kexec.c:167: warning: Function parameter or member 'image' not described in 'machine_kexec' Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kexec.h | 4 ++-- arch/riscv/kernel/machine_kexec.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h index 1e954101906a..e4e291d40759 100644 --- a/arch/riscv/include/asm/kexec.h +++ b/arch/riscv/include/asm/kexec.h @@ -42,8 +42,8 @@ struct kimage_arch { unsigned long fdt_addr; }; -const extern unsigned char riscv_kexec_relocate[]; -const extern unsigned int riscv_kexec_relocate_size; +extern const unsigned char riscv_kexec_relocate[]; +extern const unsigned int riscv_kexec_relocate_size; typedef void (*riscv_kexec_method)(unsigned long first_ind_entry, unsigned long jump_addr, diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c index cc048143fba5..9e99e1db156b 100644 --- a/arch/riscv/kernel/machine_kexec.c +++ b/arch/riscv/kernel/machine_kexec.c @@ -14,8 +14,9 @@ #include /* For set_memory_x() */ #include /* For unreachable() */ #include /* For cpu_down() */ +#include -/** +/* * kexec_image_info - Print received image details */ static void @@ -39,7 +40,7 @@ kexec_image_info(const struct kimage *image) } } -/** +/* * machine_kexec_prepare - Initialize kexec * * This function is called from do_kexec_load, when the user has @@ -100,7 +101,7 @@ machine_kexec_prepare(struct kimage *image) } -/** +/* * machine_kexec_cleanup - Cleanup any leftovers from * machine_kexec_prepare * @@ -135,7 +136,7 @@ void machine_shutdown(void) #endif } -/** +/* * machine_crash_shutdown - Prepare to kexec after a kernel crash * * This function is called by crash_kexec just before machine_kexec @@ -151,7 +152,7 @@ machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -/** +/* * machine_kexec - Jump to the loaded kimage * * This function is called by kernel_kexec which is called by the -- cgit v1.2.3 From 25201269c6ec3e9398426962ccdd55428261f7d0 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 14 May 2021 20:55:52 +0200 Subject: arm64: dts: freescale: sl28: var4: fix RGMII clock and voltage During hardware validation it was noticed that the clock isn't continuously enabled when there is no link. This is because the 125MHz clock is derived from the internal PLL which seems to go into some kind of power-down mode every once in a while. The LS1028A expects a contiuous clock. Thus enable the PLL all the time. Also, the RGMII pad voltage is wrong. It was configured to 2.5V (that is the VDDH regulator). The correct voltage is 1.8V, i.e. the VDDIO regulator. This fix is for the freescale/fsl-ls1028a-kontron-sl28-var4.dts. Fixes: 815364d0424e ("arm64: dts: freescale: add Kontron sl28 support") Signed-off-by: Michael Walle Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts index df212ed5bb94..e65d1c477e2c 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts @@ -31,11 +31,10 @@ reg = <0x4>; eee-broken-1000t; eee-broken-100tx; - qca,clk-out-frequency = <125000000>; qca,clk-out-strength = ; - - vddio-supply = <&vddh>; + qca,keep-pll-enabled; + vddio-supply = <&vddio>; vddio: vddio-regulator { regulator-name = "VDDIO"; -- cgit v1.2.3 From 52387bb9a4a75b88887383cb91d3995ae6f4044a Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 14 May 2021 20:55:53 +0200 Subject: arm64: dts: freescale: sl28: var1: fix RGMII clock and voltage During hardware validation it was noticed that the clock isn't continuously enabled when there is no link. This is because the 125MHz clock is derived from the internal PLL which seems to go into some kind of power-down mode every once in a while. The LS1028A expects a contiuous clock. Thus enable the PLL all the time. Also, the RGMII pad voltage is wrong, it was configured to 2.5V (that is the VDDH regulator). The correct voltage is 1.8V, i.e. the VDDIO regulator. This fix is for the freescale/fsl-ls1028a-kontron-sl28-var1.dts. Fixes: 642856097c18 ("arm64: dts: freescale: sl28: add variant 1") Signed-off-by: Michael Walle Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts index 6c309b97587d..e8d31279b7a3 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts @@ -46,7 +46,8 @@ eee-broken-100tx; qca,clk-out-frequency = <125000000>; qca,clk-out-strength = ; - vddio-supply = <&vddh>; + qca,keep-pll-enabled; + vddio-supply = <&vddio>; vddio: vddio-regulator { regulator-name = "VDDIO"; -- cgit v1.2.3 From 7c8f0338cdacc90fdf6468adafa8e27952987f00 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 20 May 2021 18:42:12 -0300 Subject: ARM: dts: imx7d-meerkat96: Fix the 'tuning-step' property According to Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml, the correct name of the property is 'fsl,tuning-step'. Fix it accordingly. Signed-off-by: Fabio Estevam Fixes: ae7b3384b61b ("ARM: dts: Add support for 96Boards Meerkat96 board") Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx7d-meerkat96.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx7d-meerkat96.dts b/arch/arm/boot/dts/imx7d-meerkat96.dts index 5339210b63d0..dd8003bd1fc0 100644 --- a/arch/arm/boot/dts/imx7d-meerkat96.dts +++ b/arch/arm/boot/dts/imx7d-meerkat96.dts @@ -193,7 +193,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usdhc1>; keep-power-in-suspend; - tuning-step = <2>; + fsl,tuning-step = <2>; vmmc-supply = <®_3p3v>; no-1-8-v; broken-cd; -- cgit v1.2.3 From 0e2fa4959c4f44815ce33e46e4054eeb0f346053 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 20 May 2021 18:42:13 -0300 Subject: ARM: dts: imx7d-pico: Fix the 'tuning-step' property According to Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml, the correct name of the property is 'fsl,tuning-step'. Fix it accordingly. Signed-off-by: Fabio Estevam Fixes: f13f571ac8a1 ("ARM: dts: imx7d-pico: Extend peripherals support") Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx7d-pico.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx7d-pico.dtsi b/arch/arm/boot/dts/imx7d-pico.dtsi index e57da0d32b98..e519897fae08 100644 --- a/arch/arm/boot/dts/imx7d-pico.dtsi +++ b/arch/arm/boot/dts/imx7d-pico.dtsi @@ -351,7 +351,7 @@ pinctrl-2 = <&pinctrl_usdhc1_200mhz>; cd-gpios = <&gpio5 0 GPIO_ACTIVE_LOW>; bus-width = <4>; - tuning-step = <2>; + fsl,tuning-step = <2>; vmmc-supply = <®_3p3v>; wakeup-source; no-1-8-v; -- cgit v1.2.3 From b73eb6b3b91ff7d76cff5f8c7ab92fe0c51e3829 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 21 May 2021 09:54:07 +0200 Subject: ARM: dts: imx: emcon-avari: Fix nxp,pca8574 #gpio-cells According to the DT bindings, #gpio-cells must be two. Fixes: 63e71fedc07c4ece ("ARM: dts: Add support for emtrion emCON-MX6 series") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6qdl-emcon-avari.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx6qdl-emcon-avari.dtsi b/arch/arm/boot/dts/imx6qdl-emcon-avari.dtsi index 828cf3e39784..c4e146f3341b 100644 --- a/arch/arm/boot/dts/imx6qdl-emcon-avari.dtsi +++ b/arch/arm/boot/dts/imx6qdl-emcon-avari.dtsi @@ -126,7 +126,7 @@ compatible = "nxp,pca8574"; reg = <0x3a>; gpio-controller; - #gpio-cells = <1>; + #gpio-cells = <2>; }; }; -- cgit v1.2.3 From 778a136e48be6b1b703328a0a4d6d459cf97449f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 18 May 2021 16:43:35 +0200 Subject: KVM: SVM: Drop unneeded CONFIG_X86_LOCAL_APIC check AVIC dependency on CONFIG_X86_LOCAL_APIC is dead code since commit e42eef4ba388 ("KVM: add X86_LOCAL_APIC dependency"). Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210518144339.1987982-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Reviewed-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 -- arch/x86/kvm/svm/svm.c | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 712b4e0de481..1c1bf911e02b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -29,9 +29,7 @@ /* enable / disable AVIC */ int avic; -#ifdef CONFIG_X86_LOCAL_APIC module_param(avic, int, S_IRUGO); -#endif #define SVM_AVIC_DOORBELL 0xc001011b diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dfa351e605de..8c3918a11826 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1010,9 +1010,7 @@ static __init int svm_hardware_setup(void) } if (avic) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) { avic = false; } else { pr_info("AVIC enabled\n"); -- cgit v1.2.3 From 377872b3355b9a7f04f25388e2c9399845259c05 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 18 May 2021 16:43:36 +0200 Subject: KVM: VMX: Drop unneeded CONFIG_X86_LOCAL_APIC check CONFIG_X86_LOCAL_APIC is always on when CONFIG_KVM (on x86) since commit e42eef4ba388 ("KVM: add X86_LOCAL_APIC dependency"). Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210518144339.1987982-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Reviewed-by: Sean Christopherson --- arch/x86/kvm/vmx/capabilities.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8dee8a5fbc17..aa0e7872fcc9 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -90,8 +90,7 @@ static inline bool cpu_has_vmx_preemption_timer(void) static inline bool cpu_has_vmx_posted_intr(void) { - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_load_ia32_efer(void) -- cgit v1.2.3 From 28a4aa1160d71187a44414dac40b57d1fd9fcd77 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 May 2021 18:22:28 +0200 Subject: KVM: SVM: make the avic parameter a bool Make it consistent with kvm_intel.enable_apicv. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 4 ++-- arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1c1bf911e02b..0e62e6a2438c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -28,8 +28,8 @@ #include "svm.h" /* enable / disable AVIC */ -int avic; -module_param(avic, int, S_IRUGO); +bool avic; +module_param(avic, bool, S_IRUGO); #define SVM_AVIC_DOORBELL 0xc001011b diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e44567ceb865..70419e417c0d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -479,7 +479,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -extern int avic; +extern bool avic; static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { -- cgit v1.2.3 From e69012400b0cb42b2070748322cb72f9effec00f Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 25 May 2021 10:45:51 +0800 Subject: arm64: mm: don't use CON and BLK mapping if KFENCE is enabled When we added KFENCE support for arm64, we intended that it would force the entire linear map to be mapped at page granularity, but we only enforced this in arch_add_memory() and not in map_mem(), so memory mapped at boot time can be mapped at a larger granularity. When booting a kernel with KFENCE=y and RODATA_FULL=n, this results in the following WARNING at boot: [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: CPU: 0 PID: 0 at mm/memory.c:2462 apply_to_pmd_range+0xec/0x190 [ 0.000000] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.13.0-rc1+ #10 [ 0.000000] Hardware name: linux,dummy-virt (DT) [ 0.000000] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO BTYPE=--) [ 0.000000] pc : apply_to_pmd_range+0xec/0x190 [ 0.000000] lr : __apply_to_page_range+0x94/0x170 [ 0.000000] sp : ffffffc010573e20 [ 0.000000] x29: ffffffc010573e20 x28: ffffff801f400000 x27: ffffff801f401000 [ 0.000000] x26: 0000000000000001 x25: ffffff801f400fff x24: ffffffc010573f28 [ 0.000000] x23: ffffffc01002b710 x22: ffffffc0105fa450 x21: ffffffc010573ee4 [ 0.000000] x20: ffffff801fffb7d0 x19: ffffff801f401000 x18: 00000000fffffffe [ 0.000000] x17: 000000000000003f x16: 000000000000000a x15: ffffffc01060b940 [ 0.000000] x14: 0000000000000000 x13: 0098968000000000 x12: 0000000098968000 [ 0.000000] x11: 0000000000000000 x10: 0000000098968000 x9 : 0000000000000001 [ 0.000000] x8 : 0000000000000000 x7 : ffffffc010573ee4 x6 : 0000000000000001 [ 0.000000] x5 : ffffffc010573f28 x4 : ffffffc01002b710 x3 : 0000000040000000 [ 0.000000] x2 : ffffff801f5fffff x1 : 0000000000000001 x0 : 007800005f400705 [ 0.000000] Call trace: [ 0.000000] apply_to_pmd_range+0xec/0x190 [ 0.000000] __apply_to_page_range+0x94/0x170 [ 0.000000] apply_to_page_range+0x10/0x20 [ 0.000000] __change_memory_common+0x50/0xdc [ 0.000000] set_memory_valid+0x30/0x40 [ 0.000000] kfence_init_pool+0x9c/0x16c [ 0.000000] kfence_init+0x20/0x98 [ 0.000000] start_kernel+0x284/0x3f8 Fixes: 840b23986344 ("arm64, kfence: enable KFENCE for ARM64") Cc: # 5.12.x Signed-off-by: Jisheng Zhang Acked-by: Mark Rutland Acked-by: Marco Elver Tested-by: Marco Elver Link: https://lore.kernel.org/r/20210525104551.2ec37f77@xhacker.debian Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6dd9369e3ea0..89b66ef43a0f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -515,7 +515,8 @@ static void __init map_mem(pgd_t *pgdp) */ BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end)); - if (rodata_full || crash_mem_map || debug_pagealloc_enabled()) + if (rodata_full || crash_mem_map || debug_pagealloc_enabled() || + IS_ENABLED(CONFIG_KFENCE)) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* -- cgit v1.2.3 From ff4cff962a7eedc73e54b5096693da7f86c61346 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 16 May 2021 17:01:08 -0700 Subject: MIPS: alchemy: xxs1500: add gpio-au1000.h header file board-xxs1500.c references 2 functions without declaring them, so add the header file to placate the build. ../arch/mips/alchemy/board-xxs1500.c: In function 'board_setup': ../arch/mips/alchemy/board-xxs1500.c:56:2: error: implicit declaration of function 'alchemy_gpio1_input_enable' [-Werror=implicit-function-declaration] 56 | alchemy_gpio1_input_enable(); ../arch/mips/alchemy/board-xxs1500.c:57:2: error: implicit declaration of function 'alchemy_gpio2_enable'; did you mean 'alchemy_uart_enable'? [-Werror=implicit-function-declaration] 57 | alchemy_gpio2_enable(); Fixes: 8e026910fcd4 ("MIPS: Alchemy: merge GPR/MTX-1/XXS1500 board code into single files") Signed-off-by: Randy Dunlap Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Cc: Manuel Lauss Cc: Ralf Baechle Acked-by: Manuel Lauss Signed-off-by: Thomas Bogendoerfer --- arch/mips/alchemy/board-xxs1500.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/mips/alchemy/board-xxs1500.c b/arch/mips/alchemy/board-xxs1500.c index b184baa4e56a..f175bce2987f 100644 --- a/arch/mips/alchemy/board-xxs1500.c +++ b/arch/mips/alchemy/board-xxs1500.c @@ -18,6 +18,7 @@ #include #include #include +#include #include const char *get_system_type(void) -- cgit v1.2.3 From 6855adc2c5d9dff08be9e6e01deb319738b28780 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 20 May 2021 22:13:43 -0700 Subject: MIPS: launch.h: add include guard to prevent build errors arch/mips/include/asm/mips-boards/launch.h needs an include guard to prevent it from being #included more than once. Prevents these build errors: In file included from ../arch/mips/mti-malta/malta-amon.c:16: ../arch/mips/include/asm/mips-boards/launch.h:8:8: error: redefinition of 'struct cpulaunch' 8 | struct cpulaunch { | ^~~~~~~~~ In file included from ../arch/mips/include/asm/mips-cps.h:13, from ../arch/mips/include/asm/smp-ops.h:16, from ../arch/mips/include/asm/smp.h:21, from ../include/linux/smp.h:114, from ../arch/mips/mti-malta/malta-amon.c:12: ../arch/mips/include/asm/mips-boards/launch.h:8:8: note: originally defined here 8 | struct cpulaunch { | ^~~~~~~~~ make[3]: [../scripts/Makefile.build:273: arch/mips/mti-malta/malta-amon.o] Error 1 (ignored) Fixes: 6decd1aad15f ("MIPS: add support for buggy MT7621S core detection") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Cc: Ilya Lipnitskiy Reviewed-by: Ilya Lipnitskiy Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/mips-boards/launch.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/mips/include/asm/mips-boards/launch.h b/arch/mips/include/asm/mips-boards/launch.h index f93aa5ee2e2e..3481ed4c117b 100644 --- a/arch/mips/include/asm/mips-boards/launch.h +++ b/arch/mips/include/asm/mips-boards/launch.h @@ -3,6 +3,9 @@ * */ +#ifndef _ASM_MIPS_BOARDS_LAUNCH_H +#define _ASM_MIPS_BOARDS_LAUNCH_H + #ifndef _ASSEMBLER_ struct cpulaunch { @@ -34,3 +37,5 @@ struct cpulaunch { /* Polling period in count cycles for secondary CPU's */ #define LAUNCHPERIOD 10000 + +#endif /* _ASM_MIPS_BOARDS_LAUNCH_H */ -- cgit v1.2.3 From fef532ea0cd871afab7d9a7b6e9da99ac2c24371 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 16 May 2021 17:54:17 -0700 Subject: MIPS: ralink: export rt_sysc_membase for rt2880_wdt.c rt2880_wdt.c uses (well, attempts to use) rt_sysc_membase. However, when this watchdog driver is built as a loadable module, there is a build error since the rt_sysc_membase symbol is not exported. Export it to quell the build error. ERROR: modpost: "rt_sysc_membase" [drivers/watchdog/rt2880_wdt.ko] undefined! Fixes: 473cf939ff34 ("watchdog: add ralink watchdog driver") Signed-off-by: Randy Dunlap Cc: Guenter Roeck Cc: Wim Van Sebroeck Cc: John Crispin Cc: linux-mips@vger.kernel.org Cc: linux-watchdog@vger.kernel.org Acked-by: Guenter Roeck Signed-off-by: Thomas Bogendoerfer --- arch/mips/ralink/of.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/ralink/of.c b/arch/mips/ralink/of.c index 0c5de07da097..0135376c5de5 100644 --- a/arch/mips/ralink/of.c +++ b/arch/mips/ralink/of.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,7 @@ __iomem void *rt_sysc_membase; __iomem void *rt_memc_membase; +EXPORT_SYMBOL_GPL(rt_sysc_membase); __iomem void *plat_of_remap_node(const char *node) { -- cgit v1.2.3 From 78cf0eb926cb1abeff2106bae67752e032fe5f3e Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Sat, 15 May 2021 19:02:01 +0800 Subject: MIPS: Fix kernel hang under FUNCTION_GRAPH_TRACER and PREEMPT_TRACER When update the latest mainline kernel with the following three configs, the kernel hangs during startup: (1) CONFIG_FUNCTION_GRAPH_TRACER=y (2) CONFIG_PREEMPT_TRACER=y (3) CONFIG_FTRACE_STARTUP_TEST=y When update the latest mainline kernel with the above two configs (1) and (2), the kernel starts normally, but it still hangs when execute the following command: echo "function_graph" > /sys/kernel/debug/tracing/current_tracer Without CONFIG_PREEMPT_TRACER=y, the above two kinds of kernel hangs disappeared, so it seems that CONFIG_PREEMPT_TRACER has some influences with function_graph tracer at the first glance. I use ejtag to find out the epc address is related with preempt_enable() in the file arch/mips/lib/mips-atomic.c, because function tracing can trace the preempt_{enable,disable} calls that are traced, replace them with preempt_{enable,disable}_notrace to prevent function tracing from going into an infinite loop, and then it can fix the kernel hang issue. By the way, it seems that this commit is a complement and improvement of commit f93a1a00f2bd ("MIPS: Fix crash that occurs when function tracing is enabled"). Signed-off-by: Tiezhu Yang Cc: Steven Rostedt Signed-off-by: Thomas Bogendoerfer --- arch/mips/lib/mips-atomic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/mips/lib/mips-atomic.c b/arch/mips/lib/mips-atomic.c index de03838b343b..a9b72eacfc0b 100644 --- a/arch/mips/lib/mips-atomic.c +++ b/arch/mips/lib/mips-atomic.c @@ -37,7 +37,7 @@ */ notrace void arch_local_irq_disable(void) { - preempt_disable(); + preempt_disable_notrace(); __asm__ __volatile__( " .set push \n" @@ -53,7 +53,7 @@ notrace void arch_local_irq_disable(void) : /* no inputs */ : "memory"); - preempt_enable(); + preempt_enable_notrace(); } EXPORT_SYMBOL(arch_local_irq_disable); @@ -61,7 +61,7 @@ notrace unsigned long arch_local_irq_save(void) { unsigned long flags; - preempt_disable(); + preempt_disable_notrace(); __asm__ __volatile__( " .set push \n" @@ -78,7 +78,7 @@ notrace unsigned long arch_local_irq_save(void) : /* no inputs */ : "memory"); - preempt_enable(); + preempt_enable_notrace(); return flags; } @@ -88,7 +88,7 @@ notrace void arch_local_irq_restore(unsigned long flags) { unsigned long __tmp1; - preempt_disable(); + preempt_disable_notrace(); __asm__ __volatile__( " .set push \n" @@ -106,7 +106,7 @@ notrace void arch_local_irq_restore(unsigned long flags) : "0" (flags) : "memory"); - preempt_enable(); + preempt_enable_notrace(); } EXPORT_SYMBOL(arch_local_irq_restore); -- cgit v1.2.3 From bae989c4bc53f861cc1b706aab0194703e9907a8 Mon Sep 17 00:00:00 2001 From: Maciej Falkowski Date: Thu, 1 Apr 2021 18:04:34 +0200 Subject: ARM: OMAP1: ams-delta: remove unused function ams_delta_camera_power The ams_delta_camera_power() function is unused as reports Clang compilation with omap1_defconfig on linux-next: arch/arm/mach-omap1/board-ams-delta.c:462:12: warning: unused function 'ams_delta_camera_power' [-Wunused-function] static int ams_delta_camera_power(struct device *dev, int power) ^ 1 warning generated. The soc_camera support was dropped without removing ams_delta_camera_power() function, making it unused. Fixes: ce548396a433 ("media: mach-omap1: board-ams-delta.c: remove soc_camera dependencies") Signed-off-by: Maciej Falkowski Reviewed-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Tony Lindgren Link: https://github.com/ClangBuiltLinux/linux/issues/1326 --- arch/arm/mach-omap1/board-ams-delta.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c index 2ee527c00284..1026a816dcc0 100644 --- a/arch/arm/mach-omap1/board-ams-delta.c +++ b/arch/arm/mach-omap1/board-ams-delta.c @@ -458,20 +458,6 @@ static struct gpiod_lookup_table leds_gpio_table = { #ifdef CONFIG_LEDS_TRIGGERS DEFINE_LED_TRIGGER(ams_delta_camera_led_trigger); - -static int ams_delta_camera_power(struct device *dev, int power) -{ - /* - * turn on camera LED - */ - if (power) - led_trigger_event(ams_delta_camera_led_trigger, LED_FULL); - else - led_trigger_event(ams_delta_camera_led_trigger, LED_OFF); - return 0; -} -#else -#define ams_delta_camera_power NULL #endif static struct platform_device ams_delta_audio_device = { -- cgit v1.2.3 From e3e880bb1518eb10a4b4bb4344ed614d6856f190 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 26 May 2021 22:18:31 +0800 Subject: KVM: arm64: Resolve all pending PC updates before immediate exit Commit 26778aaa134a ("KVM: arm64: Commit pending PC adjustemnts before returning to userspace") fixed the PC updating issue by forcing an explicit synchronisation of the exception state on vcpu exit to userspace. However, we forgot to take into account the case where immediate_exit is set by userspace and KVM_RUN will exit immediately. Fix it by resolving all pending PC updates before returning to userspace. Since __kvm_adjust_pc() relies on a loaded vcpu context, I moved the immediate_exit checking right after vcpu_load(). We will get some overhead if immediate_exit is true (which should hopefully be rare). Fixes: 26778aaa134a ("KVM: arm64: Commit pending PC adjustemnts before returning to userspace") Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210526141831.1662-1-yuzenghui@huawei.com Cc: stable@vger.kernel.org # 5.11 --- arch/arm64/kvm/arm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1126eae27400..e720148232a0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -720,11 +720,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) return ret; } - if (run->immediate_exit) - return -EINTR; - vcpu_load(vcpu); + if (run->immediate_exit) { + ret = -EINTR; + goto out; + } + kvm_sigset_activate(vcpu); ret = 1; @@ -897,6 +899,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_sigset_deactivate(vcpu); +out: /* * In the unlikely event that we are returning to userspace * with pending exceptions or PC adjustment, commit these -- cgit v1.2.3 From 66e94d5cafd4decd4f92d16a022ea587d7f4094f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 24 May 2021 18:07:52 +0100 Subject: KVM: arm64: Prevent mixed-width VM creation It looks like we have tolerated creating mixed-width VMs since... forever. However, that was never the intention, and we'd rather not have to support that pointless complexity. Forbid such a setup by making sure all the vcpus have the same register width. Reported-by: Steven Price Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210524170752.1549797-1-maz@kernel.org --- arch/arm64/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/kvm/reset.c | 28 ++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f612c090f2e4..01b9857757f2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -463,4 +463,9 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; } +static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) +{ + return test_bit(feature, vcpu->arch.features); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 956cdc240148..d37ebee085cf 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -166,6 +166,25 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) return 0; } +static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *tmp; + bool is32bit; + int i; + + is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) + return false; + + /* Check that the vcpus are either all 32bit or all 64bit */ + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit) + return false; + } + + return true; +} + /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer @@ -217,13 +236,14 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } } + if (!vcpu_allowed_register_width(vcpu)) { + ret = -EINVAL; + goto out; + } + switch (vcpu->arch.target) { default: if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) { - ret = -EINVAL; - goto out; - } pstate = VCPU_RESET_PSTATE_SVC; } else { pstate = VCPU_RESET_PSTATE_EL1; -- cgit v1.2.3 From 6bd5b743686243dae7351d5dcceeb7f171201bb4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:31 -0700 Subject: KVM: PPC: exit halt polling on need_resched() This is inspired by commit 262de4102c7bb8 (kvm: exit halt polling on need_resched() as well). Due to PPC implements an arch specific halt polling logic, we have to the need_resched() check there as well. This patch adds a helper function that can be shared between book3s and generic halt-polling loops. Reviewed-by: David Matlack Reviewed-by: Venkatesh Srinivas Cc: Ben Segall Cc: Venkatesh Srinivas Cc: Jim Mattson Cc: David Matlack Cc: Paul Mackerras Cc: Suraj Jitindar Singh Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-1-git-send-email-wanpengli@tencent.com> [Make the function inline. - Paolo] Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 2 +- include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 3 +-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 28a80d240b76..7360350e66ff 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3936,7 +3936,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) break; } cur = ktime_get(); - } while (single_task_running() && ktime_before(cur, stop)); + } while (kvm_vcpu_can_poll(cur, stop)); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2f34487e21f2..5d4b96b36ec0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,11 @@ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map) return !!map->hva; } +static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) +{ + return single_task_running() && !need_resched() && ktime_before(cur, stop); +} + /* * Sometimes a large or cross-page mmio needs to be broken up into separate * exits for userspace servicing. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6b4feb92dc79..5f40725144f5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2973,8 +2973,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) goto out; } poll_end = cur = ktime_get(); - } while (single_task_running() && !need_resched() && - ktime_before(cur, stop)); + } while (kvm_vcpu_can_poll(cur, stop)); } prepare_to_rcuwait(&vcpu->wait); -- cgit v1.2.3 From 72b268a8e9307a1757f61af080e990b5baa11d2a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:32 -0700 Subject: KVM: X86: Bail out of direct yield in case of under-committed scenarios In case of under-committed scenarios, vCPUs can be scheduled easily; kvm_vcpu_yield_to adds extra overhead, and it is also common to see when vcpu->ready is true but yield later failing due to p->state is TASK_RUNNING. Let's bail out in such scenarios by checking the length of current cpu runqueue, which can be treated as a hint of under-committed instead of guarantee of accuracy. 30%+ of directed-yield attempts can now avoid the expensive lookups in kvm_sched_yield() in an under-committed scenario. Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-2-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b6bca616929..dfb7c320581f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8360,6 +8360,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) vcpu->stat.directed_yield_attempted++; + if (single_task_running()) + goto no_yield; + rcu_read_lock(); map = rcu_dereference(vcpu->kvm->arch.apic_map); -- cgit v1.2.3 From 1eff0ada88b48e4ac1e3fe26483b3684fedecd27 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:33 -0700 Subject: KVM: X86: Fix vCPU preempted state from guest's point of view Commit 66570e966dd9 (kvm: x86: only provide PV features if enabled in guest's CPUID) avoids to access pv tlb shootdown host side logic when this pv feature is not exposed to guest, however, kvm_steal_time.preempted not only leveraged by pv tlb shootdown logic but also mitigate the lock holder preemption issue. From guest's point of view, vCPU is always preempted since we lose the reset of kvm_steal_time.preempted before vmentry if pv tlb shootdown feature is not exposed. This patch fixes it by clearing kvm_steal_time.preempted before vmentry. Fixes: 66570e966dd9 (kvm: x86: only provide PV features if enabled in guest's CPUID) Reviewed-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-3-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfb7c320581f..bed7b5348c0e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3105,6 +3105,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + } else { + st->preempted = 0; } vcpu->arch.st.preempted = 0; -- cgit v1.2.3 From da6d63a0062a3ee721b84123b83ec093f25759b0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:34 -0700 Subject: KVM: X86: hyper-v: Task srcu lock when accessing kvm_memslots() WARNING: suspicious RCU usage 5.13.0-rc1 #4 Not tainted ----------------------------- ./include/linux/kvm_host.h:710 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by hyperv_clock/8318: #0: ffffb6b8cb05a7d8 (&hv->hv_lock){+.+.}-{3:3}, at: kvm_hv_invalidate_tsc_page+0x3e/0xa0 [kvm] stack backtrace: CPU: 3 PID: 8318 Comm: hyperv_clock Not tainted 5.13.0-rc1 #4 Call Trace: dump_stack+0x87/0xb7 lockdep_rcu_suspicious+0xce/0xf0 kvm_write_guest_page+0x1c1/0x1d0 [kvm] kvm_write_guest+0x50/0x90 [kvm] kvm_hv_invalidate_tsc_page+0x79/0xa0 [kvm] kvm_gen_update_masterclock+0x1d/0x110 [kvm] kvm_arch_vm_ioctl+0x2a7/0xc50 [kvm] kvm_vm_ioctl+0x123/0x11d0 [kvm] __x64_sys_ioctl+0x3ed/0x9d0 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae kvm_memslots() will be called by kvm_write_guest(), so we should take the srcu lock. Fixes: e880c6ea5 (KVM: x86: hyper-v: Prevent using not-yet-updated TSC page by secondary CPUs) Reviewed-by: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-4-git-send-email-wanpengli@tencent.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f98370a39936..f00830e5202f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1172,6 +1172,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); u64 gfn; + int idx; if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || @@ -1190,9 +1191,16 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; hv->tsc_ref.tsc_sequence = 0; + + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&kvm->srcu); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + srcu_read_unlock(&kvm->srcu, idx); out_unlock: mutex_unlock(&hv->hv_lock); -- cgit v1.2.3 From 9805cf03fdb6828091fe09e4ef0fb544fca3eaf6 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:35 -0700 Subject: KVM: LAPIC: Narrow the timer latency between wait_lapic_expire and world switch Let's treat lapic_timer_advance_ns automatic tuning logic as hypervisor overhead, move it before wait_lapic_expire instead of between wait_lapic_expire and the world switch, the wait duration should be calculated by the up-to-date guest_tsc after the overhead of automatic tuning logic. This patch reduces ~30+ cycles for kvm-unit-tests/tscdeadline-latency when testing busy waits. Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-5-git-send-email-wanpengli@tencent.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c0ebef560bd1..5d91f2367c31 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1598,11 +1598,19 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; + if (lapic_timer_advance_dynamic) { + adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); + /* + * If the timer fired early, reread the TSC to account for the + * overhead of the above adjustment to avoid waiting longer + * than is necessary. + */ + if (guest_tsc < tsc_deadline) + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + } + if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - - if (lapic_timer_advance_dynamic) - adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 57ab87947abfc4e0b0b9864dc4717326a1c28a39 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 May 2021 10:41:16 -0300 Subject: KVM: x86: add start_assignment hook to kvm_x86_ops Add a start_assignment hook to kvm_x86_ops, which is called when kvm_arch_start_assignment is done. The hook is required to update the wakeup vector of a sleeping vCPU when a device is assigned to the guest. Signed-off-by: Marcelo Tosatti Message-Id: <20210525134321.254128742@redhat.com> Reviewed-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 323641097f63..e7bef91cee04 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -99,6 +99,7 @@ KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) +KVM_X86_OP_NULL(start_assignment) KVM_X86_OP_NULL(apicv_post_state_restore) KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) KVM_X86_OP_NULL(set_hv_timer) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55efbacfc244..9c7ced0e3171 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1352,6 +1352,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + void (*start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bed7b5348c0e..98538b1cb453 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11504,7 +11504,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { - atomic_inc(&kvm->arch.assigned_device_count); + if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) + static_call_cond(kvm_x86_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); -- cgit v1.2.3 From 084071d5e9226add45a6031928bf10e6afc855fd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 May 2021 10:41:17 -0300 Subject: KVM: rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK KVM_REQ_UNBLOCK will be used to exit a vcpu from its inner vcpu halt emulation loop. Rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK, switch PowerPC to arch specific request bit. Signed-off-by: Marcelo Tosatti Message-Id: <20210525134321.303768132@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/vcpu-requests.rst | 8 +++++--- arch/powerpc/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 ++ 6 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst index 5feb3706a7ae..af1b37441e0a 100644 --- a/Documentation/virt/kvm/vcpu-requests.rst +++ b/Documentation/virt/kvm/vcpu-requests.rst @@ -118,10 +118,12 @@ KVM_REQ_MMU_RELOAD necessary to inform each VCPU to completely refresh the tables. This request is used for that. -KVM_REQ_PENDING_TIMER +KVM_REQ_UNBLOCK - This request may be made from a timer handler run on the host on behalf - of a VCPU. It informs the VCPU thread to inject a timer interrupt. + This request informs the vCPU to exit kvm_vcpu_block. It is used for + example from timer handlers that run on the host on behalf of a vCPU, + or in order to update the interrupt routing and ensure that assigned + devices will wake up the vCPU. KVM_REQ_UNHALT diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1e83359f286b..7f2e90db2050 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -51,6 +51,7 @@ /* PPC-specific vcpu->requests bit members */ #define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0) #define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1) +#define KVM_REQ_PENDING_TIMER KVM_ARCH_REQ(2) #include diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5d91f2367c31..8120e8614b92 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1669,7 +1669,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(&apic->lapic_timer.pending); - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); if (from_timer_fn) kvm_vcpu_kick(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 98538b1cb453..fe464b66898f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9501,7 +9501,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5d4b96b36ec0..76102efbf079 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -147,7 +147,7 @@ static inline bool is_error_page(struct page *page) */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 #define KVM_REQUEST_ARCH_BASE 8 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5f40725144f5..37a2d500a148 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2929,6 +2929,8 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) goto out; if (signal_pending(current)) goto out; + if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) + goto out; ret = 0; out: -- cgit v1.2.3 From a2486020a82eefad686993695eb42d1b64f3f2fd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 26 May 2021 14:20:14 -0300 Subject: KVM: VMX: update vcpu posted-interrupt descriptor when assigning device For VMX, when a vcpu enters HLT emulation, pi_post_block will: 1) Add vcpu to per-cpu list of blocked vcpus. 2) Program the posted-interrupt descriptor "notification vector" to POSTED_INTR_WAKEUP_VECTOR With interrupt remapping, an interrupt will set the PIR bit for the vector programmed for the device on the CPU, test-and-set the ON bit on the posted interrupt descriptor, and if the ON bit is clear generate an interrupt for the notification vector. This way, the target CPU wakes upon a device interrupt and wakes up the target vcpu. Problem is that pi_post_block only programs the notification vector if kvm_arch_has_assigned_device() is true. Its possible for the following to happen: 1) vcpu V HLTs on pcpu P, kvm_arch_has_assigned_device is false, notification vector is not programmed 2) device is assigned to VM 3) device interrupts vcpu V, sets ON bit (notification vector not programmed, so pcpu P remains in idle) 4) vcpu 0 IPIs vcpu V (in guest), but since pi descriptor ON bit is set, kvm_vcpu_kick is skipped 5) vcpu 0 busy spins on vcpu V's response for several seconds, until RCU watchdog NMIs all vCPUs. To fix this, use the start_assignment kvm_x86_ops callback to kick vcpus out of the halt loop, so the notification vector is properly reprogrammed to the wakeup vector. Reported-by: Pei Zhang Signed-off-by: Marcelo Tosatti Message-Id: <20210526172014.GA29007@fuller.cnet> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 14 ++++++++++++++ arch/x86/kvm/vmx/posted_intr.h | 1 + arch/x86/kvm/vmx/vmx.c | 1 + virt/kvm/kvm_main.c | 1 + 4 files changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 459748680daf..5f81ef092bd4 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -237,6 +237,20 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) } +/* + * Bail out of the block loop if the VM has an assigned + * device, but the blocking vCPU didn't reconfigure the + * PI.NV to the wakeup vector, i.e. the assigned device + * came along after the initial check in pi_pre_block(). + */ +void vmx_pi_start_assignment(struct kvm *kvm) +{ + if (!irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); +} + /* * pi_update_irte - set IRTE for Posted-Interrupts * diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 0bdc41391c5b..7f7b2326caf5 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -95,5 +95,6 @@ void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); +void vmx_pi_start_assignment(struct kvm *kvm); #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4bceb5ca3a89..639ec3eba9b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7721,6 +7721,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .nested_ops = &vmx_nested_ops, .update_pi_irte = pi_update_irte, + .start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 37a2d500a148..6a6bc7af0e28 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -307,6 +307,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { return kvm_make_all_cpus_request_except(kvm, req, NULL); } +EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) -- cgit v1.2.3 From bedd9195df3dfea7165e7d6f7519a1568bc41936 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 May 2021 16:32:27 +0000 Subject: KVM: x86/mmu: Fix comment mentioning skip_4k This comment was left over from a previous version of the patch that introduced wrprot_gfn_range, when skip_4k was passed in instead of min_level. Signed-off-by: David Matlack Message-Id: <20210526163227.3113557-1-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 95eeb5ac6a8a..237317b1eddd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1192,9 +1192,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) } /* - * Remove write access from all the SPTEs mapping GFNs [start, end). If - * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. + * Remove write access from all SPTEs at or above min_level that map GFNs + * [start, end). Returns true if an SPTE has been changed and the TLBs need to + * be flushed. */ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int min_level) -- cgit v1.2.3 From 7adc7b225cddcfd0f346d10144fd7a3d3d9f9ea7 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Wed, 12 May 2021 09:20:50 +1200 Subject: powerpc/fsl: set fsl,i2c-erratum-a004447 flag for P2041 i2c controllers The i2c controllers on the P2040/P2041 have an erratum where the documented scheme for i2c bus recovery will not work (A-004447). A different mechanism is needed which is documented in the P2040 Chip Errata Rev Q (latest available at the time of writing). Signed-off-by: Chris Packham Acked-by: Michael Ellerman Signed-off-by: Wolfram Sang --- arch/powerpc/boot/dts/fsl/p2041si-post.dtsi | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi b/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi index 872e4485dc3f..ddc018d42252 100644 --- a/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi @@ -371,7 +371,23 @@ }; /include/ "qoriq-i2c-0.dtsi" + i2c@118000 { + fsl,i2c-erratum-a004447; + }; + + i2c@118100 { + fsl,i2c-erratum-a004447; + }; + /include/ "qoriq-i2c-1.dtsi" + i2c@119000 { + fsl,i2c-erratum-a004447; + }; + + i2c@119100 { + fsl,i2c-erratum-a004447; + }; + /include/ "qoriq-duart-0.dtsi" /include/ "qoriq-duart-1.dtsi" /include/ "qoriq-gpio-0.dtsi" -- cgit v1.2.3 From 19ae697a1e4edf1d755b413e3aa38da65e2db23b Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Wed, 12 May 2021 09:20:51 +1200 Subject: powerpc/fsl: set fsl,i2c-erratum-a004447 flag for P1010 i2c controllers The i2c controllers on the P1010 have an erratum where the documented scheme for i2c bus recovery will not work (A-004447). A different mechanism is needed which is documented in the P1010 Chip Errata Rev L. Signed-off-by: Chris Packham Acked-by: Michael Ellerman Signed-off-by: Wolfram Sang --- arch/powerpc/boot/dts/fsl/p1010si-post.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi index c2717f31925a..ccda0a91abf0 100644 --- a/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi @@ -122,7 +122,15 @@ }; /include/ "pq3-i2c-0.dtsi" + i2c@3000 { + fsl,i2c-erratum-a004447; + }; + /include/ "pq3-i2c-1.dtsi" + i2c@3100 { + fsl,i2c-erratum-a004447; + }; + /include/ "pq3-duart-0.dtsi" /include/ "pq3-espi-0.dtsi" spi0: spi@7000 { -- cgit v1.2.3 From 82123a3d1d5a306fdf50c968a474cc60fe43a80f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 19 May 2021 16:17:17 +0530 Subject: powerpc/kprobes: Fix validation of prefixed instructions across page boundary When checking if the probed instruction is the suffix of a prefixed instruction, we access the instruction at the previous word. If the probed instruction is the very first word of a module, we can end up trying to access an invalid page. Fix this by skipping the check for all instructions at the beginning of a page. Prefixed instructions cannot cross a 64-byte boundary and as such, we don't expect to encounter a suffix as the very first word in a page for kernel text. Even if there are prefixed instructions crossing a page boundary (from a module, for instance), the instruction will be illegal, so preventing probing on the suffix of such prefix instructions isn't worthwhile. Fixes: b4657f7650ba ("powerpc/kprobes: Don't allow breakpoints on suffixes") Cc: stable@vger.kernel.org # v5.8+ Reported-by: Christophe Leroy Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0df9a032a05576a2fa8e97d1b769af2ff0eafbd6.1621416666.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 01ab2163659e..e8c2a6373157 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -108,7 +108,6 @@ int arch_prepare_kprobe(struct kprobe *p) int ret = 0; struct kprobe *prev; struct ppc_inst insn = ppc_inst_read((struct ppc_inst *)p->addr); - struct ppc_inst prefix = ppc_inst_read((struct ppc_inst *)(p->addr - 1)); if ((unsigned long)p->addr & 0x03) { printk("Attempt to register kprobe at an unaligned address\n"); @@ -116,7 +115,8 @@ int arch_prepare_kprobe(struct kprobe *p) } else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) { printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n"); ret = -EINVAL; - } else if (ppc_inst_prefixed(prefix)) { + } else if ((unsigned long)p->addr & ~PAGE_MASK && + ppc_inst_prefixed(ppc_inst_read((struct ppc_inst *)(p->addr - 1)))) { printk("Cannot register a kprobe on the second word of prefixed instruction\n"); ret = -EINVAL; } -- cgit v1.2.3 From 5362a4b6ee6136018558ef6b2c4701aa15ebc602 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 May 2021 22:00:05 +1000 Subject: powerpc: Fix reverse map real-mode address lookup with huge vmalloc real_vmalloc_addr() does not currently work for huge vmalloc, which is what the reverse map can be allocated with for radix host, hash guest. Extract the hugepage aware equivalent from eeh code into a helper, and convert existing sites including this one to use it. Fixes: 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210526120005.3432222-1-npiggin@gmail.com --- arch/powerpc/include/asm/pte-walk.h | 29 +++++++++++++++++++++++++++++ arch/powerpc/kernel/eeh.c | 23 +---------------------- arch/powerpc/kernel/io-workarounds.c | 16 +++------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 15 ++------------- 4 files changed, 35 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/pte-walk.h b/arch/powerpc/include/asm/pte-walk.h index 33fa5dd8ee6a..714a35f0d425 100644 --- a/arch/powerpc/include/asm/pte-walk.h +++ b/arch/powerpc/include/asm/pte-walk.h @@ -31,6 +31,35 @@ static inline pte_t *find_init_mm_pte(unsigned long ea, unsigned *hshift) pgd_t *pgdir = init_mm.pgd; return __find_linux_pte(pgdir, ea, NULL, hshift); } + +/* + * Convert a kernel vmap virtual address (vmalloc or ioremap space) to a + * physical address, without taking locks. This can be used in real-mode. + */ +static inline phys_addr_t ppc_find_vmap_phys(unsigned long addr) +{ + pte_t *ptep; + phys_addr_t pa; + int hugepage_shift; + + /* + * init_mm does not free page tables, and does not do THP. It may + * have huge pages from huge vmalloc / ioremap etc. + */ + ptep = find_init_mm_pte(addr, &hugepage_shift); + if (WARN_ON(!ptep)) + return 0; + + pa = PFN_PHYS(pte_pfn(*ptep)); + + if (!hugepage_shift) + hugepage_shift = PAGE_SHIFT; + + pa |= addr & ((1ul << hugepage_shift) - 1); + + return pa; +} + /* * This is what we should always use. Any other lockless page table lookup needs * careful audit against THP split. diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index f24cd53ff26e..3bbdcc86d01b 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -346,28 +346,7 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity) */ static inline unsigned long eeh_token_to_phys(unsigned long token) { - pte_t *ptep; - unsigned long pa; - int hugepage_shift; - - /* - * We won't find hugepages here(this is iomem). Hence we are not - * worried about _PAGE_SPLITTING/collapse. Also we will not hit - * page table free, because of init_mm. - */ - ptep = find_init_mm_pte(token, &hugepage_shift); - if (!ptep) - return token; - - pa = pte_pfn(*ptep); - - /* On radix we can do hugepage mappings for io, so handle that */ - if (!hugepage_shift) - hugepage_shift = PAGE_SHIFT; - - pa <<= PAGE_SHIFT; - pa |= token & ((1ul << hugepage_shift) - 1); - return pa; + return ppc_find_vmap_phys(token); } /* diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 51bbaae94ccc..c877f074d174 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -55,7 +55,6 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr) #ifdef CONFIG_PPC_INDIRECT_MMIO struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) { - unsigned hugepage_shift; struct iowa_bus *bus; int token; @@ -65,22 +64,13 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) bus = &iowa_busses[token - 1]; else { unsigned long vaddr, paddr; - pte_t *ptep; vaddr = (unsigned long)PCI_FIX_ADDR(addr); if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - /* - * We won't find huge pages here (iomem). Also can't hit - * a page table free due to init_mm - */ - ptep = find_init_mm_pte(vaddr, &hugepage_shift); - if (ptep == NULL) - paddr = 0; - else { - WARN_ON(hugepage_shift); - paddr = pte_pfn(*ptep) << PAGE_SHIFT; - } + + paddr = ppc_find_vmap_phys(vaddr); + bus = iowa_pci_find(vaddr, paddr); if (bus == NULL) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 7af7c70f1468..7a0f12404e0e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -23,20 +23,9 @@ #include /* Translate address of a vmalloc'd thing to a linear map address */ -static void *real_vmalloc_addr(void *x) +static void *real_vmalloc_addr(void *addr) { - unsigned long addr = (unsigned long) x; - pte_t *p; - /* - * assume we don't have huge pages in vmalloc space... - * So don't worry about THP collapse/split. Called - * Only in realmode with MSR_EE = 0, hence won't need irq_save/restore. - */ - p = find_init_mm_pte(addr, NULL); - if (!p || !pte_present(*p)) - return NULL; - addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); - return __va(addr); + return __va(ppc_find_vmap_phys((unsigned long)addr)); } /* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ -- cgit v1.2.3 From 1438709e6328925ef496dafd467dbd0353137434 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 May 2021 22:58:51 +1000 Subject: KVM: PPC: Book3S HV: Save host FSCR in the P7/8 path Similar to commit 25edcc50d76c ("KVM: PPC: Book3S HV: Save and restore FSCR in the P9 path"), ensure the P7/8 path saves and restores the host FSCR. The logic explained in that patch actually applies there to the old path well: a context switch can be made before kvmppc_vcpu_run_hv restores the host FSCR and returns. Now both the p9 and the p7/8 paths now save and restore their FSCR, it no longer needs to be restored at the end of kvmppc_vcpu_run_hv Fixes: b005255e12a3 ("KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs") Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210526125851.3436735-1-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 1 - arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 28a80d240b76..13728495ac66 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4455,7 +4455,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) mtspr(SPRN_EBBRR, ebb_regs[1]); mtspr(SPRN_BESCR, ebb_regs[2]); mtspr(SPRN_TAR, user_tar); - mtspr(SPRN_FSCR, current->thread.fscr); } mtspr(SPRN_VRSAVE, user_vrsave); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5e634db4809b..004f0d4e665f 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -59,6 +59,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_UAMOR (SFS-88) #define STACK_SLOT_DAWR1 (SFS-96) #define STACK_SLOT_DAWRX1 (SFS-104) +#define STACK_SLOT_FSCR (SFS-112) /* the following is used by the P9 short path */ #define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ @@ -686,6 +687,8 @@ BEGIN_FTR_SECTION std r6, STACK_SLOT_DAWR0(r1) std r7, STACK_SLOT_DAWRX0(r1) std r8, STACK_SLOT_IAMR(r1) + mfspr r5, SPRN_FSCR + std r5, STACK_SLOT_FSCR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION mfspr r6, SPRN_DAWR1 @@ -1663,6 +1666,10 @@ FTR_SECTION_ELSE ld r7, STACK_SLOT_HFSCR(r1) mtspr SPRN_HFSCR, r7 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + ld r5, STACK_SLOT_FSCR(r1) + mtspr SPRN_FSCR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* * Restore various registers to 0, where non-zero values * set by the guest could disrupt the host. -- cgit v1.2.3 From e87e46d5f3182f82d997641d95db01a7feacef92 Mon Sep 17 00:00:00 2001 From: Yuan Yao Date: Wed, 26 May 2021 14:38:28 +0800 Subject: KVM: X86: Use kvm_get_linear_rip() in single-step and #DB/#BP interception The kvm_get_linear_rip() handles x86/long mode cases well and has better readability, __kvm_set_rflags() also use the paired function kvm_is_linear_rip() to check the vcpu->arch.singlestep_rip set in kvm_arch_vcpu_ioctl_set_guest_debug(), so change the "CS.BASE + RIP" code in kvm_arch_vcpu_ioctl_set_guest_debug() and handle_exception_nmi() to this one. Signed-off-by: Yuan Yao Message-Id: <20210526063828.1173-1-yuan.yao@linux.intel.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 ++--- arch/x86/kvm/x86.c | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 639ec3eba9b8..50b42d7a8a11 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4843,7 +4843,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; + unsigned long cr2, dr6; u32 vect_info; vect_info = vmx->idt_vectoring_info; @@ -4933,8 +4933,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; - rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe464b66898f..2d725567961f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10120,8 +10120,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu); /* * Trigger an rflags update that will inject or remove the trace -- cgit v1.2.3 From da6393cdd8aaa354b3a2437cd73ebb34cac958e3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 27 May 2021 17:01:36 -0700 Subject: KVM: X86: Fix warning caused by stale emulation context Reported by syzkaller: WARNING: CPU: 7 PID: 10526 at linux/arch/x86/kvm//x86.c:7621 x86_emulate_instruction+0x41b/0x510 [kvm] RIP: 0010:x86_emulate_instruction+0x41b/0x510 [kvm] Call Trace: kvm_mmu_page_fault+0x126/0x8f0 [kvm] vmx_handle_exit+0x11e/0x680 [kvm_intel] vcpu_enter_guest+0xd95/0x1b40 [kvm] kvm_arch_vcpu_ioctl_run+0x377/0x6a0 [kvm] kvm_vcpu_ioctl+0x389/0x630 [kvm] __x64_sys_ioctl+0x8e/0xd0 do_syscall_64+0x3c/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Commit 4a1e10d5b5d8 ("KVM: x86: handle hardware breakpoints during emulation()) adds hardware breakpoints check before emulation the instruction and parts of emulation context initialization, actually we don't have the EMULTYPE_NO_DECODE flag here and the emulation context will not be reused. Commit c8848cee74ff ("KVM: x86: set ctxt->have_exception in x86_decode_insn()) triggers the warning because it catches the stale emulation context has #UD, however, it is not during instruction decoding which should result in EMULATION_FAILED. This patch fixes it by moving the second part emulation context initialization into init_emulate_ctxt() and before hardware breakpoints check. The ctxt->ud will be dropped by a follow-up patch. syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=134683fdd00000 Reported-by: syzbot+71271244f206d17f6441@syzkaller.appspotmail.com Fixes: 4a1e10d5b5d8 (KVM: x86: handle hardware breakpoints during emulation) Signed-off-by: Wanpeng Li Reviewed-by: Sean Christopherson Message-Id: <1622160097-37633-1-git-send-email-wanpengli@tencent.com> --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d725567961f..622cba2ed699 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7228,6 +7228,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } @@ -7563,11 +7568,6 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, kvm_vcpu_check_breakpoint(vcpu, &r)) return r; - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; r = x86_decode_insn(ctxt, insn, insn_len); -- cgit v1.2.3 From b35491e66c87946f380ebf8ab10a7e1f795e5ece Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 27 May 2021 17:01:37 -0700 Subject: KVM: X86: Kill off ctxt->ud ctxt->ud is consumed only by x86_decode_insn(), we can kill it off by passing emulation_type to x86_decode_insn() and dropping ctxt->ud altogether. Tracking that info in ctxt for literally one call is silly. Suggested-by: Sean Christopherson Signed-off-by: Wanpeng Li Reviewed-by: Sean Christopherson Message-Id: <1622160097-37633-2-git-send-email-wanpengli@tencent.com> --- arch/x86/kvm/emulate.c | 5 +++-- arch/x86/kvm/kvm_emulate.h | 3 +-- arch/x86/kvm/x86.c | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a0ccdb56076..5e5de05a8fbf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5111,7 +5111,7 @@ done: return rc; } -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; @@ -5322,7 +5322,8 @@ done_prefixes: ctxt->execute = opcode.u.execute; - if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD))) + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && + likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; if (unlikely(ctxt->d & diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index f016838faedd..3e870bf9ca4d 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -314,7 +314,6 @@ struct x86_emulate_ctxt { int interruptibility; bool perm_ok; /* do not check permissions if true */ - bool ud; /* inject an #UD if host doesn't support insn */ bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; @@ -491,7 +490,7 @@ enum x86_intercept { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 622cba2ed699..1cd6d4685932 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7568,9 +7568,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, kvm_vcpu_check_breakpoint(vcpu, &r)) return r; - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); + r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; -- cgit v1.2.3 From 7d65f9e80646c595e8c853640a9d0768a33e204c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 May 2021 13:08:41 +0200 Subject: x86/apic: Mark _all_ legacy interrupts when IO/APIC is missing PIC interrupts do not support affinity setting and they can end up on any online CPU. Therefore, it's required to mark the associated vectors as system-wide reserved. Otherwise, the corresponding irq descriptors are copied to the secondary CPUs but the vectors are not marked as assigned or reserved. This works correctly for the IO/APIC case. When the IO/APIC is disabled via config, kernel command line or lack of enumeration then all legacy interrupts are routed through the PIC, but nothing marks them as system-wide reserved vectors. As a consequence, a subsequent allocation on a secondary CPU can result in allocating one of these vectors, which triggers the BUG() in apic_update_vector() because the interrupt descriptor slot is not empty. Imran tried to work around that by marking those interrupts as allocated when a CPU comes online. But that's wrong in case that the IO/APIC is available and one of the legacy interrupts, e.g. IRQ0, has been switched to PIC mode because then marking them as allocated will fail as they are already marked as system vectors. Stay consistent and update the legacy vectors after attempting IO/APIC initialization and mark them as system vectors in case that no IO/APIC is available. Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Imran Khan Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210519233928.2157496-1-imran.f.khan@oracle.com --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/apic/apic.c | 1 + arch/x86/kernel/apic/vector.c | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 412b51e059c8..48067af94678 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -174,6 +174,7 @@ static inline int apic_is_clustered_box(void) extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); +extern void lapic_update_legacy_vectors(void); extern void lapic_online(void); extern void lapic_offline(void); extern bool apic_needs_pit(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4a39fb429f15..d262811ce14b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2604,6 +2604,7 @@ static void __init apic_bsp_setup(bool upmode) end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); + lapic_update_legacy_vectors(); } #ifdef CONFIG_UP_LATE_INIT diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6dbdc7c22bb7..fb67ed5e7e6a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -738,6 +738,26 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace) irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); } +void __init lapic_update_legacy_vectors(void) +{ + unsigned int i; + + if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0) + return; + + /* + * If the IO/APIC is disabled via config, kernel command line or + * lack of enumeration then all legacy interrupts are routed + * through the PIC. Make sure that they are marked as legacy + * vectors. PIC_CASCADE_IRQ has already been marked in + * lapic_assign_system_vectors(). + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + lapic_assign_legacy_vector(i, true); + } +} + void __init lapic_assign_system_vectors(void) { unsigned int i, vector = 0; -- cgit v1.2.3 From ec3a5cb61146c91f0f7dcec8b7e7157a4879a9ee Mon Sep 17 00:00:00 2001 From: Khem Raj Date: Fri, 14 May 2021 14:37:41 -0700 Subject: riscv: Use -mno-relax when using lld linker lld does not implement the RISCV relaxation optimizations like GNU ld therefore disable it when building with lld, Also pass it to assembler when using external GNU assembler ( LLVM_IAS != 1 ), this ensures that relevant assembler option is also enabled along. if these options are not used then we see following relocations in objects 0000000000000000 R_RISCV_ALIGN *ABS*+0x0000000000000002 These are then rejected by lld ld.lld: error: capability.c:(.fixup+0x0): relocation R_RISCV_ALIGN requires unimplemented linker relaxation; recompile with -mno-relax but the .o is already compiled with -mno-relax Signed-off-by: Khem Raj Reviewed-by: Nathan Chancellor Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 3eb9590a0775..4be020695428 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -38,6 +38,15 @@ else KBUILD_LDFLAGS += -melf32lriscv endif +ifeq ($(CONFIG_LD_IS_LLD),y) + KBUILD_CFLAGS += -mno-relax + KBUILD_AFLAGS += -mno-relax +ifneq ($(LLVM_IAS),1) + KBUILD_CFLAGS += -Wa,-mno-relax + KBUILD_AFLAGS += -Wa,-mno-relax +endif +endif + # ISA string setting riscv-march-$(CONFIG_ARCH_RV32I) := rv32ima riscv-march-$(CONFIG_ARCH_RV64I) := rv64ima -- cgit v1.2.3 From 4cce442ffe5448ef572adc8b3abe7001b398e709 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 29 Apr 2021 10:38:23 +0200 Subject: arm64: meson: select COMMON_CLK This fix the recent removal of clock drivers selection. While it is not necessary to select the clock drivers themselves, we need to select a proper implementation of the clock API, which for the meson, is CCF Fixes: ba66a25536dd ("arm64: meson: ship only the necessary clock controllers") Reviewed-by: Neil Armstrong Signed-off-by: Jerome Brunet Reviewed-by: Martin Blumenstingl Signed-off-by: Kevin Hilman Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20210429083823.59546-1-jbrunet@baylibre.com --- arch/arm64/Kconfig.platforms | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 6409b47b73e4..7336c1fd0dda 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -165,6 +165,7 @@ config ARCH_MEDIATEK config ARCH_MESON bool "Amlogic Platforms" + select COMMON_CLK select MESON_IRQ_GPIO help This enables support for the arm64 based Amlogic SoCs -- cgit v1.2.3 From 4a0e3ff30980b7601b13dd3b7ee275212b852843 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 May 2021 06:58:47 -0700 Subject: perf/x86/intel/uncore: Fix a kernel WARNING triggered by maxcpus=1 A kernel WARNING may be triggered when setting maxcpus=1. The uncore counters are Die-scope. When probing a PCI device, only the BUS information can be retrieved. The uncore driver has to maintain a mapping table used to calculate the logical Die ID from a given BUS#. Before the patch ba9506be4e40, the mapping table stores the mapping information from the BUS# -> a Physical Socket ID. To calculate the logical die ID, perf does, - In snbep_pci2phy_map_init(), retrieve the BUS# -> a Physical Socket ID from the UBOX PCI configure space. - Calculate the mapping information (a BUS# -> a Physical Socket ID) for the other PCI BUS. - In the uncore_pci_probe(), get the physical Socket ID from a given BUS and the mapping table. - Calculate the logical Die ID Since only the logical Die ID is required, with the patch ba9506be4e40, the mapping table stores the mapping information from the BUS# -> a logical Die ID. Now perf does, - In snbep_pci2phy_map_init(), retrieve the BUS# -> a Physical Socket ID from the UBOX PCI configure space. - Calculate the logical Die ID - Calculate the mapping information (a BUS# -> a logical Die ID) for the other PCI BUS. - In the uncore_pci_probe(), get the logical die ID from a given BUS and the mapping table. When calculating the logical Die ID, -1 may be returned, especially when maxcpus=1. Here, -1 means the logical Die ID is not found. But when calculating the mapping information for the other PCI BUS, -1 indicates that it's the other PCI BUS that requires the calculation of the mapping. The driver will mistakenly do the calculation. Uses the -ENODEV to indicate the case which the logical Die ID is not found. The driver will not mess up the mapping table anymore. Fixes: ba9506be4e40 ("perf/x86/intel/uncore: Store the logical die id instead of the physical die id.") Reported-by: John Donnelly Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Donnelly Tested-by: John Donnelly Link: https://lkml.kernel.org/r/1622037527-156028-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 63f097289a84..1587d3289743 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1406,6 +1406,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool die_id = i; else die_id = topology_phys_to_logical_pkg(i); + if (die_id < 0) + die_id = -ENODEV; map->pbus_to_dieid[bus] = die_id; break; } @@ -1452,14 +1454,14 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool i = -1; if (reverse) { for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_dieid[bus] >= 0) + if (map->pbus_to_dieid[bus] != -1) i = map->pbus_to_dieid[bus]; else map->pbus_to_dieid[bus] = i; } } else { for (bus = 0; bus <= 255; bus++) { - if (map->pbus_to_dieid[bus] >= 0) + if (map->pbus_to_dieid[bus] != -1) i = map->pbus_to_dieid[bus]; else map->pbus_to_dieid[bus] = i; -- cgit v1.2.3 From 9a90ed065a155d13db0d0ffeaad5cc54e51c90c6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 May 2021 11:02:26 +0200 Subject: x86/thermal: Fix LVT thermal setup for SMI delivery mode There are machines out there with added value crap^WBIOS which provide an SMI handler for the local APIC thermal sensor interrupt. Out of reset, the BSP on those machines has something like 0x200 in that APIC register (timestamps left in because this whole issue is timing sensitive): [ 0.033858] read lvtthmr: 0x330, val: 0x200 which means: - bit 16 - the interrupt mask bit is clear and thus that interrupt is enabled - bits [10:8] have 010b which means SMI delivery mode. Now, later during boot, when the kernel programs the local APIC, it soft-disables it temporarily through the spurious vector register: setup_local_APIC: ... /* * If this comes from kexec/kcrash the APIC might be enabled in * SPIV. Soft disable it before doing further initialization. */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); which means (from the SDM): "10.4.7.2 Local APIC State After It Has Been Software Disabled ... * The mask bits for all the LVT entries are set. Attempts to reset these bits will be ignored." And this happens too: [ 0.124111] APIC: Switch to symmetric I/O mode setup [ 0.124117] lvtthmr 0x200 before write 0xf to APIC 0xf0 [ 0.124118] lvtthmr 0x10200 after write 0xf to APIC 0xf0 This results in CPU 0 soft lockups depending on the placement in time when the APIC soft-disable happens. Those soft lockups are not 100% reproducible and the reason for that can only be speculated as no one tells you what SMM does. Likely, it confuses the SMM code that the APIC is disabled and the thermal interrupt doesn't doesn't fire at all, leading to CPU 0 stuck in SMM forever... Now, before 4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()") due to how the APIC_LVTTHMR was read before APIC initialization in mcheck_intel_therm_init(), it would read the value with the mask bit 16 clear and then intel_init_thermal() would replicate it onto the APs and all would be peachy - the thermal interrupt would remain enabled. But that commit moved that reading to a later moment in intel_init_thermal(), resulting in reading APIC_LVTTHMR on the BSP too late and with its interrupt mask bit set. Thus, revert back to the old behavior of reading the thermal LVT register before the APIC gets initialized. Fixes: 4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()") Reported-by: James Feeney Signed-off-by: Borislav Petkov Cc: Cc: Zhang Rui Cc: Srinivas Pandruvada Link: https://lkml.kernel.org/r/YKIqDdFNaXYd39wz@zn.tnic --- arch/x86/include/asm/thermal.h | 4 +++- arch/x86/kernel/setup.c | 9 +++++++++ drivers/thermal/intel/therm_throt.c | 15 +++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h index ddbdefd5b94f..91a7b6687c3b 100644 --- a/arch/x86/include/asm/thermal.h +++ b/arch/x86/include/asm/thermal.h @@ -3,11 +3,13 @@ #define _ASM_X86_THERMAL_H #ifdef CONFIG_X86_THERMAL_VECTOR +void therm_lvt_init(void); void intel_init_thermal(struct cpuinfo_x86 *c); bool x86_thermal_enabled(void); void intel_thermal_interrupt(void); #else -static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } +static inline void therm_lvt_init(void) { } +static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } #endif #endif /* _ASM_X86_THERMAL_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 72920af0b3c0..ff653d608d5f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -1226,6 +1227,14 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); + /* + * This needs to run before setup_local_APIC() which soft-disables the + * local APIC temporarily and that masks the thermal LVT interrupt, + * leading to softlockups on machines which have configured SMI + * interrupt delivery. + */ + therm_lvt_init(); + mcheck_init(); register_refined_jiffies(CLOCK_TICK_RATE); diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index f8e882592ba5..99abdc03c44c 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -621,6 +621,17 @@ bool x86_thermal_enabled(void) return atomic_read(&therm_throt_en); } +void __init therm_lvt_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (intel_thermal_supported(&boot_cpu_data)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -630,10 +641,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if (!intel_thermal_supported(c)) return; - /* On the BSP? */ - if (c == &boot_cpu_data) - lvtthmr_init = apic_read(APIC_LVTTHMR); - /* * First check if its enabled already, in which case there might * be some SMM goo which handles it, so we can't even put a handler -- cgit v1.2.3 From 59cc84c802eb923805e7bba425976a3df5ce35d8 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Wed, 26 May 2021 16:45:40 +0200 Subject: Revert "powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE() to save TCEs" This reverts commit 3c0468d4451eb6b4f6604370639f163f9637a479. That commit was breaking alignment guarantees for the DMA address when allocating coherent mappings, as described in Documentation/core-api/dma-api-howto.rst It was also noticed by Mellanox' driver: [ 1515.763621] mlx5_core c002:01:00.0: mlx5_frag_buf_alloc_node:146:(pid 13402): unexpected map alignment: 0x0800000000c61000, page_shift=16 [ 1515.763635] mlx5_core c002:01:00.0: mlx5_cqwq_create:181:(pid 13402): mlx5_frag_buf_alloc_node() failed, -12 Fixes: 3c0468d4451e ("powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE() to save TCEs") Signed-off-by: Frederic Barrat Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210526144540.117795-1-fbarrat@linux.ibm.com --- arch/powerpc/kernel/iommu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 57d6b85e9b96..2af89a5e379f 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -898,7 +898,6 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, unsigned int order; unsigned int nio_pages, io_order; struct page *page; - size_t size_io = size; size = PAGE_ALIGN(size); order = get_order(size); @@ -925,9 +924,8 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, memset(ret, 0, size); /* Set up tces to cover the allocated range */ - size_io = IOMMU_PAGE_ALIGN(size_io, tbl); - nio_pages = size_io >> tbl->it_page_shift; - io_order = get_iommu_order(size_io, tbl); + nio_pages = size >> tbl->it_page_shift; + io_order = get_iommu_order(size, tbl); mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, mask >> tbl->it_page_shift, io_order, 0); if (mapping == DMA_MAPPING_ERROR) { @@ -942,9 +940,10 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, void *vaddr, dma_addr_t dma_handle) { if (tbl) { - size_t size_io = IOMMU_PAGE_ALIGN(size, tbl); - unsigned int nio_pages = size_io >> tbl->it_page_shift; + unsigned int nio_pages; + size = PAGE_ALIGN(size); + nio_pages = size >> tbl->it_page_shift; iommu_free(tbl, dma_handle, nio_pages); size = PAGE_ALIGN(size); free_pages((unsigned long)vaddr, get_order(size)); -- cgit v1.2.3 From 848ff3768684701a4ce73a2ec0e5d438d4e2b0da Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 1 Jun 2021 06:09:03 -0700 Subject: perf/x86/intel/uncore: Fix M2M event umask for Ice Lake server Perf tool errors out with the latest event list for the Ice Lake server. event syntax error: 'unc_m2m_imc_reads.to_pmm' \___ value too big for format, maximum is 255 The same as the Snow Ridge server, the M2M uncore unit in the Ice Lake server has the unit mask extension field as well. Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Reported-by: Jin Yao Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1622552943-119174-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 1587d3289743..3a75a2c601c2 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5099,9 +5099,10 @@ static struct intel_uncore_type icx_uncore_m2m = { .perf_ctr = SNR_M2M_PCI_PMON_CTR0, .event_ctl = SNR_M2M_PCI_PMON_CTL0, .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT, .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL, .ops = &snr_m2m_uncore_pci_ops, - .format_group = &skx_uncore_format_group, + .format_group = &snr_m2m_uncore_format_group, }; static struct attribute *icx_upi_uncore_formats_attr[] = { -- cgit v1.2.3 From 8a4102a0cf07cc76a18f373f6b49485258cc6af4 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 16 May 2021 17:00:38 +0800 Subject: riscv: mm: Fix W+X mappings at boot When the kernel mapping was moved the last 2GB of the address space, (__va(PFN_PHYS(max_low_pfn))) is much smaller than the .data section start address, the last set_memory_nx() in protect_kernel_text_data() will fail, thus the .data section is still mapped as W+X. This results in below W+X mapping waring at boot. Fix it by passing the correct .data section page num to the set_memory_nx(). [ 0.396516] ------------[ cut here ]------------ [ 0.396889] riscv/mm: Found insecure W+X mapping at address (____ptrval____)/0xffffffff80c00000 [ 0.398347] WARNING: CPU: 0 PID: 1 at arch/riscv/mm/ptdump.c:258 note_page+0x244/0x24a [ 0.398964] Modules linked in: [ 0.399459] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc1+ #14 [ 0.400003] Hardware name: riscv-virtio,qemu (DT) [ 0.400591] epc : note_page+0x244/0x24a [ 0.401368] ra : note_page+0x244/0x24a [ 0.401772] epc : ffffffff80007c86 ra : ffffffff80007c86 sp : ffffffe000e7bc30 [ 0.402304] gp : ffffffff80caae88 tp : ffffffe000e70000 t0 : ffffffff80cb80cf [ 0.402800] t1 : ffffffff80cb80c0 t2 : 0000000000000000 s0 : ffffffe000e7bc80 [ 0.403310] s1 : ffffffe000e7bde8 a0 : 0000000000000053 a1 : ffffffff80c83ff0 [ 0.403805] a2 : 0000000000000010 a3 : 0000000000000000 a4 : 6c7e7a5137233100 [ 0.404298] a5 : 6c7e7a5137233100 a6 : 0000000000000030 a7 : ffffffffffffffff [ 0.404849] s2 : ffffffff80e00000 s3 : 0000000040000000 s4 : 0000000000000000 [ 0.405393] s5 : 0000000000000000 s6 : 0000000000000003 s7 : ffffffe000e7bd48 [ 0.405935] s8 : ffffffff81000000 s9 : ffffffffc0000000 s10: ffffffe000e7bd48 [ 0.406476] s11: 0000000000001000 t3 : 0000000000000072 t4 : ffffffffffffffff [ 0.407016] t5 : 0000000000000002 t6 : ffffffe000e7b978 [ 0.407435] status: 0000000000000120 badaddr: 0000000000000000 cause: 0000000000000003 [ 0.408052] Call Trace: [ 0.408343] [] note_page+0x244/0x24a [ 0.408855] [] ptdump_hole+0x14/0x1e [ 0.409263] [] walk_pgd_range+0x2a0/0x376 [ 0.409690] [] walk_page_range_novma+0x4e/0x6e [ 0.410146] [] ptdump_walk_pgd+0x48/0x78 [ 0.410570] [] ptdump_check_wx+0xb4/0xf8 [ 0.410990] [] mark_rodata_ro+0x26/0x2e [ 0.411407] [] kernel_init+0x44/0x108 [ 0.411814] [] ret_from_exception+0x0/0xc [ 0.412309] ---[ end trace 7ec3459f2547ea83 ]--- [ 0.413141] Checked W+X mappings: failed, 512 W+X pages found Fixes: 2bfc6cd81bd17e43 ("riscv: Move kernel mapping outside of linear mapping") Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 4faf8bd157ea..4c4c92ce0bb8 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -746,14 +746,18 @@ void __init protect_kernel_text_data(void) unsigned long init_data_start = (unsigned long)__init_data_begin; unsigned long rodata_start = (unsigned long)__start_rodata; unsigned long data_start = (unsigned long)_data; - unsigned long max_low = (unsigned long)(__va(PFN_PHYS(max_low_pfn))); +#if defined(CONFIG_64BIT) && defined(CONFIG_MMU) + unsigned long end_va = kernel_virt_addr + load_sz; +#else + unsigned long end_va = (unsigned long)(__va(PFN_PHYS(max_low_pfn))); +#endif set_memory_ro(text_start, (init_text_start - text_start) >> PAGE_SHIFT); set_memory_ro(init_text_start, (init_data_start - init_text_start) >> PAGE_SHIFT); set_memory_nx(init_data_start, (rodata_start - init_data_start) >> PAGE_SHIFT); /* rodata section is marked readonly in mark_rodata_ro */ set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); - set_memory_nx(data_start, (max_low - data_start) >> PAGE_SHIFT); + set_memory_nx(data_start, (end_va - data_start) >> PAGE_SHIFT); } void mark_rodata_ro(void) -- cgit v1.2.3 From b75db25c416b9f0edae7cd86c4901c216a52e7a0 Mon Sep 17 00:00:00 2001 From: Vincent Date: Sat, 22 May 2021 07:40:15 +0800 Subject: riscv: skip errata_cip_453.o if CONFIG_ERRATA_SIFIVE_CIP_453 is disabled The errata_cip_453.o should be built only when the Kconfig CONFIG_ERRATA_SIFIVE_CIP_453 is enabled. Reported-by: kernel test robot Signed-off-by: Vincent Fixes: 0e0d4992517f ("riscv: enable SiFive errata CIP-453 and CIP-1200 Kconfig only if CONFIG_64BIT=y") Signed-off-by: Palmer Dabbelt --- arch/riscv/errata/sifive/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/errata/sifive/Makefile b/arch/riscv/errata/sifive/Makefile index bdd5fc843b8e..2fde48db0619 100644 --- a/arch/riscv/errata/sifive/Makefile +++ b/arch/riscv/errata/sifive/Makefile @@ -1,2 +1,2 @@ -obj-y += errata_cip_453.o +obj-$(CONFIG_ERRATA_SIFIVE_CIP_453) += errata_cip_453.o obj-y += errata.o -- cgit v1.2.3 From da2d48808fbd1eddefefe245c6c0e92a9195df8b Mon Sep 17 00:00:00 2001 From: Wende Tan Date: Sat, 22 May 2021 17:49:51 +0000 Subject: RISC-V: Fix memblock_free() usages in init_resources() `memblock_free()` takes a physical address as its first argument. Fix the wrong usages in `init_resources()`. Fixes: ffe0e526126884cf036a6f724220f1f9b4094fd2 ("RISC-V: Improve init_resources()") Fixes: 797f0375dd2ef5cdc68ac23450cbae9a5c67a74e ("RISC-V: Do not allocate memblock while iterating reserved memblocks") Signed-off-by: Wende Tan Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 03901d3a8b02..9a1b7a0603b2 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -231,13 +231,13 @@ static void __init init_resources(void) /* Clean-up any unused pre-allocated resources */ mem_res_sz = (num_resources - res_idx + 1) * sizeof(*mem_res); - memblock_free((phys_addr_t) mem_res, mem_res_sz); + memblock_free(__pa(mem_res), mem_res_sz); return; error: /* Better an empty resource tree than an inconsistent one */ release_child_resources(&iomem_resource); - memblock_free((phys_addr_t) mem_res, mem_res_sz); + memblock_free(__pa(mem_res), mem_res_sz); } -- cgit v1.2.3 From d94b93a9101573eb75b819dee94b1417acff631b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Dec 2020 16:54:56 +0100 Subject: ARM: cpuidle: Avoid orphan section warning Since commit 83109d5d5fba ("x86/build: Warn on orphan section placement"), we get a warning for objects in orphan sections. The cpuidle implementation for OMAP causes this when CONFIG_CPU_IDLE is disabled: arm-linux-gnueabi-ld: warning: orphan section `__cpuidle_method_of_table' from `arch/arm/mach-omap2/pm33xx-core.o' being placed in section `__cpuidle_method_of_table' arm-linux-gnueabi-ld: warning: orphan section `__cpuidle_method_of_table' from `arch/arm/mach-omap2/pm33xx-core.o' being placed in section `__cpuidle_method_of_table' arm-linux-gnueabi-ld: warning: orphan section `__cpuidle_method_of_table' from `arch/arm/mach-omap2/pm33xx-core.o' being placed in section `__cpuidle_method_of_table' Change the definition of CPUIDLE_METHOD_OF_DECLARE() to silently drop the table and all code referenced from it when CONFIG_CPU_IDLE is disabled. Fixes: 06ee7a950b6a ("ARM: OMAP2+: pm33xx-core: Add cpuidle_ops for am335x/am437x") Signed-off-by: Arnd Bergmann Reviewed-by: Miguel Ojeda Reviewed-by: Nick Desaulniers Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201230155506.1085689-1-arnd@kernel.org --- arch/arm/include/asm/cpuidle.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/cpuidle.h b/arch/arm/include/asm/cpuidle.h index 0d67ed682e07..bc4ffa7ca04c 100644 --- a/arch/arm/include/asm/cpuidle.h +++ b/arch/arm/include/asm/cpuidle.h @@ -7,9 +7,11 @@ #ifdef CONFIG_CPU_IDLE extern int arm_cpuidle_simple_enter(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); +#define __cpuidle_method_section __used __section("__cpuidle_method_of_table") #else static inline int arm_cpuidle_simple_enter(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { return -ENODEV; } +#define __cpuidle_method_section __maybe_unused /* drop silently */ #endif /* Common ARM WFI state */ @@ -42,8 +44,7 @@ struct of_cpuidle_method { #define CPUIDLE_METHOD_OF_DECLARE(name, _method, _ops) \ static const struct of_cpuidle_method __cpuidle_method_of_table_##name \ - __used __section("__cpuidle_method_of_table") \ - = { .method = _method, .ops = _ops } + __cpuidle_method_section = { .method = _method, .ops = _ops } extern int arm_cpuidle_suspend(int index); -- cgit v1.2.3 From 9bfecd05833918526cc7357d55e393393440c5fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 May 2021 11:17:30 +0200 Subject: x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid() While digesting the XSAVE-related horrors which got introduced with the supervisor/user split, the recent addition of ENQCMD-related functionality got on the radar and turned out to be similarly broken. update_pasid(), which is only required when X86_FEATURE_ENQCMD is available, is invoked from two places: 1) From switch_to() for the incoming task 2) Via a SMP function call from the IOMMU/SMV code #1 is half-ways correct as it hacks around the brokenness of get_xsave_addr() by enforcing the state to be 'present', but all the conditionals in that code are completely pointless for that. Also the invocation is just useless overhead because at that point it's guaranteed that TIF_NEED_FPU_LOAD is set on the incoming task and all of this can be handled at return to user space. #2 is broken beyond repair. The comment in the code claims that it is safe to invoke this in an IPI, but that's just wishful thinking. FPU state of a running task is protected by fregs_lock() which is nothing else than a local_bh_disable(). As BH-disabled regions run usually with interrupts enabled the IPI can hit a code section which modifies FPU state and there is absolutely no guarantee that any of the assumptions which are made for the IPI case is true. Also the IPI is sent to all CPUs in mm_cpumask(mm), but the IPI is invoked with a NULL pointer argument, so it can hit a completely unrelated task and unconditionally force an update for nothing. Worse, it can hit a kernel thread which operates on a user space address space and set a random PASID for it. The offending commit does not cleanly revert, but it's sufficient to force disable X86_FEATURE_ENQCMD and to remove the broken update_pasid() code to make this dysfunctional all over the place. Anything more complex would require more surgery and none of the related functions outside of the x86 core code are blatantly wrong, so removing those would be overkill. As nothing enables the PASID bit in the IA32_XSS MSR yet, which is required to make this actually work, this cannot result in a regression except for related out of tree train-wrecks, but they are broken already today. Fixes: 20f0afd1fb3d ("x86/mmu: Allocate/free a PASID") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87mtsd6gr9.ffs@nanos.tec.linutronix.de --- arch/x86/include/asm/disabled-features.h | 7 ++-- arch/x86/include/asm/fpu/api.h | 6 +--- arch/x86/include/asm/fpu/internal.h | 7 ---- arch/x86/kernel/fpu/xstate.c | 57 -------------------------------- 4 files changed, 3 insertions(+), 74 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index b7dd944dc867..8f28fafa98b3 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -56,11 +56,8 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_IOMMU_SUPPORT -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif +/* Force disable because it's broken beyond repair */ +#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) #ifdef CONFIG_X86_SGX # define DISABLE_SGX 0 diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index ed33a14188f6..23bef08a8388 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -106,10 +106,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); */ #define PASID_DISABLED 0 -#ifdef CONFIG_IOMMU_SUPPORT -/* Update current's PASID MSR/state by mm's PASID. */ -void update_pasid(void); -#else static inline void update_pasid(void) { } -#endif + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8d33ad80704f..ceeba9f63172 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -584,13 +584,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) pkru_val = pk->pkru; } __write_pkru(pkru_val); - - /* - * Expensive PASID MSR write will be avoided in update_pasid() because - * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated - * unless it's different from mm->pasid to reduce overhead. - */ - update_pasid(); } #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a85c64000218..d0eef963aad1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ - -#ifdef CONFIG_IOMMU_SUPPORT -void update_pasid(void) -{ - u64 pasid_state; - u32 pasid; - - if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) - return; - - if (!current->mm) - return; - - pasid = READ_ONCE(current->mm->pasid); - /* Set the valid bit in the PASID MSR/state only for valid pasid. */ - pasid_state = pasid == PASID_DISABLED ? - pasid : pasid | MSR_IA32_PASID_VALID; - - /* - * No need to hold fregs_lock() since the task's fpstate won't - * be changed by others (e.g. ptrace) while the task is being - * switched to or is in IPI. - */ - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - /* The MSR is active and can be directly updated. */ - wrmsrl(MSR_IA32_PASID, pasid_state); - } else { - struct fpu *fpu = ¤t->thread.fpu; - struct ia32_pasid_state *ppasid_state; - struct xregs_state *xsave; - - /* - * The CPU's xstate registers are not currently active. Just - * update the PASID state in the memory buffer here. The - * PASID MSR will be loaded when returning to user mode. - */ - xsave = &fpu->state.xsave; - xsave->header.xfeatures |= XFEATURE_MASK_PASID; - ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID); - /* - * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state - * won't be NULL and no need to check its value. - * - * Only update the task's PASID state when it's different - * from the mm's pasid. - */ - if (ppasid_state->pasid != pasid_state) { - /* - * Invalid fpregs so that state restoring will pick up - * the PASID state. - */ - __fpu_invalidate_fpregs_state(fpu); - ppasid_state->pasid = pasid_state; - } - } -} -#endif /* CONFIG_IOMMU_SUPPORT */ -- cgit v1.2.3 From 2b31e8ed96b260ce2c22bd62ecbb9458399e3b62 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 1 Jun 2021 17:51:22 +0200 Subject: x86/alternative: Optimize single-byte NOPs at an arbitrary position Up until now the assumption was that an alternative patching site would have some instructions at the beginning and trailing single-byte NOPs (0x90) padding. Therefore, the patching machinery would go and optimize those single-byte NOPs into longer ones. However, this assumption is broken on 32-bit when code like hv_do_hypercall() in hyperv_init() would use the ratpoline speculation killer CALL_NOSPEC. The 32-bit version of that macro would align certain insns to 16 bytes, leading to the compiler issuing a one or more single-byte NOPs, depending on the holes it needs to fill for alignment. That would lead to the warning in optimize_nops() to fire: ------------[ cut here ]------------ Not a NOP at 0xc27fb598 WARNING: CPU: 0 PID: 0 at arch/x86/kernel/alternative.c:211 optimize_nops.isra.13 due to that function verifying whether all of the following bytes really are single-byte NOPs. Therefore, carve out the NOP padding into a separate function and call it for each NOP range beginning with a single-byte NOP. Fixes: 23c1ad538f4f ("x86/alternatives: Optimize optimize_nops()") Reported-by: Richard Narron Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://bugzilla.kernel.org/show_bug.cgi?id=213301 Link: https://lkml.kernel.org/r/20210601212125.17145-1-bp@alien8.de --- arch/x86/kernel/alternative.c | 64 +++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6974b5174495..6fe5b44fcbc9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -182,42 +182,70 @@ done: n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); } +/* + * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * + * @instr: instruction byte stream + * @instrlen: length of the above + * @off: offset within @instr where the first NOP has been detected + * + * Return: number of NOPs found (and replaced). + */ +static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) +{ + unsigned long flags; + int i = off, nnops; + + while (i < instrlen) { + if (instr[i] != 0x90) + break; + + i++; + } + + nnops = i - off; + + if (nnops <= 1) + return nnops; + + local_irq_save(flags); + add_nops(instr + off, nnops); + local_irq_restore(flags); + + DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + + return nnops; +} + /* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { - unsigned long flags; struct insn insn; - int nop, i = 0; + int i = 0; /* - * Jump over the non-NOP insns, the remaining bytes must be single-byte - * NOPs, optimize them. + * Jump over the non-NOP insns and optimize single-byte NOPs into bigger + * ones. */ for (;;) { if (insn_decode_kernel(&insn, &instr[i])) return; + /* + * See if this and any potentially following NOPs can be + * optimized. + */ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - break; - - if ((i += insn.length) >= a->instrlen) - return; - } + i += optimize_nops_range(instr, a->instrlen, i); + else + i += insn.length; - for (nop = i; i < a->instrlen; i++) { - if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i])) + if (i >= a->instrlen) return; } - - local_irq_save(flags); - add_nops(instr + nop, i - nop); - local_irq_restore(flags); - - DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, nop, a->instrlen); } /* -- cgit v1.2.3 From f1d4d47c5851b348b7713007e152bc68b94d728b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 1 Jun 2021 10:53:52 +0300 Subject: x86/setup: Always reserve the first 1M of RAM There are BIOSes that are known to corrupt the memory under 1M, or more precisely under 640K because the memory above 640K is anyway reserved for the EGA/VGA frame buffer and BIOS. To prevent usage of the memory that will be potentially clobbered by the kernel, the beginning of the memory is always reserved. The exact size of the reserved area is determined by CONFIG_X86_RESERVE_LOW build time and the "reservelow=" command line option. The reserved range may be from 4K to 640K with the default of 64K. There are also configurations that reserve the entire 1M range, like machines with SandyBridge graphic devices or systems that enable crash kernel. In addition to the potentially clobbered memory, EBDA of unknown size may be as low as 128K and the memory above that EBDA start is also reserved early. It would have been possible to reserve the entire range under 1M unless for the real mode trampoline that must reside in that area. To accommodate placement of the real mode trampoline and keep the memory safe from being clobbered by BIOS, reserve the first 64K of RAM before memory allocations are possible and then, after the real mode trampoline is allocated, reserve the entire range from 0 to 1M. Update trim_snb_memory() and reserve_real_mode() to avoid redundant reservations of the same memory range. Also make sure the memory under 1M is not getting freed by efi_free_boot_services(). [ bp: Massage commit message and comments. ] Fixes: a799c2bd29d1 ("x86/setup: Consolidate early memory reservations") Signed-off-by: Mike Rapoport Signed-off-by: Borislav Petkov Tested-by: Hugh Dickins Link: https://bugzilla.kernel.org/show_bug.cgi?id=213177 Link: https://lkml.kernel.org/r/20210601075354.5149-2-rppt@kernel.org --- arch/x86/kernel/setup.c | 35 +++++++++++++++++++++-------------- arch/x86/platform/efi/quirks.c | 12 ++++++++++++ arch/x86/realmode/init.c | 14 ++++++++------ 3 files changed, 41 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ff653d608d5f..1e720626069a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -638,11 +638,11 @@ static void __init trim_snb_memory(void) * them from accessing certain memory ranges, namely anything below * 1M and in the pages listed in bad_pages[] above. * - * To avoid these pages being ever accessed by SNB gfx devices - * reserve all memory below the 1 MB mark and bad_pages that have - * not already been reserved at boot time. + * To avoid these pages being ever accessed by SNB gfx devices reserve + * bad_pages that have not already been reserved at boot time. + * All memory below the 1 MB mark is anyway reserved later during + * setup_arch(), so there is no need to reserve it here. */ - memblock_reserve(0, 1<<20); for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) @@ -734,14 +734,14 @@ static void __init early_reserve_memory(void) * The first 4Kb of memory is a BIOS owned area, but generally it is * not listed as such in the E820 table. * - * Reserve the first memory page and typically some additional - * memory (64KiB by default) since some BIOSes are known to corrupt - * low memory. See the Kconfig help text for X86_RESERVE_LOW. + * Reserve the first 64K of memory since some BIOSes are known to + * corrupt low memory. After the real mode trampoline is allocated the + * rest of the memory below 640k is reserved. * * In addition, make sure page 0 is always reserved because on * systems with L1TF its contents can be leaked to user processes. */ - memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + memblock_reserve(0, SZ_64K); early_reserve_initrd(); @@ -752,6 +752,7 @@ static void __init early_reserve_memory(void) reserve_ibft_region(); reserve_bios_regions(); + trim_snb_memory(); } /* @@ -1082,14 +1083,20 @@ void __init setup_arch(char **cmdline_p) (max_pfn_mapped< Date: Tue, 1 Jun 2021 16:52:03 +0800 Subject: x86/fault: Don't send SIGSEGV twice on SEGV_PKUERR __bad_area_nosemaphore() calls both force_sig_pkuerr() and force_sig_fault() when handling SEGV_PKUERR. This does not cause problems because the second signal is filtered by the legacy_queue() check in __send_signal() because in both cases, the signal is SIGSEGV, the second one seeing that the first one is already pending. This causes the kernel to do unnecessary work so send the signal only once for SEGV_PKUERR. [ bp: Massage commit message. ] Fixes: 9db812dbb29d ("signal/x86: Call force_sig_pkuerr from __bad_area_nosemaphore") Suggested-by: "Eric W. Biederman" Signed-off-by: Jiashuo Liang Signed-off-by: Borislav Petkov Acked-by: "Eric W. Biederman" Link: https://lkml.kernel.org/r/20210601085203.40214-1-liangjs@pku.edu.cn --- arch/x86/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1c548ad00752..6bda7f67d737 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -836,8 +836,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); - - force_sig_fault(SIGSEGV, si_code, (void __user *)address); + else + force_sig_fault(SIGSEGV, si_code, (void __user *)address); local_irq_disable(); } -- cgit v1.2.3 From 009767dbf42ac0dbe3cf48c1ee224f6b778aa85a Mon Sep 17 00:00:00 2001 From: Pu Wen Date: Wed, 2 Jun 2021 15:02:07 +0800 Subject: x86/sev: Check SME/SEV support in CPUID first The first two bits of the CPUID leaf 0x8000001F EAX indicate whether SEV or SME is supported, respectively. It's better to check whether SEV or SME is actually supported before accessing the MSR_AMD64_SEV to check whether SEV or SME is enabled. This is both a bare-metal issue and a guest/VM issue. Since the first generation Hygon Dhyana CPU doesn't support the MSR_AMD64_SEV, reading that MSR results in a #GP - either directly from hardware in the bare-metal case or via the hypervisor (because the RDMSR is actually intercepted) in the guest/VM case, resulting in a failed boot. And since this is very early in the boot phase, rdmsrl_safe()/native_read_msr_safe() can't be used. So check the CPUID bits first, before accessing the MSR. [ tlendacky: Expand and improve commit message. ] [ bp: Massage commit message. ] Fixes: eab696d8e8b9 ("x86/sev: Do not require Hypervisor CPUID bit for SEV guests") Signed-off-by: Pu Wen Signed-off-by: Borislav Petkov Acked-by: Tom Lendacky Cc: # v5.10+ Link: https://lkml.kernel.org/r/20210602070207.2480-1-puwen@hygon.cn --- arch/x86/mm/mem_encrypt_identity.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index a9639f663d25..470b20208430 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -504,10 +504,6 @@ void __init sme_enable(struct boot_params *bp) #define AMD_SME_BIT BIT(0) #define AMD_SEV_BIT BIT(1) - /* Check the SEV MSR whether SEV or SME is enabled */ - sev_status = __rdmsr(MSR_AMD64_SEV); - feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; - /* * Check for the SME/SEV feature: * CPUID Fn8000_001F[EAX] @@ -519,11 +515,16 @@ void __init sme_enable(struct boot_params *bp) eax = 0x8000001f; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & feature_mask)) + /* Check whether SEV or SME is supported */ + if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT))) return; me_mask = 1UL << (ebx & 0x3f); + /* Check the SEV MSR whether SEV or SME is enabled */ + sev_status = __rdmsr(MSR_AMD64_SEV); + feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { /* -- cgit v1.2.3 From 50c25ee97cf6ab011542167ab590c17012cea4ed Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 4 Jun 2021 20:01:08 -0700 Subject: Revert "MIPS: make userspace mapping young by default" This reverts commit f685a533a7fab35c5d069dcd663f59c8e4171a75. The MIPS cache flush logic needs to know whether the mapping was already established to decide how to flush caches. This is done by checking the valid bit in the PTE. The commit above breaks this logic by setting the valid in the PTE in new mappings, which causes kernel crashes. Link: https://lkml.kernel.org/r/20210526094335.92948-1-tsbogend@alpha.franken.de Fixes: f685a533a7f ("MIPS: make userspace mapping young by default") Reported-by: Zhou Yanjie Signed-off-by: Thomas Bogendoerfer Cc: Huang Pei Cc: Nicholas Piggin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/cache.c | 30 ++++++++++++++---------------- include/linux/pgtable.h | 8 ++++++++ mm/memory.c | 4 ++++ 3 files changed, 26 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index a7bf0c80371c..830ab91e574f 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -158,31 +158,29 @@ unsigned long _page_cachable_default; EXPORT_SYMBOL(_page_cachable_default); #define PM(p) __pgprot(_page_cachable_default | (p)) -#define PVA(p) PM(_PAGE_VALID | _PAGE_ACCESSED | (p)) static inline void setup_protection_map(void) { protection_map[0] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[1] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[2] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[3] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[4] = PVA(_PAGE_PRESENT); - protection_map[5] = PVA(_PAGE_PRESENT); - protection_map[6] = PVA(_PAGE_PRESENT); - protection_map[7] = PVA(_PAGE_PRESENT); + protection_map[1] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[2] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[3] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[4] = PM(_PAGE_PRESENT); + protection_map[5] = PM(_PAGE_PRESENT); + protection_map[6] = PM(_PAGE_PRESENT); + protection_map[7] = PM(_PAGE_PRESENT); protection_map[8] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[9] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[10] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | + protection_map[9] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | _PAGE_NO_READ); - protection_map[11] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); - protection_map[12] = PVA(_PAGE_PRESENT); - protection_map[13] = PVA(_PAGE_PRESENT); - protection_map[14] = PVA(_PAGE_PRESENT); - protection_map[15] = PVA(_PAGE_PRESENT); + protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); + protection_map[12] = PM(_PAGE_PRESENT); + protection_map[13] = PM(_PAGE_PRESENT); + protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE); + protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE); } -#undef _PVA #undef PM void cpu_cache_init(void) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 46b13780c2c8..a43047b1030d 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -432,6 +432,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ +#ifndef pte_sw_mkyoung +static inline pte_t pte_sw_mkyoung(pte_t pte) +{ + return pte; +} +#define pte_sw_mkyoung pte_sw_mkyoung +#endif + #ifndef pte_savedwrite #define pte_savedwrite pte_write #endif diff --git a/mm/memory.c b/mm/memory.c index 730daa00952b..f3ffab9b9e39 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2939,6 +2939,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -3602,6 +3603,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -3786,6 +3788,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); -- cgit v1.2.3 From 4f13d471e5d11034d56161af56d0f9396bc0b384 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 7 Jun 2021 06:15:32 +0000 Subject: KVM: SVM: Fix SEV SEND_START session length & SEND_UPDATE_DATA query length after commit 238eca821cee Commit 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") uses the local stack to allocate the structures used to communicate with the PSP, which were earlier being kzalloced. This breaks SEV live migration for computing the SEND_START session length and SEND_UPDATE_DATA query length as session_len and trans_len and hdr_len fields are not zeroed respectively for the above commands before issuing the SEV Firmware API call, hence the firmware returns incorrect session length and update data header or trans length. Also the SEV Firmware API returns SEV_RET_INVALID_LEN firmware error for these length query API calls, and the return value and the firmware error needs to be passed to the userspace as it is, so need to remove the return check in the KVM code. Signed-off-by: Ashish Kalra Message-Id: <20210607061532.27459-1-Ashish.Kalra@amd.com> Fixes: 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5bc887e9a986..e0ce5da97fc2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1103,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_start data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (ret < 0) - return ret; params->session_len = data.session_len; if (copy_to_user((void __user *)(uintptr_t)argp->data, params, @@ -1215,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_update_data data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); - if (ret < 0) - return ret; params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; -- cgit v1.2.3 From e898da784aed0ea65f7672d941c01dc9b79e6299 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 7 Jun 2021 00:19:43 -0700 Subject: KVM: LAPIC: Write 0 to TMICT should also cancel vmx-preemption timer According to the SDM 10.5.4.1: A write of 0 to the initial-count register effectively stops the local APIC timer, in both one-shot and periodic mode. However, the lapic timer oneshot/periodic mode which is emulated by vmx-preemption timer doesn't stop by writing 0 to TMICT since vmx->hv_deadline_tsc is still programmed and the guest will receive the spurious timer interrupt later. This patch fixes it by also cancelling the vmx-preemption timer when writing 0 to the initial-count register. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1623050385-100988-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8120e8614b92..6d72d8f43310 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) static void cancel_hv_timer(struct kvm_lapic *apic); +static void cancel_apic_timer(struct kvm_lapic *apic) +{ + hrtimer_cancel(&apic->lapic_timer.timer); + preempt_disable(); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + preempt_enable(); +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { - hrtimer_cancel(&apic->lapic_timer.timer); - preempt_disable(); - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - preempt_enable(); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; @@ -2092,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) if (apic_lvtt_tscdeadline(apic)) break; - hrtimer_cancel(&apic->lapic_timer.timer); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; -- cgit v1.2.3 From b1bd5cba3306691c771d558e94baa73e8b0b96b7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 3 Jun 2021 13:24:55 +0800 Subject: KVM: X86: MMU: Use the correct inherited permissions to get shadow page When computing the access permissions of a shadow page, use the effective permissions of the walk up to that point, i.e. the logic AND of its parents' permissions. Two guest PxE entries that point at the same table gfn need to be shadowed with different shadow pages if their parents' permissions are different. KVM currently uses the effective permissions of the last non-leaf entry for all non-leaf entries. Because all non-leaf SPTEs have full ("uwx") permissions, and the effective permissions are recorded only in role.access and merged into the leaves, this can lead to incorrect reuse of a shadow page and eventually to a missing guest protection page fault. For example, here is a shared pagetable: pgd[] pud[] pmd[] virtual address pointers /->pmd1(u--)->pte1(uw-)->page1 <- ptr1 (u--) /->pud1(uw-)--->pmd2(uw-)->pte2(uw-)->page2 <- ptr2 (uw-) pgd-| (shared pmd[] as above) \->pud2(u--)--->pmd1(u--)->pte1(uw-)->page1 <- ptr3 (u--) \->pmd2(uw-)->pte2(uw-)->page2 <- ptr4 (u--) pud1 and pud2 point to the same pmd table, so: - ptr1 and ptr3 points to the same page. - ptr2 and ptr4 points to the same page. (pud1 and pud2 here are pud entries, while pmd1 and pmd2 here are pmd entries) - First, the guest reads from ptr1 first and KVM prepares a shadow page table with role.access=u--, from ptr1's pud1 and ptr1's pmd1. "u--" comes from the effective permissions of pgd, pud1 and pmd1, which are stored in pt->access. "u--" is used also to get the pagetable for pud1, instead of "uw-". - Then the guest writes to ptr2 and KVM reuses pud1 which is present. The hypervisor set up a shadow page for ptr2 with pt->access is "uw-" even though the pud1 pmd (because of the incorrect argument to kvm_mmu_get_page in the previous step) has role.access="u--". - Then the guest reads from ptr3. The hypervisor reuses pud1's shadow pmd for pud2, because both use "u--" for their permissions. Thus, the shadow pmd already includes entries for both pmd1 and pmd2. - At last, the guest writes to ptr4. This causes no vmexit or pagefault, because pud1's shadow page structures included an "uw-" page even though its role.access was "u--". Any kind of shared pagetable might have the similar problem when in virtual machine without TDP enabled if the permissions are different from different ancestors. In order to fix the problem, we change pt->access to be an array, and any access in it will not include permissions ANDed from child ptes. The test code is: https://lore.kernel.org/kvm/20210603050537.19605-1-jiangshanlai@gmail.com/ Remember to test it with TDP disabled. The problem had existed long before the commit 41074d07c78b ("KVM: MMU: Fix inherited permissions for emulated guest pte updates"), and it is hard to find which is the culprit. So there is no fixes tag here. Signed-off-by: Lai Jiangshan Message-Id: <20210603052455.21023-1-jiangshanlai@gmail.com> Cc: stable@vger.kernel.org Fixes: cea0f0e7ea54 ("[PATCH] KVM: MMU: Shadow page table caching") Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/mmu.rst | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 5bfe28b0728e..20d85daed395 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -171,8 +171,8 @@ Shadow pages contain the following information: shadow pages) so role.quadrant takes values in the range 0..3. Each quadrant maps 1GB virtual address space. role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. + Inherited guest access permissions from the parent ptes in the form uwx. + Note execute permission is positive, not negative. role.invalid: The page is invalid and should not be used. It is a root page that is currently pinned (by a cpu hardware register pointing to it); once it is diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 70b7e44e3035..823a5919f9fa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -90,8 +90,8 @@ struct guest_walker { gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; bool pte_writable[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; + unsigned int pt_access[PT_MAX_FULL_LEVELS]; + unsigned int pte_access; gfn_t gfn; struct x86_exception fault; }; @@ -418,13 +418,15 @@ retry_walk: } walker->ptes[walker->level - 1] = pte; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) @@ -463,7 +465,8 @@ retry_walk: } pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, walker->pt_access); + __func__, (u64)pte, walker->pte_access, + walker->pt_access[walker->level - 1]); return 1; error: @@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; - unsigned direct_access, access = gw->pt_access; + unsigned int direct_access, access; int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; @@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = NULL; if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, false, access); } -- cgit v1.2.3 From af3511ff7fa2107d6410831f3d71030f5e8d2b25 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:46:28 +0800 Subject: KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior In record_steal_time(), st->preempted is read twice, and trace_kvm_pv_tlb_flush() might output result inconsistent if kvm_vcpu_flush_tlb_guest() see a different st->preempted later. It is a very trivial problem and hardly has actual harm and can be avoided by reseting and reading st->preempted in atomic way via xchg(). Signed-off-by: Lai Jiangshan Message-Id: <20210531174628.10265-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1cd6d4685932..e2144eedaf79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3101,9 +3101,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + u8 st_preempted = xchg(&st->preempted, 0); + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + st_preempted & KVM_VCPU_FLUSH_TLB); + if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); } else { st->preempted = 0; -- cgit v1.2.3 From f31500b0d437a2464ca5972d8f5439e156b74960 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 7 Jun 2021 10:57:48 -0700 Subject: KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message Use the __string() machinery provided by the tracing subystem to make a copy of the string literals consumed by the "nested VM-Enter failed" tracepoint. A complete copy is necessary to ensure that the tracepoint can't outlive the data/memory it consumes and deference stale memory. Because the tracepoint itself is defined by kvm, if kvm-intel and/or kvm-amd are built as modules, the memory holding the string literals defined by the vendor modules will be freed when the module is unloaded, whereas the tracepoint and its data in the ring buffer will live until kvm is unloaded (or "indefinitely" if kvm is built-in). This bug has existed since the tracepoint was added, but was recently exposed by a new check in tracing to detect exactly this type of bug. fmt: '%s%s ' current_buffer: ' vmx_dirty_log_t-140127 [003] .... kvm_nested_vmenter_failed: ' WARNING: CPU: 3 PID: 140134 at kernel/trace/trace.c:3759 trace_check_vprintf+0x3be/0x3e0 CPU: 3 PID: 140134 Comm: less Not tainted 5.13.0-rc1-ce2e73ce600a-req #184 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:trace_check_vprintf+0x3be/0x3e0 Code: <0f> 0b 44 8b 4c 24 1c e9 a9 fe ff ff c6 44 02 ff 00 49 8b 97 b0 20 RSP: 0018:ffffa895cc37bcb0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffa895cc37bd08 RCX: 0000000000000027 RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff9766cfad74f8 RBP: ffffffffc0a041d4 R08: ffff9766cfad74f0 R09: ffffa895cc37bad8 R10: 0000000000000001 R11: 0000000000000001 R12: ffffffffc0a041d4 R13: ffffffffc0f4dba8 R14: 0000000000000000 R15: ffff976409f2c000 FS: 00007f92fa200740(0000) GS:ffff9766cfac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559bd11b0000 CR3: 000000019fbaa002 CR4: 00000000001726e0 Call Trace: trace_event_printf+0x5e/0x80 trace_raw_output_kvm_nested_vmenter_failed+0x3a/0x60 [kvm] print_trace_line+0x1dd/0x4e0 s_show+0x45/0x150 seq_read_iter+0x2d5/0x4c0 seq_read+0x106/0x150 vfs_read+0x98/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x40/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: Steven Rostedt Fixes: 380e0055bc7e ("KVM: nVMX: trace nested VM-Enter failures detected by H/W") Signed-off-by: Sean Christopherson Reviewed-by: Steven Rostedt (VMware) Message-Id: <20210607175748.674002-1-seanjc@google.com> --- arch/x86/kvm/trace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a61c015870e3..4f839148948b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed, TP_ARGS(msg, err), TP_STRUCT__entry( - __field(const char *, msg) + __string(msg, msg) __field(u32, err) ), TP_fast_assign( - __entry->msg = msg; + __assign_str(msg, msg); __entry->err = err; ), - TP_printk("%s%s", __entry->msg, !__entry->err ? "" : + TP_printk("%s%s", __get_str(msg), !__entry->err ? "" : __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); -- cgit v1.2.3 From b53e84eed08b88fd3ff59e5c2a7f1a69d4004e32 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:22:56 +0800 Subject: KVM: x86: Unload MMU on guest TLB flush if TDP disabled to force MMU sync When using shadow paging, unload the guest MMU when emulating a guest TLB flush to ensure all roots are synchronized. From the guest's perspective, flushing the TLB ensures any and all modifications to its PTEs will be recognized by the CPU. Note, unloading the MMU is overkill, but is done to mirror KVM's existing handling of INVPCID(all) and ensure the bug is squashed. Future cleanup can be done to more precisely synchronize roots when servicing a guest TLB flush. If TDP is enabled, synchronizing the MMU is unnecessary even if nested TDP is in play, as a "legacy" TLB flush from L1 does not invalidate L1's TDP mappings. For EPT, an explicit INVEPT is required to invalidate guest-physical mappings; for NPT, guest mappings are always tagged with an ASID and thus can only be invalidated via the VMCB's ASID control. This bug has existed since the introduction of KVM_VCPU_FLUSH_TLB. It was only recently exposed after Linux guests stopped flushing the local CPU's TLB prior to flushing remote TLBs (see commit 4ce94eabac16, "x86/mm/tlb: Flush remote and local TLBs concurrently"), but is also visible in Windows 10 guests. Tested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Fixes: f38a7b75267f ("KVM: X86: support paravirtualized help for TLB shootdowns") Signed-off-by: Lai Jiangshan [sean: massaged comment and changelog] Message-Id: <20210531172256.2908-1-jiangshanlai@gmail.com> Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e2144eedaf79..9dd23bdfc6cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3072,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; + + if (!tdp_enabled) { + /* + * A TLB flush on behalf of the guest is equivalent to + * INVPCID(all), toggling CR4.PGE, etc., which requires + * a forced sync of the shadow page tables. Unload the + * entire MMU here and the subsequent load will sync the + * shadow page tables, and also flush the TLB. + */ + kvm_mmu_unload(vcpu); + return; + } + static_call(kvm_x86_tlb_flush_guest)(vcpu); } -- cgit v1.2.3 From 218bf772bddd221489c38dde6ef8e917131161f6 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 2 Jun 2021 13:52:24 -0700 Subject: kvm: LAPIC: Restore guard to prevent illegal APIC register access Per the SDM, "any access that touches bytes 4 through 15 of an APIC register may cause undefined behavior and must not be executed." Worse, such an access in kvm_lapic_reg_read can result in a leak of kernel stack contents. Prior to commit 01402cf81051 ("kvm: LAPIC: write down valid APIC registers"), such an access was explicitly disallowed. Restore the guard that was removed in that commit. Fixes: 01402cf81051 ("kvm: LAPIC: write down valid APIC registers") Signed-off-by: Jim Mattson Reported-by: syzbot Message-Id: <20210602205224.3189316-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6d72d8f43310..17fa4ab1b834 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1410,6 +1410,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); + if (alignment + len > 4) + return 1; + if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) return 1; -- cgit v1.2.3 From a8383dfb2138742a1bb77b481ada047aededa2ba Mon Sep 17 00:00:00 2001 From: CodyYao-oc Date: Mon, 7 Jun 2021 10:53:35 +0800 Subject: x86/nmi_watchdog: Fix old-style NMI watchdog regression on old Intel CPUs The following commit: 3a4ac121c2ca ("x86/perf: Add hardware performance events support for Zhaoxin CPU.") Got the old-style NMI watchdog logic wrong and broke it for basically every Intel CPU where it was active. Which is only truly old CPUs, so few people noticed. On CPUs with perf events support we turn off the old-style NMI watchdog, so it was pretty pointless to add the logic for X86_VENDOR_ZHAOXIN to begin with ... :-/ Anyway, the fix is to restore the old logic and add a 'break'. [ mingo: Wrote a new changelog. ] Fixes: 3a4ac121c2ca ("x86/perf: Add hardware performance events support for Zhaoxin CPU.") Signed-off-by: CodyYao-oc Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210607025335.9643-1-CodyYao-oc@zhaoxin.com --- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 3ef5868ac588..7aecb2fc3186 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -63,7 +63,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } - fallthrough; + break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: return msr - MSR_ARCH_PERFMON_PERFCTR0; @@ -96,7 +96,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } - fallthrough; + break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: return msr - MSR_ARCH_PERFMON_EVENTSEL0; -- cgit v1.2.3 From 02ffbe6351f5c88337143bcbc649832ded7445c0 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 9 Jun 2021 20:22:17 +0800 Subject: KVM: SVM: fix doc warnings Fix kernel-doc warnings: arch/x86/kvm/svm/avic.c:233: warning: Function parameter or member 'activate' not described in 'avic_update_access_page' arch/x86/kvm/svm/avic.c:233: warning: Function parameter or member 'kvm' not described in 'avic_update_access_page' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'e' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'kvm' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'svm' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'vcpu_info' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:1009: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Signed-off-by: ChenXiaoSong Message-Id: <20210609122217.2967131-1-chenxiaosong2@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0e62e6a2438c..5e7e920113f3 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -221,7 +221,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, return &avic_physical_id_table[index]; } -/** +/* * Note: * AVIC hardware walks the nested page table to check permissions, * but does not use the SPA address specified in the leaf page @@ -764,7 +764,7 @@ out: return ret; } -/** +/* * Note: * The HW cannot support posting multicast/broadcast * interrupts to a vCPU. So, we still use legacy interrupt @@ -1005,7 +1005,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -/** +/* * This function is called during VCPU halt/unhalt. */ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) -- cgit v1.2.3 From 551912d286e940e63abe9e005f434691ee24fd15 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 28 May 2021 15:07:56 -0500 Subject: KVM: x86: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a couple of warnings by explicitly adding break statements instead of just letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Message-Id: <20210528200756.GA39320@embeddedor> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/vmx/vmx.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9a48f138832d..b4da665bb892 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -655,6 +655,7 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) entry->ecx = F(RDPID); ++array->nent; + break; default: break; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 50b42d7a8a11..c2a779b688e6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6247,6 +6247,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) switch (kvm_get_apic_mode(vcpu)) { case LAPIC_MODE_INVALID: WARN_ONCE(true, "Invalid local APIC state"); + break; case LAPIC_MODE_DISABLED: break; case LAPIC_MODE_XAPIC: -- cgit v1.2.3 From 78fcb2c91adfec8ce3a2ba6b4d0dda89f2f4a7c6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:11 -0700 Subject: KVM: x86: Immediately reset the MMU context when the SMM flag is cleared Immediately reset the MMU context when the vCPU's SMM flag is cleared so that the SMM flag in the MMU role is always synchronized with the vCPU's flag. If RSM fails (which isn't correctly emulated), KVM will bail without calling post_leave_smm() and leave the MMU in a bad state. The bad MMU role can lead to a NULL pointer dereference when grabbing a shadow page's rmap for a page fault as the initial lookups for the gfn will happen with the vCPU's SMM flag (=0), whereas the rmap lookup will use the shadow page's SMM flag, which comes from the MMU (=1). SMM has an entirely different set of memslots, and so the initial lookup can find a memslot (SMM=0) and then explode on the rmap memslot lookup (SMM=1). general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 8410 Comm: syz-executor382 Not tainted 5.13.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__gfn_to_rmap arch/x86/kvm/mmu/mmu.c:935 [inline] RIP: 0010:gfn_to_rmap+0x2b0/0x4d0 arch/x86/kvm/mmu/mmu.c:947 Code: <42> 80 3c 20 00 74 08 4c 89 ff e8 f1 79 a9 00 4c 89 fb 4d 8b 37 44 RSP: 0018:ffffc90000ffef98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888015b9f414 RCX: ffff888019669c40 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000001 RBP: 0000000000000001 R08: ffffffff811d9cdb R09: ffffed10065a6002 R10: ffffed10065a6002 R11: 0000000000000000 R12: dffffc0000000000 R13: 0000000000000003 R14: 0000000000000001 R15: 0000000000000000 FS: 000000000124b300(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000028e31000 CR4: 00000000001526e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rmap_add arch/x86/kvm/mmu/mmu.c:965 [inline] mmu_set_spte+0x862/0xe60 arch/x86/kvm/mmu/mmu.c:2604 __direct_map arch/x86/kvm/mmu/mmu.c:2862 [inline] direct_page_fault+0x1f74/0x2b70 arch/x86/kvm/mmu/mmu.c:3769 kvm_mmu_do_page_fault arch/x86/kvm/mmu.h:124 [inline] kvm_mmu_page_fault+0x199/0x1440 arch/x86/kvm/mmu/mmu.c:5065 vmx_handle_exit+0x26/0x160 arch/x86/kvm/vmx/vmx.c:6122 vcpu_enter_guest+0x3bdd/0x9630 arch/x86/kvm/x86.c:9428 vcpu_run+0x416/0xc20 arch/x86/kvm/x86.c:9494 kvm_arch_vcpu_ioctl_run+0x4e8/0xa40 arch/x86/kvm/x86.c:9722 kvm_vcpu_ioctl+0x70f/0xbb0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3460 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:1069 [inline] __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:1055 do_syscall_64+0x3f/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x440ce9 Cc: stable@vger.kernel.org Reported-by: syzbot+fb0b6a7e8713aeb0319c@syzkaller.appspotmail.com Fixes: 9ec19493fb86 ("KVM: x86: clear SMM flags before loading state while leaving SMM") Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9dd23bdfc6cc..54d212fe9b15 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7106,7 +7106,10 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) { - emul_to_vcpu(ctxt)->arch.hflags = emul_flags; + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + vcpu->arch.hflags = emul_flags; + kvm_mmu_reset_context(vcpu); } static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, -- cgit v1.2.3 From 5e63215c2f64079fbd011df5005c8bea63f149c2 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 31 May 2021 11:53:42 +0300 Subject: riscv: xip: support runtime trap patching RISCV_ERRATA_ALTERNATIVE patches text at runtime which is currently not possible when the kernel is executed from the flash in XIP mode. Since runtime patching concerns only traps at the moment, let's just have all the traps reside in RAM anyway if RISCV_ERRATA_ALTERNATIVE is set. Thus, these functions will be patch-able even when the .text section is in flash. Signed-off-by: Vitaly Wool Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 13 +++++++++---- arch/riscv/kernel/vmlinux-xip.lds.S | 15 ++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 0721b9798595..7bc88d8aab97 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -86,8 +86,13 @@ static void do_trap_error(struct pt_regs *regs, int signo, int code, } } +#if defined (CONFIG_XIP_KERNEL) && defined (CONFIG_RISCV_ERRATA_ALTERNATIVE) +#define __trap_section __section(".xip.traps") +#else +#define __trap_section +#endif #define DO_ERROR_INFO(name, signo, code, str) \ -asmlinkage __visible void name(struct pt_regs *regs) \ +asmlinkage __visible __trap_section void name(struct pt_regs *regs) \ { \ do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ } @@ -111,7 +116,7 @@ DO_ERROR_INFO(do_trap_store_misaligned, int handle_misaligned_load(struct pt_regs *regs); int handle_misaligned_store(struct pt_regs *regs); -asmlinkage void do_trap_load_misaligned(struct pt_regs *regs) +asmlinkage void __trap_section do_trap_load_misaligned(struct pt_regs *regs) { if (!handle_misaligned_load(regs)) return; @@ -119,7 +124,7 @@ asmlinkage void do_trap_load_misaligned(struct pt_regs *regs) "Oops - load address misaligned"); } -asmlinkage void do_trap_store_misaligned(struct pt_regs *regs) +asmlinkage void __trap_section do_trap_store_misaligned(struct pt_regs *regs) { if (!handle_misaligned_store(regs)) return; @@ -146,7 +151,7 @@ static inline unsigned long get_break_insn_length(unsigned long pc) return GET_INSN_LENGTH(insn); } -asmlinkage __visible void do_trap_break(struct pt_regs *regs) +asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) { #ifdef CONFIG_KPROBES if (kprobe_single_step_handler(regs)) diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S index 4b29b9917f99..a3ff09c4c3f9 100644 --- a/arch/riscv/kernel/vmlinux-xip.lds.S +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -99,9 +99,22 @@ SECTIONS } PERCPU_SECTION(L1_CACHE_BYTES) - . = ALIGN(PAGE_SIZE); + . = ALIGN(8); + .alternative : { + __alt_start = .; + *(.alternative) + __alt_end = .; + } __init_end = .; + . = ALIGN(16); + .xip.traps : { + __xip_traps_start = .; + *(.xip.traps) + __xip_traps_end = .; + } + + . = ALIGN(PAGE_SIZE); .sdata : { __global_pointer$ = . + 0x800; *(.sdata*) -- cgit v1.2.3 From 42e0e0b453bc6ead49c573ed512502069627546b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 11 May 2021 00:28:38 +0800 Subject: riscv: code patching only works on !XIP_KERNEL Some features which need code patching such as KPROBES, DYNAMIC_FTRACE KGDB can only work on !XIP_KERNEL. Add dependencies for these features that rely on code patching. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c5914e70a0fd..18ec0f9bb8d5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -61,11 +61,11 @@ config RISCV select GENERIC_TIME_VSYSCALL if MMU && 64BIT select HANDLE_DOMAIN_IRQ select HAVE_ARCH_AUDITSYSCALL - select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_JUMP_LABEL_RELATIVE + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL + select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && 64BIT select HAVE_ARCH_KASAN_VMALLOC if MMU && 64BIT - select HAVE_ARCH_KGDB + select HAVE_ARCH_KGDB if !XIP_KERNEL select HAVE_ARCH_KGDB_QXFER_PKT select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_SECCOMP_FILTER @@ -80,9 +80,9 @@ config RISCV select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO if MMU && 64BIT select HAVE_IRQ_TIME_ACCOUNTING - select HAVE_KPROBES - select HAVE_KPROBES_ON_FTRACE - select HAVE_KRETPROBES + select HAVE_KPROBES if !XIP_KERNEL + select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL + select HAVE_KRETPROBES if !XIP_KERNEL select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -231,11 +231,11 @@ config ARCH_RV64I bool "RV64I" select 64BIT select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && GCC_VERSION >= 50000 - select HAVE_DYNAMIC_FTRACE if MMU && $(cc-option,-fpatchable-function-entry=8) + select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && $(cc-option,-fpatchable-function-entry=8) select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_TRACER if !XIP_KERNEL select SWIOTLB if MMU endchoice -- cgit v1.2.3 From 96f1b00138cb8f04c742c82d0a7c460b2202e887 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 8 Jun 2021 19:39:25 -0700 Subject: ARCv2: save ABI registers across signal handling ARCv2 has some configuration dependent registers (r30, r58, r59) which could be targetted by the compiler. To keep the ABI stable, these were unconditionally part of the glibc ABI (sysdeps/unix/sysv/linux/arc/sys/ucontext.h:mcontext_t) however we missed populating them (by saving/restoring them across signal handling). This patch fixes the issue by - adding arcv2 ABI regs to kernel struct sigcontext - populating them during signal handling Change to struct sigcontext might seem like a glibc ABI change (although it primarily uses ucontext_t:mcontext_t) but the fact is - it has only been extended (existing fields are not touched) - the old sigcontext was ABI incomplete to begin with anyways Fixes: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/53 Cc: Tested-by: kernel test robot Reported-by: Vladimir Isaev Signed-off-by: Vineet Gupta --- arch/arc/include/uapi/asm/sigcontext.h | 1 + arch/arc/kernel/signal.c | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'arch') diff --git a/arch/arc/include/uapi/asm/sigcontext.h b/arch/arc/include/uapi/asm/sigcontext.h index 95f8a4380e11..7a5449dfcb29 100644 --- a/arch/arc/include/uapi/asm/sigcontext.h +++ b/arch/arc/include/uapi/asm/sigcontext.h @@ -18,6 +18,7 @@ */ struct sigcontext { struct user_regs_struct regs; + struct user_regs_arcv2 v2abi; }; #endif /* _ASM_ARC_SIGCONTEXT_H */ diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index b3ccb9e5ffe4..cb2f88502baf 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -61,6 +61,41 @@ struct rt_sigframe { unsigned int sigret_magic; }; +static int save_arcv2_regs(struct sigcontext *mctx, struct pt_regs *regs) +{ + int err = 0; +#ifndef CONFIG_ISA_ARCOMPACT + struct user_regs_arcv2 v2abi; + + v2abi.r30 = regs->r30; +#ifdef CONFIG_ARC_HAS_ACCL_REGS + v2abi.r58 = regs->r58; + v2abi.r59 = regs->r59; +#else + v2abi.r58 = v2abi.r59 = 0; +#endif + err = __copy_to_user(&mctx->v2abi, &v2abi, sizeof(v2abi)); +#endif + return err; +} + +static int restore_arcv2_regs(struct sigcontext *mctx, struct pt_regs *regs) +{ + int err = 0; +#ifndef CONFIG_ISA_ARCOMPACT + struct user_regs_arcv2 v2abi; + + err = __copy_from_user(&v2abi, &mctx->v2abi, sizeof(v2abi)); + + regs->r30 = v2abi.r30; +#ifdef CONFIG_ARC_HAS_ACCL_REGS + regs->r58 = v2abi.r58; + regs->r59 = v2abi.r59; +#endif +#endif + return err; +} + static int stash_usr_regs(struct rt_sigframe __user *sf, struct pt_regs *regs, sigset_t *set) @@ -94,6 +129,10 @@ stash_usr_regs(struct rt_sigframe __user *sf, struct pt_regs *regs, err = __copy_to_user(&(sf->uc.uc_mcontext.regs.scratch), &uregs.scratch, sizeof(sf->uc.uc_mcontext.regs.scratch)); + + if (is_isa_arcv2()) + err |= save_arcv2_regs(&(sf->uc.uc_mcontext), regs); + err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(sigset_t)); return err ? -EFAULT : 0; @@ -109,6 +148,10 @@ static int restore_usr_regs(struct pt_regs *regs, struct rt_sigframe __user *sf) err |= __copy_from_user(&uregs.scratch, &(sf->uc.uc_mcontext.regs.scratch), sizeof(sf->uc.uc_mcontext.regs.scratch)); + + if (is_isa_arcv2()) + err |= restore_arcv2_regs(&(sf->uc.uc_mcontext), regs); + if (err) return -EFAULT; -- cgit v1.2.3 From 110febc0148f8ab867344061d5cf95ee1e1ebb3e Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 26 Feb 2021 10:35:39 -0800 Subject: ARC: fix CONFIG_HARDENED_USERCOPY Currently enabling this triggers a warning | usercopy: Kernel memory overwrite attempt detected to kernel text (offset 155633, size 11)! | usercopy: BUG: failure at mm/usercopy.c:99/usercopy_abort()! | |gcc generated __builtin_trap |Path: /bin/busybox |CPU: 0 PID: 84 Comm: init Not tainted 5.4.22 | |[ECR ]: 0x00090005 => gcc generated __builtin_trap |[EFA ]: 0x9024fcaa |[BLINK ]: usercopy_abort+0x8a/0x8c |[ERET ]: memfd_fcntl+0x0/0x470 |[STAT32]: 0x80080802 : IE K |... |... |Stack Trace: | memfd_fcntl+0x0/0x470 | usercopy_abort+0x8a/0x8c | __check_object_size+0x10e/0x138 | copy_strings+0x1f4/0x38c | __do_execve_file+0x352/0x848 | EV_Trap+0xcc/0xd0 The issue is triggered by an allocation in "init reclaimed" region. ARC _stext emcompasses the init region (for historical reasons we wanted the init.text to be under .text as well). This however trips up __check_object_size()->check_kernel_text_object() which treats this as object bleeding into kernel text. Fix that by rezoning _stext to start from regular kernel .text and leave out .init altogether. Fixes: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/15 Reported-by: Evgeniy Didin Reviewed-by: Kees Cook Signed-off-by: Vineet Gupta --- arch/arc/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arc/kernel/vmlinux.lds.S b/arch/arc/kernel/vmlinux.lds.S index 33ce59d91461..e2146a8da195 100644 --- a/arch/arc/kernel/vmlinux.lds.S +++ b/arch/arc/kernel/vmlinux.lds.S @@ -57,7 +57,6 @@ SECTIONS .init.ramfs : { INIT_RAM_FS } . = ALIGN(PAGE_SIZE); - _stext = .; HEAD_TEXT_SECTION INIT_TEXT_SECTION(L1_CACHE_BYTES) @@ -83,6 +82,7 @@ SECTIONS .text : { _text = .; + _stext = .; TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT -- cgit v1.2.3 From 858cf860494fab545abfa206d17efcb8bee73e36 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 31 May 2021 12:33:10 +0300 Subject: riscv: alternative: fix typo in macro name alternative-macros.h defines ALT_NEW_CONTENT in its assembly part and ALT_NEW_CONSTENT in the C part. Most likely it is the latter that is wrong. Fixes: 6f4eea90465ad (riscv: Introduce alternative mechanism to apply errata solution) Signed-off-by: Vitaly Wool Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative-macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 88c08705f64a..67406c376389 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -51,7 +51,7 @@ REG_ASM " " newlen "\n" \ ".word " errata_id "\n" -#define ALT_NEW_CONSTENT(vendor_id, errata_id, enable, new_c) \ +#define ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) \ ".if " __stringify(enable) " == 1\n" \ ".pushsection .alternative, \"a\"\n" \ ALT_ENTRY("886b", "888f", __stringify(vendor_id), __stringify(errata_id), "889f - 888f") \ @@ -69,7 +69,7 @@ "886 :\n" \ old_c "\n" \ "887 :\n" \ - ALT_NEW_CONSTENT(vendor_id, errata_id, enable, new_c) + ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) #define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)) -- cgit v1.2.3 From 934002cd660b035b926438244b4294e647507e13 Mon Sep 17 00:00:00 2001 From: Alper Gun Date: Thu, 10 Jun 2021 17:46:04 +0000 Subject: KVM: SVM: Call SEV Guest Decommission if ASID binding fails Send SEV_CMD_DECOMMISSION command to PSP firmware if ASID binding fails. If a failure happens after a successful LAUNCH_START command, a decommission command should be executed. Otherwise, guest context will be unfreed inside the AMD SP. After the firmware will not have memory to allocate more SEV guest context, LAUNCH_START command will begin to fail with SEV_RET_RESOURCE_LIMIT error. The existing code calls decommission inside sev_unbind_asid, but it is not called if a failure happens before guest activation succeeds. If sev_bind_asid fails, decommission is never called. PSP firmware has a limit for the number of guests. If sev_asid_binding fails many times, PSP firmware will not have resources to create another guest context. Cc: stable@vger.kernel.org Fixes: 59414c989220 ("KVM: SVM: Add support for KVM_SEV_LAUNCH_START command") Reported-by: Peter Gonda Signed-off-by: Alper Gun Reviewed-by: Marc Orr Signed-off-by: Paolo Bonzini Message-Id: <20210610174604.2554090-1-alpergun@google.com> --- arch/x86/kvm/svm/sev.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e0ce5da97fc2..8d36f0c73071 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -199,9 +199,19 @@ static void sev_asid_free(struct kvm_sev_info *sev) sev->misc_cg = NULL; } -static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) +static void sev_decommission(unsigned int handle) { struct sev_data_decommission decommission; + + if (!handle) + return; + + decommission.handle = handle; + sev_guest_decommission(&decommission, NULL); +} + +static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) +{ struct sev_data_deactivate deactivate; if (!handle) @@ -214,9 +224,7 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_guest_deactivate(&deactivate, NULL); up_read(&sev_deactivate_lock); - /* decommission handle */ - decommission.handle = handle; - sev_guest_decommission(&decommission, NULL); + sev_decommission(handle); } static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) @@ -341,8 +349,10 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Bind ASID to this guest */ ret = sev_bind_asid(kvm, start.handle, error); - if (ret) + if (ret) { + sev_decommission(start.handle); goto e_free_session; + } /* return handle to userspace */ params.handle = start.handle; -- cgit v1.2.3 From dfdc0a714d241bfbf951886c373cd1ae463fcc25 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 10 Jun 2021 21:59:33 -0700 Subject: KVM: X86: Fix x86_emulator slab cache leak Commit c9b8b07cded58 (KVM: x86: Dynamically allocate per-vCPU emulation context) tries to allocate per-vCPU emulation context dynamically, however, the x86_emulator slab cache is still exiting after the kvm module is unload as below after destroying the VM and unloading the kvm module. grep x86_emulator /proc/slabinfo x86_emulator 36 36 2672 12 8 : tunables 0 0 0 : slabdata 3 3 0 This patch fixes this slab cache leak by destroying the x86_emulator slab cache when the kvm module is unloaded. Fixes: c9b8b07cded58 (KVM: x86: Dynamically allocate per-vCPU emulation context) Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Message-Id: <1623387573-5969-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54d212fe9b15..6d425310054b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8261,6 +8261,7 @@ void kvm_arch_exit(void) kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); free_percpu(user_return_msrs); + kmem_cache_destroy(x86_emulator_cache); kmem_cache_destroy(x86_fpu_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); -- cgit v1.2.3 From 654430efde27248be563df9a88631204b5fe2df2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Jun 2021 15:00:26 -0700 Subject: KVM: x86/mmu: Calculate and check "full" mmu_role for nested MMU Calculate and check the full mmu_role when initializing the MMU context for the nested MMU, where "full" means the bits and pieces of the role that aren't handled by kvm_calc_mmu_role_common(). While the nested MMU isn't used for shadow paging, things like the number of levels in the guest's page tables are surprisingly important when walking the guest page tables. Failure to reinitialize the nested MMU context if L2's paging mode changes can result in unexpected and/or missed page faults, and likely other explosions. E.g. if an L1 vCPU is running both a 32-bit PAE L2 and a 64-bit L2, the "common" role calculation will yield the same role for both L2s. If the 64-bit L2 is run after the 32-bit PAE L2, L0 will fail to reinitialize the nested MMU context, ultimately resulting in a bad walk of L2's page tables as the MMU will still have a guest root_level of PT32E_ROOT_LEVEL. WARNING: CPU: 4 PID: 167334 at arch/x86/kvm/vmx/vmx.c:3075 ept_save_pdptrs+0x15/0xe0 [kvm_intel] Modules linked in: kvm_intel] CPU: 4 PID: 167334 Comm: CPU 3/KVM Not tainted 5.13.0-rc1-d849817d5673-reqs #185 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:ept_save_pdptrs+0x15/0xe0 [kvm_intel] Code: <0f> 0b c3 f6 87 d8 02 00f RSP: 0018:ffffbba702dbba00 EFLAGS: 00010202 RAX: 0000000000000011 RBX: 0000000000000002 RCX: ffffffff810a2c08 RDX: ffff91d7bc30acc0 RSI: 0000000000000011 RDI: ffff91d7bc30a600 RBP: ffff91d7bc30a600 R08: 0000000000000010 R09: 0000000000000007 R10: 0000000000000000 R11: 0000000000000000 R12: ffff91d7bc30a600 R13: ffff91d7bc30acc0 R14: ffff91d67c123460 R15: 0000000115d7e005 FS: 00007fe8e9ffb700(0000) GS:ffff91d90fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000029f15a001 CR4: 00000000001726e0 Call Trace: kvm_pdptr_read+0x3a/0x40 [kvm] paging64_walk_addr_generic+0x327/0x6a0 [kvm] paging64_gva_to_gpa_nested+0x3f/0xb0 [kvm] kvm_fetch_guest_virt+0x4c/0xb0 [kvm] __do_insn_fetch_bytes+0x11a/0x1f0 [kvm] x86_decode_insn+0x787/0x1490 [kvm] x86_decode_emulated_instruction+0x58/0x1e0 [kvm] x86_emulate_instruction+0x122/0x4f0 [kvm] vmx_handle_exit+0x120/0x660 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xe25/0x1cb0 [kvm] kvm_vcpu_ioctl+0x211/0x5a0 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x40/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: Vitaly Kuznetsov Cc: stable@vger.kernel.org Fixes: bf627a928837 ("x86/kvm/mmu: check if MMU reconfiguration is needed in init_kvm_nested_mmu()") Signed-off-by: Sean Christopherson Message-Id: <20210610220026.1364486-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0144c40d09c7..8d5876dfc6b7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4739,9 +4739,33 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) context->inject_page_fault = kvm_inject_page_fault; } +static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false); + + /* + * Nested MMUs are used only for walking L2's gva->gpa, they never have + * shadow pages of their own and so "direct" has no meaning. Set it + * to "true" to try to detect bogus usage of the nested MMU. + */ + role.base.direct = true; + + if (!is_paging(vcpu)) + role.base.level = 0; + else if (is_long_mode(vcpu)) + role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : + PT64_ROOT_4LEVEL; + else if (is_pae(vcpu)) + role.base.level = PT32E_ROOT_LEVEL; + else + role.base.level = PT32_ROOT_LEVEL; + + return role; +} + static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { - union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false); + union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_role.as_u64 == g_context->mmu_role.as_u64) -- cgit v1.2.3 From 2398ce80152aae33b9501ef54452e09e8e8d4262 Mon Sep 17 00:00:00 2001 From: Tor Vic Date: Thu, 10 Jun 2021 20:58:06 +0000 Subject: x86, lto: Pass -stack-alignment only on LLD < 13.0.0 Since LLVM commit 3787ee4, the '-stack-alignment' flag has been dropped [1], leading to the following error message when building a LTO kernel with Clang-13 and LLD-13: ld.lld: error: -plugin-opt=-: ld.lld: Unknown command line argument '-stack-alignment=8'. Try 'ld.lld --help' ld.lld: Did you mean '--stackrealign=8'? It also appears that the '-code-model' flag is not necessary anymore starting with LLVM-9 [2]. Drop '-code-model' and make '-stack-alignment' conditional on LLD < 13.0.0. These flags were necessary because these flags were not encoded in the IR properly, so the link would restart optimizations without them. Now there are properly encoded in the IR, and these flags exposing implementation details are no longer necessary. [1] https://reviews.llvm.org/D103048 [2] https://reviews.llvm.org/D52322 Cc: stable@vger.kernel.org Link: https://github.com/ClangBuiltLinux/linux/issues/1377 Signed-off-by: Tor Vic Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/f2c018ee-5999-741e-58d4-e482d5246067@mailbox.org --- arch/x86/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 307529417021..cb5e8d39cac1 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -200,8 +200,9 @@ endif KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) ifdef CONFIG_LTO_CLANG -KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \ - -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) +ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0) +KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) +endif endif ifdef CONFIG_X86_NEED_RELOCS -- cgit v1.2.3 From 0ddd7eaffa644baa78e247bbd220ab7195b1eed6 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 4 Jun 2021 14:06:39 +0200 Subject: riscv: Fix BUILTIN_DTB for sifive and microchip soc Fix BUILTIN_DTB config which resulted in a dtb that was actually not built into the Linux image: in the same manner as Canaan soc does, create an object file from the dtb file that will get linked into the Linux image. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/microchip/Makefile | 1 + arch/riscv/boot/dts/sifive/Makefile | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/boot/dts/microchip/Makefile b/arch/riscv/boot/dts/microchip/Makefile index 622b12771fd3..855c1502d912 100644 --- a/arch/riscv/boot/dts/microchip/Makefile +++ b/arch/riscv/boot/dts/microchip/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += microchip-mpfs-icicle-kit.dtb +obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y)) diff --git a/arch/riscv/boot/dts/sifive/Makefile b/arch/riscv/boot/dts/sifive/Makefile index 74c47fe9fc22..d90e4eb0ade8 100644 --- a/arch/riscv/boot/dts/sifive/Makefile +++ b/arch/riscv/boot/dts/sifive/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 dtb-$(CONFIG_SOC_SIFIVE) += hifive-unleashed-a00.dtb \ hifive-unmatched-a00.dtb +obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y)) -- cgit v1.2.3 From cacf994a91d3a55c0c2f853d6429cd7b86113915 Mon Sep 17 00:00:00 2001 From: Mikel Rychliski Date: Fri, 11 Jun 2021 17:48:23 -0400 Subject: PCI: Add AMD RS690 quirk to enable 64-bit DMA Although the AMD RS690 chipset has 64-bit DMA support, BIOS implementations sometimes fail to configure the memory limit registers correctly. The Acer F690GVM mainboard uses this chipset and a Marvell 88E8056 NIC. The sky2 driver programs the NIC to use 64-bit DMA, which will not work: sky2 0000:02:00.0: error interrupt status=0x8 sky2 0000:02:00.0 eth0: tx timeout sky2 0000:02:00.0 eth0: transmit ring 0 .. 22 report=0 done=0 Other drivers required by this mainboard either don't support 64-bit DMA, or have it disabled using driver specific quirks. For example, the ahci driver has quirks to enable or disable 64-bit DMA depending on the BIOS version (see ahci_sb600_enable_64bit() in ahci.c). This ahci quirk matches against the SB600 SATA controller, but the real issue is almost certainly with the RS690 PCI host that it was commonly attached to. To avoid this issue in all drivers with 64-bit DMA support, fix the configuration of the PCI host. If the kernel is aware of physical memory above 4GB, but the BIOS never configured the PCI host with this information, update the registers with our values. [bhelgaas: drop PCI_DEVICE_ID_ATI_RS690 definition] Link: https://lore.kernel.org/r/20210611214823.4898-1-mikel@mikelr.com Signed-off-by: Mikel Rychliski Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'arch') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 02dc64625e64..2edd86649468 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -779,4 +779,48 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +#define RS690_LOWER_TOP_OF_DRAM2 0x30 +#define RS690_LOWER_TOP_OF_DRAM2_VALID 0x1 +#define RS690_UPPER_TOP_OF_DRAM2 0x31 +#define RS690_HTIU_NB_INDEX 0xA8 +#define RS690_HTIU_NB_INDEX_WR_ENABLE 0x100 +#define RS690_HTIU_NB_DATA 0xAC + +/* + * Some BIOS implementations support RAM above 4GB, but do not configure the + * PCI host to respond to bus master accesses for these addresses. These + * implementations set the TOP_OF_DRAM_SLOT1 register correctly, so PCI DMA + * works as expected for addresses below 4GB. + * + * Reference: "AMD RS690 ASIC Family Register Reference Guide" (pg. 2-57) + * https://www.amd.com/system/files/TechDocs/43372_rs690_rrg_3.00o.pdf + */ +static void rs690_fix_64bit_dma(struct pci_dev *pdev) +{ + u32 val = 0; + phys_addr_t top_of_dram = __pa(high_memory - 1) + 1; + + if (top_of_dram <= (1ULL << 32)) + return; + + pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX, + RS690_LOWER_TOP_OF_DRAM2); + pci_read_config_dword(pdev, RS690_HTIU_NB_DATA, &val); + + if (val) + return; + + pci_info(pdev, "Adjusting top of DRAM to %pa for 64-bit DMA support\n", &top_of_dram); + + pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX, + RS690_UPPER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE); + pci_write_config_dword(pdev, RS690_HTIU_NB_DATA, top_of_dram >> 32); + + pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX, + RS690_LOWER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE); + pci_write_config_dword(pdev, RS690_HTIU_NB_DATA, + top_of_dram | RS690_LOWER_TOP_OF_DRAM2_VALID); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma); + #endif -- cgit v1.2.3