diff options
Diffstat (limited to 'arch')
544 files changed, 21512 insertions, 8521 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 5f8a5d84dbbe..17c42bc36321 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -892,27 +892,6 @@ config STRICT_MODULE_RWX config ARCH_HAS_PHYS_TO_DMA bool -config ARCH_HAS_REFCOUNT - bool - help - An architecture selects this when it has implemented refcount_t - using open coded assembly primitives that provide an optimized - refcount_t implementation, possibly at the expense of some full - refcount state checks of CONFIG_REFCOUNT_FULL=y. - - The refcount overflow check behavior, however, must be retained. - Catching overflows is the primary security concern for protecting - against bugs in reference counts. - -config REFCOUNT_FULL - bool "Perform full reference count validation at the expense of speed" - help - Enabling this switches the refcounting infrastructure from a fast - unchecked atomic_t implementation to a fully state checked - implementation, which can be (slightly) slower but provides protections - against various use-after-free conditions that can be used in - security flaw exploits. - config HAVE_ARCH_COMPILER_H bool help @@ -960,6 +939,14 @@ config RELR config ARCH_HAS_MEM_ENCRYPT bool +config HAVE_SPARSE_SYSCALL_NR + bool + help + An architecture should select this if its syscall numbering is sparse + to save space. For example, MIPS architecture has a syscall array with + entries at 4000, 5000 and 6000 locations. This option turns on syscall + related optimizations for a given architecture. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 4341ccf5c0c4..e7a59d927d78 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -824,7 +824,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, if (unlikely(la_ptr >= alpha_pmu->num_pmcs)) { /* This should never occur! */ irq_err_count++; - pr_warning("PMI: silly index %ld\n", la_ptr); + pr_warn("PMI: silly index %ld\n", la_ptr); wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask); return; } @@ -847,7 +847,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, if (unlikely(!event)) { /* This should never occur! */ irq_err_count++; - pr_warning("PMI: No event at index %d!\n", idx); + pr_warn("PMI: No event at index %d!\n", idx); wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask); return; } diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index c4b5ceceab52..bc6f727278fd 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -1,4 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> #include <asm/cache.h> @@ -8,7 +12,7 @@ OUTPUT_FORMAT("elf64-alpha") OUTPUT_ARCH(alpha) ENTRY(__start) -PHDRS { kernel PT_LOAD; note PT_NOTE; } +PHDRS { text PT_LOAD; note PT_NOTE; } jiffies = jiffies_64; SECTIONS { @@ -27,17 +31,11 @@ SECTIONS LOCK_TEXT *(.fixup) *(.gnu.warning) - } :kernel + } :text swapper_pg_dir = SWAPPER_PGD; _etext = .; /* End of text section */ - NOTES :kernel :note - .dummy : { - *(.dummy) - } :kernel - - RODATA - EXCEPTION_TABLE(16) + RO_DATA(4096) /* Will be freed after init */ __init_begin = ALIGN(PAGE_SIZE); @@ -52,7 +50,7 @@ SECTIONS _sdata = .; /* Start of rw data section */ _data = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) .got : { *(.got) diff --git a/arch/arc/kernel/vmlinux.lds.S b/arch/arc/kernel/vmlinux.lds.S index 6c693a9d29b6..54139a6f469b 100644 --- a/arch/arc/kernel/vmlinux.lds.S +++ b/arch/arc/kernel/vmlinux.lds.S @@ -95,13 +95,13 @@ SECTIONS _etext = .; _sdata = .; - RO_DATA_SECTION(PAGE_SIZE) + RO_DATA(PAGE_SIZE) /* * 1. this is .data essentially * 2. THREAD_SIZE for init.task, must be kernel-stk sz aligned */ - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; @@ -118,8 +118,6 @@ SECTIONS /DISCARD/ : { *(.eh_frame) } #endif - NOTES - . = ALIGN(PAGE_SIZE); _end = . ; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8a50efb559f3..0d3c5d7cceb7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -117,7 +117,6 @@ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC - select REFCOUNT_FULL select RTC_LIB select SYS_SUPPORTS_APM_EMULATION # Above selects are sorted alphabetically; please add new ones diff --git a/arch/arm/boot/dts/am3517.dtsi b/arch/arm/boot/dts/am3517.dtsi index bf3002009b00..76f819f4ba48 100644 --- a/arch/arm/boot/dts/am3517.dtsi +++ b/arch/arm/boot/dts/am3517.dtsi @@ -16,6 +16,37 @@ can = &hecc; }; + cpus { + cpu: cpu@0 { + /* Based on OMAP3630 variants OPP50 and OPP100 */ + operating-points-v2 = <&cpu0_opp_table>; + + clock-latency = <300000>; /* From legacy driver */ + }; + }; + + cpu0_opp_table: opp-table { + compatible = "operating-points-v2-ti-cpu"; + syscon = <&scm_conf>; + /* + * AM3517 TRM only lists 600MHz @ 1.2V, but omap36xx + * appear to operate at 300MHz as well. Since AM3517 only + * lists one operating voltage, it will remain fixed at 1.2V + */ + opp50-300000000 { + opp-hz = /bits/ 64 <300000000>; + opp-microvolt = <1200000>; + opp-supported-hw = <0xffffffff 0xffffffff>; + opp-suspend; + }; + + opp100-600000000 { + opp-hz = /bits/ 64 <600000000>; + opp-microvolt = <1200000>; + opp-supported-hw = <0xffffffff 0xffffffff>; + }; + }; + ocp@68000000 { am35x_otg_hs: am35x_otg_hs@5c040000 { compatible = "ti,omap3-musb"; diff --git a/arch/arm/boot/dts/am3517_mt_ventoux.dts b/arch/arm/boot/dts/am3517_mt_ventoux.dts index e507e4ae0d88..e7d7124a34ba 100644 --- a/arch/arm/boot/dts/am3517_mt_ventoux.dts +++ b/arch/arm/boot/dts/am3517_mt_ventoux.dts @@ -8,7 +8,7 @@ / { model = "TeeJet Mt.Ventoux"; - compatible = "teejet,mt_ventoux", "ti,omap3"; + compatible = "teejet,mt_ventoux", "ti,am3517", "ti,omap3"; memory@80000000 { device_type = "memory"; diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts index 0aaacea1d887..820ce3b60bb6 100644 --- a/arch/arm/boot/dts/am571x-idk.dts +++ b/arch/arm/boot/dts/am571x-idk.dts @@ -186,3 +186,30 @@ pinctrl-1 = <&mmc2_pins_hs>; pinctrl-2 = <&mmc2_pins_ddr_rev20 &mmc2_iodelay_ddr_conf>; }; + +&mac_sw { + pinctrl-names = "default", "sleep"; + status = "okay"; +}; + +&cpsw_port1 { + phy-handle = <ðphy0_sw>; + phy-mode = "rgmii"; + ti,dual-emac-pvid = <1>; +}; + +&cpsw_port2 { + phy-handle = <ðphy1_sw>; + phy-mode = "rgmii"; + ti,dual-emac-pvid = <2>; +}; + +&davinci_mdio_sw { + ethphy0_sw: ethernet-phy@0 { + reg = <0>; + }; + + ethphy1_sw: ethernet-phy@1 { + reg = <1>; + }; +}; diff --git a/arch/arm/boot/dts/am572x-idk.dts b/arch/arm/boot/dts/am572x-idk.dts index ea1c119feaa5..c3d966904d64 100644 --- a/arch/arm/boot/dts/am572x-idk.dts +++ b/arch/arm/boot/dts/am572x-idk.dts @@ -27,3 +27,8 @@ pinctrl-1 = <&mmc2_pins_hs>; pinctrl-2 = <&mmc2_pins_ddr_rev20>; }; + +&mac { + status = "okay"; + dual_emac; +}; diff --git a/arch/arm/boot/dts/am574x-idk.dts b/arch/arm/boot/dts/am574x-idk.dts index 7935d70874ce..fa0088025b2c 100644 --- a/arch/arm/boot/dts/am574x-idk.dts +++ b/arch/arm/boot/dts/am574x-idk.dts @@ -35,3 +35,8 @@ pinctrl-1 = <&mmc2_pins_default>; pinctrl-2 = <&mmc2_pins_default>; }; + +&mac { + status = "okay"; + dual_emac; +}; diff --git a/arch/arm/boot/dts/am57xx-idk-common.dtsi b/arch/arm/boot/dts/am57xx-idk-common.dtsi index 423855a2a2d6..398721c7201c 100644 --- a/arch/arm/boot/dts/am57xx-idk-common.dtsi +++ b/arch/arm/boot/dts/am57xx-idk-common.dtsi @@ -363,11 +363,6 @@ ext-clk-src; }; -&mac { - status = "okay"; - dual_emac; -}; - &cpsw_emac0 { phy-handle = <ðphy0>; phy-mode = "rgmii"; diff --git a/arch/arm/boot/dts/dra7-l4.dtsi b/arch/arm/boot/dts/dra7-l4.dtsi index 5cac2dd58241..37e048771b0f 100644 --- a/arch/arm/boot/dts/dra7-l4.dtsi +++ b/arch/arm/boot/dts/dra7-l4.dtsi @@ -3079,6 +3079,58 @@ phys = <&phy_gmii_sel 2>; }; }; + + mac_sw: switch@0 { + compatible = "ti,dra7-cpsw-switch","ti,cpsw-switch"; + reg = <0x0 0x4000>; + ranges = <0 0 0x4000>; + clocks = <&gmac_main_clk>; + clock-names = "fck"; + #address-cells = <1>; + #size-cells = <1>; + syscon = <&scm_conf>; + status = "disabled"; + + interrupts = <GIC_SPI 334 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 335 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 336 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 337 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "rx_thresh", "rx", "tx", "misc"; + + ethernet-ports { + #address-cells = <1>; + #size-cells = <0>; + + cpsw_port1: port@1 { + reg = <1>; + label = "port1"; + mac-address = [ 00 00 00 00 00 00 ]; + phys = <&phy_gmii_sel 1>; + }; + + cpsw_port2: port@2 { + reg = <2>; + label = "port2"; + mac-address = [ 00 00 00 00 00 00 ]; + phys = <&phy_gmii_sel 2>; + }; + }; + + davinci_mdio_sw: mdio@1000 { + compatible = "ti,cpsw-mdio","ti,davinci_mdio"; + clocks = <&gmac_main_clk>; + clock-names = "fck"; + #address-cells = <1>; + #size-cells = <0>; + bus_freq = <1000000>; + reg = <0x1000 0x100>; + }; + + cpts { + clocks = <&gmac_clkctrl DRA7_GMAC_GMAC_CLKCTRL 25>; + clock-names = "cpts"; + }; + }; }; }; }; diff --git a/arch/arm/boot/dts/logicpd-som-lv-35xx-devkit.dts b/arch/arm/boot/dts/logicpd-som-lv-35xx-devkit.dts index f7a841a28865..2a0a98fe67f0 100644 --- a/arch/arm/boot/dts/logicpd-som-lv-35xx-devkit.dts +++ b/arch/arm/boot/dts/logicpd-som-lv-35xx-devkit.dts @@ -9,5 +9,5 @@ / { model = "LogicPD Zoom OMAP35xx SOM-LV Development Kit"; - compatible = "logicpd,dm3730-som-lv-devkit", "ti,omap3"; + compatible = "logicpd,dm3730-som-lv-devkit", "ti,omap3430", "ti,omap3"; }; diff --git a/arch/arm/boot/dts/logicpd-torpedo-35xx-devkit.dts b/arch/arm/boot/dts/logicpd-torpedo-35xx-devkit.dts index 7675bc3fa868..57bae2aa910e 100644 --- a/arch/arm/boot/dts/logicpd-torpedo-35xx-devkit.dts +++ b/arch/arm/boot/dts/logicpd-torpedo-35xx-devkit.dts @@ -9,5 +9,5 @@ / { model = "LogicPD Zoom OMAP35xx Torpedo Development Kit"; - compatible = "logicpd,dm3730-torpedo-devkit", "ti,omap3"; + compatible = "logicpd,dm3730-torpedo-devkit", "ti,omap3430", "ti,omap3"; }; diff --git a/arch/arm/boot/dts/motorola-cpcap-mapphone.dtsi b/arch/arm/boot/dts/motorola-cpcap-mapphone.dtsi index d1eae47b83f6..08bae935605c 100644 --- a/arch/arm/boot/dts/motorola-cpcap-mapphone.dtsi +++ b/arch/arm/boot/dts/motorola-cpcap-mapphone.dtsi @@ -43,11 +43,13 @@ compatible = "motorola,mapphone-cpcap-charger"; interrupts-extended = < &cpcap 13 0 &cpcap 12 0 &cpcap 29 0 &cpcap 28 0 - &cpcap 22 0 &cpcap 20 0 &cpcap 19 0 &cpcap 54 0 + &cpcap 22 0 &cpcap 21 0 &cpcap 20 0 &cpcap 19 0 + &cpcap 54 0 >; interrupt-names = "chrg_det", "rvrs_chrg", "chrg_se1b", "se0conn", - "rvrs_mode", "chrgcurr1", "vbusvld", "battdetb"; + "rvrs_mode", "chrgcurr2", "chrgcurr1", "vbusvld", + "battdetb"; mode-gpios = <&gpio3 29 GPIO_ACTIVE_LOW &gpio3 23 GPIO_ACTIVE_LOW>; io-channels = <&cpcap_adc 0 &cpcap_adc 1 diff --git a/arch/arm/boot/dts/omap3-beagle-xm.dts b/arch/arm/boot/dts/omap3-beagle-xm.dts index 1aa99fc1487a..125ed933ca75 100644 --- a/arch/arm/boot/dts/omap3-beagle-xm.dts +++ b/arch/arm/boot/dts/omap3-beagle-xm.dts @@ -8,7 +8,7 @@ / { model = "TI OMAP3 BeagleBoard xM"; - compatible = "ti,omap3-beagle-xm", "ti,omap36xx", "ti,omap3"; + compatible = "ti,omap3-beagle-xm", "ti,omap3630", "ti,omap36xx", "ti,omap3"; cpus { cpu@0 { diff --git a/arch/arm/boot/dts/omap3-beagle.dts b/arch/arm/boot/dts/omap3-beagle.dts index e3df3c166902..4ed3f93f5841 100644 --- a/arch/arm/boot/dts/omap3-beagle.dts +++ b/arch/arm/boot/dts/omap3-beagle.dts @@ -8,7 +8,7 @@ / { model = "TI OMAP3 BeagleBoard"; - compatible = "ti,omap3-beagle", "ti,omap3"; + compatible = "ti,omap3-beagle", "ti,omap3430", "ti,omap3"; cpus { cpu@0 { diff --git a/arch/arm/boot/dts/omap3-cm-t3530.dts b/arch/arm/boot/dts/omap3-cm-t3530.dts index 76e52c78cbb4..32dbaeaed147 100644 --- a/arch/arm/boot/dts/omap3-cm-t3530.dts +++ b/arch/arm/boot/dts/omap3-cm-t3530.dts @@ -9,7 +9,7 @@ / { model = "CompuLab CM-T3530"; - compatible = "compulab,omap3-cm-t3530", "ti,omap34xx", "ti,omap3"; + compatible = "compulab,omap3-cm-t3530", "ti,omap3430", "ti,omap34xx", "ti,omap3"; /* Regulator to trigger the reset signal of the Wifi module */ mmc2_sdio_reset: regulator-mmc2-sdio-reset { diff --git a/arch/arm/boot/dts/omap3-cm-t3730.dts b/arch/arm/boot/dts/omap3-cm-t3730.dts index 6e944dfa0f3d..683819bf0915 100644 --- a/arch/arm/boot/dts/omap3-cm-t3730.dts +++ b/arch/arm/boot/dts/omap3-cm-t3730.dts @@ -9,7 +9,7 @@ / { model = "CompuLab CM-T3730"; - compatible = "compulab,omap3-cm-t3730", "ti,omap36xx", "ti,omap3"; + compatible = "compulab,omap3-cm-t3730", "ti,omap3630", "ti,omap36xx", "ti,omap3"; wl12xx_vmmc2: wl12xx_vmmc2 { compatible = "regulator-fixed"; diff --git a/arch/arm/boot/dts/omap3-devkit8000-lcd43.dts b/arch/arm/boot/dts/omap3-devkit8000-lcd43.dts index a80fc60bc773..afed85078ad8 100644 --- a/arch/arm/boot/dts/omap3-devkit8000-lcd43.dts +++ b/arch/arm/boot/dts/omap3-devkit8000-lcd43.dts @@ -11,7 +11,7 @@ #include "omap3-devkit8000-lcd-common.dtsi" / { model = "TimLL OMAP3 Devkit8000 with 4.3'' LCD panel"; - compatible = "timll,omap3-devkit8000", "ti,omap3"; + compatible = "timll,omap3-devkit8000", "ti,omap3430", "ti,omap3"; lcd0: display { panel-timing { diff --git a/arch/arm/boot/dts/omap3-devkit8000-lcd70.dts b/arch/arm/boot/dts/omap3-devkit8000-lcd70.dts index 0753776071f8..07c51a105c0d 100644 --- a/arch/arm/boot/dts/omap3-devkit8000-lcd70.dts +++ b/arch/arm/boot/dts/omap3-devkit8000-lcd70.dts @@ -11,7 +11,7 @@ #include "omap3-devkit8000-lcd-common.dtsi" / { model = "TimLL OMAP3 Devkit8000 with 7.0'' LCD panel"; - compatible = "timll,omap3-devkit8000", "ti,omap3"; + compatible = "timll,omap3-devkit8000", "ti,omap3430", "ti,omap3"; lcd0: display { panel-timing { diff --git a/arch/arm/boot/dts/omap3-devkit8000.dts b/arch/arm/boot/dts/omap3-devkit8000.dts index faafc48d8f61..162d0726b008 100644 --- a/arch/arm/boot/dts/omap3-devkit8000.dts +++ b/arch/arm/boot/dts/omap3-devkit8000.dts @@ -7,7 +7,7 @@ #include "omap3-devkit8000-common.dtsi" / { model = "TimLL OMAP3 Devkit8000"; - compatible = "timll,omap3-devkit8000", "ti,omap3"; + compatible = "timll,omap3-devkit8000", "ti,omap3430", "ti,omap3"; aliases { display1 = &dvi0; diff --git a/arch/arm/boot/dts/omap3-gta04.dtsi b/arch/arm/boot/dts/omap3-gta04.dtsi index b6ef1a7ac8a4..409a758c99f1 100644 --- a/arch/arm/boot/dts/omap3-gta04.dtsi +++ b/arch/arm/boot/dts/omap3-gta04.dtsi @@ -11,7 +11,7 @@ / { model = "OMAP3 GTA04"; - compatible = "ti,omap3-gta04", "ti,omap36xx", "ti,omap3"; + compatible = "ti,omap3-gta04", "ti,omap3630", "ti,omap36xx", "ti,omap3"; cpus { cpu@0 { diff --git a/arch/arm/boot/dts/omap3-ha-lcd.dts b/arch/arm/boot/dts/omap3-ha-lcd.dts index badb9b3c8897..c9ecbc45c8e2 100644 --- a/arch/arm/boot/dts/omap3-ha-lcd.dts +++ b/arch/arm/boot/dts/omap3-ha-lcd.dts @@ -8,7 +8,7 @@ / { model = "TI OMAP3 HEAD acoustics LCD-baseboard with TAO3530 SOM"; - compatible = "headacoustics,omap3-ha-lcd", "technexion,omap3-tao3530", "ti,omap34xx", "ti,omap3"; + compatible = "headacoustics,omap3-ha-lcd", "technexion,omap3-tao3530", "ti,omap3430", "ti,omap34xx", "ti,omap3"; }; &omap3_pmx_core { diff --git a/arch/arm/boot/dts/omap3-ha.dts b/arch/arm/boot/dts/omap3-ha.dts index a5365252bfbe..35c4e15abeb7 100644 --- a/arch/arm/boot/dts/omap3-ha.dts +++ b/arch/arm/boot/dts/omap3-ha.dts @@ -8,7 +8,7 @@ / { model = "TI OMAP3 HEAD acoustics baseboard with TAO3530 SOM"; - compatible = "headacoustics,omap3-ha", "technexion,omap3-tao3530", "ti,omap34xx", "ti,omap3"; + compatible = "headacoustics,omap3-ha", "technexion,omap3-tao3530", "ti,omap3430", "ti,omap34xx", "ti,omap3"; }; &omap3_pmx_core { diff --git a/arch/arm/boot/dts/omap3-igep0020-rev-f.dts b/arch/arm/boot/dts/omap3-igep0020-rev-f.dts index 03dcd05fb8a0..d134ce1cffc0 100644 --- a/arch/arm/boot/dts/omap3-igep0020-rev-f.dts +++ b/arch/arm/boot/dts/omap3-igep0020-rev-f.dts @@ -10,7 +10,7 @@ / { model = "IGEPv2 Rev. F (TI OMAP AM/DM37x)"; - compatible = "isee,omap3-igep0020-rev-f", "ti,omap36xx", "ti,omap3"; + compatible = "isee,omap3-igep0020-rev-f", "ti,omap3630", "ti,omap36xx", "ti,omap3"; /* Regulator to trigger the WL_EN signal of the Wifi module */ lbep5clwmc_wlen: regulator-lbep5clwmc-wlen { diff --git a/arch/arm/boot/dts/omap3-igep0020.dts b/arch/arm/boot/dts/omap3-igep0020.dts index 6d0519e3dfd0..e341535a7162 100644 --- a/arch/arm/boot/dts/omap3-igep0020.dts +++ b/arch/arm/boot/dts/omap3-igep0020.dts @@ -10,7 +10,7 @@ / { model = "IGEPv2 Rev. C (TI OMAP AM/DM37x)"; - compatible = "isee,omap3-igep0020", "ti,omap36xx", "ti,omap3"; + compatible = "isee,omap3-igep0020", "ti,omap3630", "ti,omap36xx", "ti,omap3"; vmmcsdio_fixed: fixedregulator-mmcsdio { compatible = "regulator-fixed"; diff --git a/arch/arm/boot/dts/omap3-igep0030-rev-g.dts b/arch/arm/boot/dts/omap3-igep0030-rev-g.dts index 060acd1e803a..9ca1d0f61964 100644 --- a/arch/arm/boot/dts/omap3-igep0030-rev-g.dts +++ b/arch/arm/boot/dts/omap3-igep0030-rev-g.dts @@ -10,7 +10,7 @@ / { model = "IGEP COM MODULE Rev. G (TI OMAP AM/DM37x)"; - compatible = "isee,omap3-igep0030-rev-g", "ti,omap36xx", "ti,omap3"; + compatible = "isee,omap3-igep0030-rev-g", "ti,omap3630", "ti,omap36xx", "ti,omap3"; /* Regulator to trigger the WL_EN signal of the Wifi module */ lbep5clwmc_wlen: regulator-lbep5clwmc-wlen { diff --git a/arch/arm/boot/dts/omap3-igep0030.dts b/arch/arm/boot/dts/omap3-igep0030.dts index 25170bd3c573..32f31035daa2 100644 --- a/arch/arm/boot/dts/omap3-igep0030.dts +++ b/arch/arm/boot/dts/omap3-igep0030.dts @@ -10,7 +10,7 @@ / { model = "IGEP COM MODULE Rev. E (TI OMAP AM/DM37x)"; - compatible = "isee,omap3-igep0030", "ti,omap36xx", "ti,omap3"; + compatible = "isee,omap3-igep0030", "ti,omap3630", "ti,omap36xx", "ti,omap3"; vmmcsdio_fixed: fixedregulator-mmcsdio { compatible = "regulator-fixed"; diff --git a/arch/arm/boot/dts/omap3-ldp.dts b/arch/arm/boot/dts/omap3-ldp.dts index 9a5fde2d9bce..ec9ba04ef43b 100644 --- a/arch/arm/boot/dts/omap3-ldp.dts +++ b/arch/arm/boot/dts/omap3-ldp.dts @@ -10,7 +10,7 @@ / { model = "TI OMAP3430 LDP (Zoom1 Labrador)"; - compatible = "ti,omap3-ldp", "ti,omap3"; + compatible = "ti,omap3-ldp", "ti,omap3430", "ti,omap3"; memory@80000000 { device_type = "memory"; diff --git a/arch/arm/boot/dts/omap3-lilly-a83x.dtsi b/arch/arm/boot/dts/omap3-lilly-a83x.dtsi index c22833d4e568..73d477898ec2 100644 --- a/arch/arm/boot/dts/omap3-lilly-a83x.dtsi +++ b/arch/arm/boot/dts/omap3-lilly-a83x.dtsi @@ -7,7 +7,7 @@ / { model = "INCOstartec LILLY-A83X module (DM3730)"; - compatible = "incostartec,omap3-lilly-a83x", "ti,omap36xx", "ti,omap3"; + compatible = "incostartec,omap3-lilly-a83x", "ti,omap3630", "ti,omap36xx", "ti,omap3"; chosen { bootargs = "console=ttyO0,115200n8 vt.global_cursor_default=0 consoleblank=0"; diff --git a/arch/arm/boot/dts/omap3-lilly-dbb056.dts b/arch/arm/boot/dts/omap3-lilly-dbb056.dts index fec335400074..ecb4ef738e07 100644 --- a/arch/arm/boot/dts/omap3-lilly-dbb056.dts +++ b/arch/arm/boot/dts/omap3-lilly-dbb056.dts @@ -8,7 +8,7 @@ / { model = "INCOstartec LILLY-DBB056 (DM3730)"; - compatible = "incostartec,omap3-lilly-dbb056", "incostartec,omap3-lilly-a83x", "ti,omap36xx", "ti,omap3"; + compatible = "incostartec,omap3-lilly-dbb056", "incostartec,omap3-lilly-a83x", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &twl { diff --git a/arch/arm/boot/dts/omap3-n9.dts b/arch/arm/boot/dts/omap3-n9.dts index 74c0ff2350d3..2495a696cec6 100644 --- a/arch/arm/boot/dts/omap3-n9.dts +++ b/arch/arm/boot/dts/omap3-n9.dts @@ -12,7 +12,7 @@ / { model = "Nokia N9"; - compatible = "nokia,omap3-n9", "ti,omap36xx", "ti,omap3"; + compatible = "nokia,omap3-n9", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &i2c2 { diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts index 84a5ade1e865..63659880eeb3 100644 --- a/arch/arm/boot/dts/omap3-n900.dts +++ b/arch/arm/boot/dts/omap3-n900.dts @@ -155,6 +155,12 @@ pwms = <&pwm9 0 26316 0>; /* 38000 Hz */ }; + rom_rng: rng { + compatible = "nokia,n900-rom-rng"; + clocks = <&rng_ick>; + clock-names = "ick"; + }; + /* controlled (enabled/disabled) directly by bcm2048 and wl1251 */ vctcxo: vctcxo { compatible = "fixed-clock"; diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index 6681d4519e97..a075b63f3087 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -11,13 +11,6 @@ cpus { cpu@0 { cpu0-supply = <&vcc>; - operating-points = < - /* kHz uV */ - 300000 1012500 - 600000 1200000 - 800000 1325000 - 1000000 1375000 - >; }; }; diff --git a/arch/arm/boot/dts/omap3-n950.dts b/arch/arm/boot/dts/omap3-n950.dts index 9886bf8b90ab..31d47a1fad84 100644 --- a/arch/arm/boot/dts/omap3-n950.dts +++ b/arch/arm/boot/dts/omap3-n950.dts @@ -12,7 +12,7 @@ / { model = "Nokia N950"; - compatible = "nokia,omap3-n950", "ti,omap36xx", "ti,omap3"; + compatible = "nokia,omap3-n950", "ti,omap3630", "ti,omap36xx", "ti,omap3"; keys { compatible = "gpio-keys"; diff --git a/arch/arm/boot/dts/omap3-overo-storm-alto35.dts b/arch/arm/boot/dts/omap3-overo-storm-alto35.dts index 18338576c41d..7f04dfad8203 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-alto35.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-alto35.dts @@ -14,5 +14,5 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on Alto35"; - compatible = "gumstix,omap3-overo-alto35", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-alto35", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; diff --git a/arch/arm/boot/dts/omap3-overo-storm-chestnut43.dts b/arch/arm/boot/dts/omap3-overo-storm-chestnut43.dts index f204c8af8281..bc5a04e03336 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-chestnut43.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-chestnut43.dts @@ -14,7 +14,7 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on Chestnut43"; - compatible = "gumstix,omap3-overo-chestnut43", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-chestnut43", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &omap3_pmx_core2 { diff --git a/arch/arm/boot/dts/omap3-overo-storm-gallop43.dts b/arch/arm/boot/dts/omap3-overo-storm-gallop43.dts index c633f7cee68e..065c31cbf0e2 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-gallop43.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-gallop43.dts @@ -14,7 +14,7 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on Gallop43"; - compatible = "gumstix,omap3-overo-gallop43", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-gallop43", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &omap3_pmx_core2 { diff --git a/arch/arm/boot/dts/omap3-overo-storm-palo35.dts b/arch/arm/boot/dts/omap3-overo-storm-palo35.dts index fb88ebc9858c..e38c1c51392c 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-palo35.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-palo35.dts @@ -14,7 +14,7 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on Palo35"; - compatible = "gumstix,omap3-overo-palo35", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-palo35", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &omap3_pmx_core2 { diff --git a/arch/arm/boot/dts/omap3-overo-storm-palo43.dts b/arch/arm/boot/dts/omap3-overo-storm-palo43.dts index 76cca00d97b6..e6dc23159c4d 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-palo43.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-palo43.dts @@ -14,7 +14,7 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on Palo43"; - compatible = "gumstix,omap3-overo-palo43", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-palo43", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &omap3_pmx_core2 { diff --git a/arch/arm/boot/dts/omap3-overo-storm-summit.dts b/arch/arm/boot/dts/omap3-overo-storm-summit.dts index cc081a9e4c1e..587c08ce282d 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-summit.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-summit.dts @@ -14,7 +14,7 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on Summit"; - compatible = "gumstix,omap3-overo-summit", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-summit", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &omap3_pmx_core2 { diff --git a/arch/arm/boot/dts/omap3-overo-storm-tobi.dts b/arch/arm/boot/dts/omap3-overo-storm-tobi.dts index 1de41c0826e0..f57de6010994 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-tobi.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-tobi.dts @@ -14,6 +14,6 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on Tobi"; - compatible = "gumstix,omap3-overo-tobi", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-tobi", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; diff --git a/arch/arm/boot/dts/omap3-overo-storm-tobiduo.dts b/arch/arm/boot/dts/omap3-overo-storm-tobiduo.dts index 9ed13118ed8e..281af6c113be 100644 --- a/arch/arm/boot/dts/omap3-overo-storm-tobiduo.dts +++ b/arch/arm/boot/dts/omap3-overo-storm-tobiduo.dts @@ -14,5 +14,5 @@ / { model = "OMAP36xx/AM37xx/DM37xx Gumstix Overo on TobiDuo"; - compatible = "gumstix,omap3-overo-tobiduo", "gumstix,omap3-overo", "ti,omap36xx", "ti,omap3"; + compatible = "gumstix,omap3-overo-tobiduo", "gumstix,omap3-overo", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; diff --git a/arch/arm/boot/dts/omap3-pandora-1ghz.dts b/arch/arm/boot/dts/omap3-pandora-1ghz.dts index 81b957f33c9f..ea509956d7ac 100644 --- a/arch/arm/boot/dts/omap3-pandora-1ghz.dts +++ b/arch/arm/boot/dts/omap3-pandora-1ghz.dts @@ -16,7 +16,7 @@ / { model = "Pandora Handheld Console 1GHz"; - compatible = "openpandora,omap3-pandora-1ghz", "ti,omap36xx", "ti,omap3"; + compatible = "openpandora,omap3-pandora-1ghz", "ti,omap3630", "ti,omap36xx", "ti,omap3"; }; &omap3_pmx_core2 { diff --git a/arch/arm/boot/dts/omap3-pandora-common.dtsi b/arch/arm/boot/dts/omap3-pandora-common.dtsi index ec5891718ae6..150d5be42d27 100644 --- a/arch/arm/boot/dts/omap3-pandora-common.dtsi +++ b/arch/arm/boot/dts/omap3-pandora-common.dtsi @@ -226,6 +226,17 @@ gpio = <&gpio6 4 GPIO_ACTIVE_HIGH>; /* GPIO_164 */ }; + /* wl1251 wifi+bt module */ + wlan_en: fixed-regulator-wg7210_en { + compatible = "regulator-fixed"; + regulator-name = "vwlan"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + startup-delay-us = <50000>; + enable-active-high; + gpio = <&gpio1 23 GPIO_ACTIVE_HIGH>; + }; + /* wg7210 (wifi+bt module) 32k clock buffer */ wg7210_32k: fixed-regulator-wg7210_32k { compatible = "regulator-fixed"; @@ -522,9 +533,30 @@ /*wp-gpios = <&gpio4 31 GPIO_ACTIVE_HIGH>;*/ /* GPIO_127 */ }; -/* mmc3 is probed using pdata-quirks to pass wl1251 card data */ &mmc3 { - status = "disabled"; + vmmc-supply = <&wlan_en>; + + bus-width = <4>; + non-removable; + ti,non-removable; + cap-power-off-card; + + pinctrl-names = "default"; + pinctrl-0 = <&mmc3_pins>; + + #address-cells = <1>; + #size-cells = <0>; + + wlan: wifi@1 { + compatible = "ti,wl1251"; + + reg = <1>; + + interrupt-parent = <&gpio1>; + interrupts = <21 IRQ_TYPE_LEVEL_HIGH>; /* GPIO_21 */ + + ti,wl1251-has-eeprom; + }; }; /* bluetooth*/ diff --git a/arch/arm/boot/dts/omap3-sbc-t3530.dts b/arch/arm/boot/dts/omap3-sbc-t3530.dts index ae96002abb3b..24bf3fd86641 100644 --- a/arch/arm/boot/dts/omap3-sbc-t3530.dts +++ b/arch/arm/boot/dts/omap3-sbc-t3530.dts @@ -8,7 +8,7 @@ / { model = "CompuLab SBC-T3530 with CM-T3530"; - compatible = "compulab,omap3-sbc-t3530", "compulab,omap3-cm-t3530", "ti,omap34xx", "ti,omap3"; + compatible = "compulab,omap3-sbc-t3530", "compulab,omap3-cm-t3530", "ti,omap3430", "ti,omap34xx", "ti,omap3"; aliases { display0 = &dvi0; diff --git a/arch/arm/boot/dts/omap3-sbc-t3730.dts b/arch/arm/boot/dts/omap3-sbc-t3730.dts index 7de6df16fc17..eb3893b9535e 100644 --- a/arch/arm/boot/dts/omap3-sbc-t3730.dts +++ b/arch/arm/boot/dts/omap3-sbc-t3730.dts @@ -8,7 +8,7 @@ / { model = "CompuLab SBC-T3730 with CM-T3730"; - compatible = "compulab,omap3-sbc-t3730", "compulab,omap3-cm-t3730", "ti,omap36xx", "ti,omap3"; + compatible = "compulab,omap3-sbc-t3730", "compulab,omap3-cm-t3730", "ti,omap3630", "ti,omap36xx", "ti,omap3"; aliases { display0 = &dvi0; diff --git a/arch/arm/boot/dts/omap3-sniper.dts b/arch/arm/boot/dts/omap3-sniper.dts index 40a87330e8c3..b6879cdc5c13 100644 --- a/arch/arm/boot/dts/omap3-sniper.dts +++ b/arch/arm/boot/dts/omap3-sniper.dts @@ -9,7 +9,7 @@ / { model = "LG Optimus Black"; - compatible = "lg,omap3-sniper", "ti,omap36xx", "ti,omap3"; + compatible = "lg,omap3-sniper", "ti,omap3630", "ti,omap36xx", "ti,omap3"; cpus { cpu@0 { diff --git a/arch/arm/boot/dts/omap3-thunder.dts b/arch/arm/boot/dts/omap3-thunder.dts index 6276e7079b36..64221e3b3477 100644 --- a/arch/arm/boot/dts/omap3-thunder.dts +++ b/arch/arm/boot/dts/omap3-thunder.dts @@ -8,7 +8,7 @@ / { model = "TI OMAP3 Thunder baseboard with TAO3530 SOM"; - compatible = "technexion,omap3-thunder", "technexion,omap3-tao3530", "ti,omap34xx", "ti,omap3"; + compatible = "technexion,omap3-thunder", "technexion,omap3-tao3530", "ti,omap3430", "ti,omap34xx", "ti,omap3"; }; &omap3_pmx_core { diff --git a/arch/arm/boot/dts/omap3-zoom3.dts b/arch/arm/boot/dts/omap3-zoom3.dts index db3a2fe84e99..d240e39f2151 100644 --- a/arch/arm/boot/dts/omap3-zoom3.dts +++ b/arch/arm/boot/dts/omap3-zoom3.dts @@ -9,7 +9,7 @@ / { model = "TI Zoom3"; - compatible = "ti,omap3-zoom3", "ti,omap36xx", "ti,omap3"; + compatible = "ti,omap3-zoom3", "ti,omap3630", "ti,omap36xx", "ti,omap3"; cpus { cpu@0 { diff --git a/arch/arm/boot/dts/omap3430-sdp.dts b/arch/arm/boot/dts/omap3430-sdp.dts index 0abd61108a53..7bfde8aac7ae 100644 --- a/arch/arm/boot/dts/omap3430-sdp.dts +++ b/arch/arm/boot/dts/omap3430-sdp.dts @@ -8,7 +8,7 @@ / { model = "TI OMAP3430 SDP"; - compatible = "ti,omap3430-sdp", "ti,omap3"; + compatible = "ti,omap3430-sdp", "ti,omap3430", "ti,omap3"; memory@80000000 { device_type = "memory"; diff --git a/arch/arm/boot/dts/omap34xx.dtsi b/arch/arm/boot/dts/omap34xx.dtsi index 7b09cbee8bb8..c4dd9801840d 100644 --- a/arch/arm/boot/dts/omap34xx.dtsi +++ b/arch/arm/boot/dts/omap34xx.dtsi @@ -16,19 +16,67 @@ / { cpus { cpu: cpu@0 { - /* OMAP343x/OMAP35xx variants OPP1-5 */ - operating-points = < - /* kHz uV */ - 125000 975000 - 250000 1075000 - 500000 1200000 - 550000 1270000 - 600000 1350000 - >; + /* OMAP343x/OMAP35xx variants OPP1-6 */ + operating-points-v2 = <&cpu0_opp_table>; + clock-latency = <300000>; /* From legacy driver */ }; }; + /* see Documentation/devicetree/bindings/opp/opp.txt */ + cpu0_opp_table: opp-table { + compatible = "operating-points-v2-ti-cpu"; + syscon = <&scm_conf>; + + opp1-125000000 { + opp-hz = /bits/ 64 <125000000>; + /* + * we currently only select the max voltage from table + * Table 3-3 of the omap3530 Data sheet (SPRS507F). + * Format is: <target min max> + */ + opp-microvolt = <975000 975000 975000>; + /* + * first value is silicon revision bit mask + * second one 720MHz Device Identification bit mask + */ + opp-supported-hw = <0xffffffff 3>; + }; + + opp2-250000000 { + opp-hz = /bits/ 64 <250000000>; + opp-microvolt = <1075000 1075000 1075000>; + opp-supported-hw = <0xffffffff 3>; + opp-suspend; + }; + + opp3-500000000 { + opp-hz = /bits/ 64 <500000000>; + opp-microvolt = <1200000 1200000 1200000>; + opp-supported-hw = <0xffffffff 3>; + }; + + opp4-550000000 { + opp-hz = /bits/ 64 <550000000>; + opp-microvolt = <1275000 1275000 1275000>; + opp-supported-hw = <0xffffffff 3>; + }; + + opp5-600000000 { + opp-hz = /bits/ 64 <600000000>; + opp-microvolt = <1350000 1350000 1350000>; + opp-supported-hw = <0xffffffff 3>; + }; + + opp6-720000000 { + opp-hz = /bits/ 64 <720000000>; + opp-microvolt = <1350000 1350000 1350000>; + /* only high-speed grade omap3530 devices */ + opp-supported-hw = <0xffffffff 2>; + turbo-mode; + }; + }; + ocp@68000000 { omap3_pmx_core2: pinmux@480025d8 { compatible = "ti,omap3-padconf", "pinctrl-single"; diff --git a/arch/arm/boot/dts/omap36xx.dtsi b/arch/arm/boot/dts/omap36xx.dtsi index 1e552f08f120..c618cb257d00 100644 --- a/arch/arm/boot/dts/omap36xx.dtsi +++ b/arch/arm/boot/dts/omap36xx.dtsi @@ -19,16 +19,65 @@ }; cpus { - /* OMAP3630/OMAP37xx 'standard device' variants OPP50 to OPP130 */ + /* OMAP3630/OMAP37xx variants OPP50 to OPP130 and OPP1G */ cpu: cpu@0 { - operating-points = < - /* kHz uV */ - 300000 1012500 - 600000 1200000 - 800000 1325000 - >; - clock-latency = <300000>; /* From legacy driver */ + operating-points-v2 = <&cpu0_opp_table>; + + vbb-supply = <&abb_mpu_iva>; + clock-latency = <300000>; /* From omap-cpufreq driver */ + }; + }; + + /* see Documentation/devicetree/bindings/opp/opp.txt */ + cpu0_opp_table: opp-table { + compatible = "operating-points-v2-ti-cpu"; + syscon = <&scm_conf>; + + opp50-300000000 { + opp-hz = /bits/ 64 <300000000>; + /* + * we currently only select the max voltage from table + * Table 4-19 of the DM3730 Data sheet (SPRS685B) + * Format is: cpu0-supply: <target min max> + * vbb-supply: <target min max> + */ + opp-microvolt = <1012500 1012500 1012500>, + <1012500 1012500 1012500>; + /* + * first value is silicon revision bit mask + * second one is "speed binned" bit mask + */ + opp-supported-hw = <0xffffffff 3>; + opp-suspend; + }; + + opp100-600000000 { + opp-hz = /bits/ 64 <600000000>; + opp-microvolt = <1200000 1200000 1200000>, + <1200000 1200000 1200000>; + opp-supported-hw = <0xffffffff 3>; + }; + + opp130-800000000 { + opp-hz = /bits/ 64 <800000000>; + opp-microvolt = <1325000 1325000 1325000>, + <1325000 1325000 1325000>; + opp-supported-hw = <0xffffffff 3>; }; + + opp1g-1000000000 { + opp-hz = /bits/ 64 <1000000000>; + opp-microvolt = <1375000 1375000 1375000>, + <1375000 1375000 1375000>; + /* only on am/dm37x with speed-binned bit set */ + opp-supported-hw = <0xffffffff 2>; + turbo-mode; + }; + }; + + opp_supply_mpu_iva: opp_supply { + compatible = "ti,omap-opp-supply"; + ti,absolute-max-voltage-uv = <1375000>; }; ocp@68000000 { diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 40d7f1a4fc45..89cce8d4bc6b 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -554,3 +554,4 @@ CONFIG_DEBUG_INFO_DWARF4=y CONFIG_MAGIC_SYSRQ=y CONFIG_SCHEDSTATS=y # CONFIG_DEBUG_BUGVERBOSE is not set +CONFIG_TI_CPSW_SWITCHDEV=y diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 043b0b18bf7e..2674de6ada1f 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -30,7 +30,7 @@ config CRYPTO_SHA1_ARM_NEON config CRYPTO_SHA1_ARM_CE tristate "SHA1 digest algorithm (ARM v8 Crypto Extensions)" - depends on KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) select CRYPTO_SHA1_ARM select CRYPTO_HASH help @@ -39,7 +39,7 @@ config CRYPTO_SHA1_ARM_CE config CRYPTO_SHA2_ARM_CE tristate "SHA-224/256 digest algorithm (ARM v8 Crypto Extensions)" - depends on KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) select CRYPTO_SHA256_ARM select CRYPTO_HASH help @@ -81,7 +81,7 @@ config CRYPTO_AES_ARM config CRYPTO_AES_ARM_BS tristate "Bit sliced AES using NEON instructions" depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER + select CRYPTO_SKCIPHER select CRYPTO_LIB_AES select CRYPTO_SIMD help @@ -96,8 +96,8 @@ config CRYPTO_AES_ARM_BS config CRYPTO_AES_ARM_CE tristate "Accelerated AES using ARMv8 Crypto Extensions" - depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER + depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + select CRYPTO_SKCIPHER select CRYPTO_LIB_AES select CRYPTO_SIMD help @@ -106,7 +106,7 @@ config CRYPTO_AES_ARM_CE config CRYPTO_GHASH_ARM_CE tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions" - depends on KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) select CRYPTO_HASH select CRYPTO_CRYPTD select CRYPTO_GF128MUL @@ -118,23 +118,35 @@ config CRYPTO_GHASH_ARM_CE config CRYPTO_CRCT10DIF_ARM_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" - depends on KERNEL_MODE_NEON && CRC_T10DIF + depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on CRC_T10DIF select CRYPTO_HASH config CRYPTO_CRC32_ARM_CE tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions" - depends on KERNEL_MODE_NEON && CRC32 + depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on CRC32 select CRYPTO_HASH config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha stream cipher algorithms" - depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER - select CRYPTO_CHACHA20 + tristate "NEON and scalar accelerated ChaCha stream cipher algorithms" + select CRYPTO_SKCIPHER + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_ARM + tristate "Accelerated scalar and SIMD Poly1305 hash implementations" + select CRYPTO_HASH + select CRYPTO_ARCH_HAVE_LIB_POLY1305 config CRYPTO_NHPOLY1305_NEON tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)" depends on KERNEL_MODE_NEON select CRYPTO_NHPOLY1305 +config CRYPTO_CURVE25519_NEON + tristate "NEON accelerated Curve25519 scalar multiplication library" + depends on KERNEL_MODE_NEON + select CRYPTO_LIB_CURVE25519_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CURVE25519 + endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 4180f3a13512..b745c17d356f 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -10,34 +10,16 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o +obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o -ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o -ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o -ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o -ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o -ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o -crc-obj-$(CONFIG_CRYPTO_CRC32_ARM_CE) += crc32-arm-ce.o - -ifneq ($(crc-obj-y)$(crc-obj-m),) -ifeq ($(call as-instr,.arch armv8-a\n.arch_extension crc,y,n),y) -ce-obj-y += $(crc-obj-y) -ce-obj-m += $(crc-obj-m) -else -$(warning These CRC Extensions modules need binutils 2.23 or higher) -$(warning $(crc-obj-y) $(crc-obj-m)) -endif -endif - -ifneq ($(ce-obj-y)$(ce-obj-m),) -ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y) -obj-y += $(ce-obj-y) -obj-m += $(ce-obj-m) -else -$(warning These ARMv8 Crypto Extensions modules need binutils 2.23 or higher) -$(warning $(ce-obj-y) $(ce-obj-m)) -endif -endif +obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o +obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o +obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o +obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o +obj-$(CONFIG_CRYPTO_CRC32_ARM_CE) += crc32-arm-ce.o aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o @@ -53,13 +35,19 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o +chacha-neon-y := chacha-scalar-core.o chacha-glue.o +chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o +poly1305-arm-y := poly1305-core.o poly1305-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o +curve25519-neon-y := curve25519-core.o curve25519-glue.o ifdef REGENERATE_ARM_CRYPTO quiet_cmd_perl = PERL $@ cmd_perl = $(PERL) $(<) > $(@) +$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl + $(call cmd,perl) + $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl $(call cmd,perl) @@ -67,4 +55,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl $(call cmd,perl) endif -clean-files += sha256-core.S sha512-core.S +clean-files += poly1305-core.S sha256-core.S sha512-core.S + +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_poly1305-core.o += $(poly1305-aflags-y) diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c new file mode 100644 index 000000000000..3f0c057aa050 --- /dev/null +++ b/arch/arm/crypto/chacha-glue.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM NEON accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2015 Martin Willi + */ + +#include <crypto/algapi.h> +#include <crypto/internal/chacha.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cputype.h> +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds); +asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); + +asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + const u32 *state, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); + +static inline bool neon_usable(void) +{ + return static_branch_likely(&use_neon) && crypto_simd_usable(); +} + +static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_neon(state, dst, src, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA_BLOCK_SIZE) { + chacha_block_xor_neon(state, dst, src, nrounds); + bytes -= CHACHA_BLOCK_SIZE; + src += CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, bytes); + } +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { + hchacha_block_arm(state, stream, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, stream, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || + bytes <= CHACHA_BLOCK_SIZE) { + chacha_doarm(dst, src, bytes, state, nrounds); + state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); + return; + } + + kernel_neon_begin(); + chacha_doneon(state, dst, src, bytes, nrounds); + kernel_neon_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_stream_xor(struct skcipher_request *req, + const struct chacha_ctx *ctx, const u8 *iv, + bool neon) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + chacha_init_generic(state, ctx->key, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + if (!neon) { + chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr, + nbytes, state, ctx->nrounds); + state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE); + } else { + kernel_neon_begin(); + chacha_doneon(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, ctx->nrounds); + kernel_neon_end(); + } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int do_chacha(struct skcipher_request *req, bool neon) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + return chacha_stream_xor(req, ctx, req->iv, neon); +} + +static int chacha_arm(struct skcipher_request *req) +{ + return do_chacha(req, false); +} + +static int chacha_neon(struct skcipher_request *req) +{ + return do_chacha(req, neon_usable()); +} + +static int do_xchacha(struct skcipher_request *req, bool neon) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + chacha_init_generic(state, ctx->key, req->iv); + + if (!neon) { + hchacha_block_arm(state, subctx.key, ctx->nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, subctx.key, ctx->nrounds); + kernel_neon_end(); + } + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_stream_xor(req, &subctx, real_iv, neon); +} + +static int xchacha_arm(struct skcipher_request *req) +{ + return do_xchacha(req, false); +} + +static int xchacha_neon(struct skcipher_request *req) +{ + return do_xchacha(req, neon_usable()); +} + +static struct skcipher_alg arm_algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-arm", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha_arm, + .decrypt = chacha_arm, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-arm", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = xchacha_arm, + .decrypt = xchacha_arm, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-arm", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha12_setkey, + .encrypt = xchacha_arm, + .decrypt = xchacha_arm, + }, +}; + +static struct skcipher_alg neon_algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha_neon, + .decrypt = chacha_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = chacha12_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + } +}; + +static int __init chacha_simd_mod_init(void) +{ + int err; + + err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs)); + if (err) + return err; + + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { + int i; + + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A7: + case ARM_CPU_PART_CORTEX_A5: + /* + * The Cortex-A7 and Cortex-A5 do not perform well with + * the NEON implementation but do incredibly with the + * scalar one and use less power. + */ + for (i = 0; i < ARRAY_SIZE(neon_algs); i++) + neon_algs[i].base.cra_priority = 0; + break; + default: + static_branch_enable(&use_neon); + } + + err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs)); + if (err) + crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs)); + } + return err; +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs)); + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) + crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-arm"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-arm"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-arm"); +#ifdef CONFIG_KERNEL_MODE_NEON +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha12-neon"); +#endif diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c deleted file mode 100644 index a8e9b534c8da..000000000000 --- a/arch/arm/crypto/chacha-neon-glue.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * ARM NEON accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <crypto/algapi.h> -#include <crypto/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, - int nrounds); -asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, - int nrounds); -asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); - -static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - u8 buf[CHACHA_BLOCK_SIZE]; - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_neon(state, dst, src, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - while (bytes >= CHACHA_BLOCK_SIZE) { - chacha_block_xor_neon(state, dst, src, nrounds); - bytes -= CHACHA_BLOCK_SIZE; - src += CHACHA_BLOCK_SIZE; - dst += CHACHA_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, bytes); - } -} - -static int chacha_neon_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - crypto_chacha_init(state, ctx, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - kernel_neon_begin(); - chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes, ctx->nrounds); - kernel_neon_end(); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_chacha_crypt(req); - - return chacha_neon_stream_xor(req, ctx, req->iv); -} - -static int xchacha_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_xchacha_crypt(req); - - crypto_chacha_init(state, ctx, req->iv); - - kernel_neon_begin(); - hchacha_block_neon(state, subctx.key, ctx->nrounds); - kernel_neon_end(); - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_neon_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha_neon, - .decrypt = chacha_neon, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha12_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - } -}; - -static int __init chacha_simd_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); -} - -static void __exit chacha_simd_mod_fini(void) -{ - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha_simd_mod_init); -module_exit(chacha_simd_mod_fini); - -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-neon"); diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S new file mode 100644 index 000000000000..2985b80a45b5 --- /dev/null +++ b/arch/arm/crypto/chacha-scalar-core.S @@ -0,0 +1,460 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Google, Inc. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Design notes: + * + * 16 registers would be needed to hold the state matrix, but only 14 are + * available because 'sp' and 'pc' cannot be used. So we spill the elements + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one + * 'ldrd' and one 'strd' instruction per round. + * + * All rotates are performed using the implicit rotate operand accepted by the + * 'add' and 'eor' instructions. This is faster than using explicit rotate + * instructions. To make this work, we allow the values in the second and last + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the + * wrong rotation amount. The rotation amount is then fixed up just in time + * when the values are used. 'brot' is the number of bits the values in row 'b' + * need to be rotated right to arrive at the correct values, and 'drot' + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such + * that they end up as (25, 24) after every round. + */ + + // ChaCha state registers + X0 .req r0 + X1 .req r1 + X2 .req r2 + X3 .req r3 + X4 .req r4 + X5 .req r5 + X6 .req r6 + X7 .req r7 + X8_X10 .req r8 // shared by x8 and x10 + X9_X11 .req r9 // shared by x9 and x11 + X12 .req r10 + X13 .req r11 + X14 .req r12 + X15 .req r14 + +.macro __rev out, in, t0, t1, t2 +.if __LINUX_ARM_ARCH__ >= 6 + rev \out, \in +.else + lsl \t0, \in, #24 + and \t1, \in, #0xff00 + and \t2, \in, #0xff0000 + orr \out, \t0, \in, lsr #24 + orr \out, \out, \t1, lsl #8 + orr \out, \out, \t2, lsr #8 +.endif +.endm + +.macro _le32_bswap x, t0, t1, t2 +#ifdef __ARMEB__ + __rev \x, \x, \t0, \t1, \t2 +#endif +.endm + +.macro _le32_bswap_4x a, b, c, d, t0, t1, t2 + _le32_bswap \a, \t0, \t1, \t2 + _le32_bswap \b, \t0, \t1, \t2 + _le32_bswap \c, \t0, \t1, \t2 + _le32_bswap \d, \t0, \t1, \t2 +.endm + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 + + // a += b; d ^= a; d = rol(d, 16); + add \a1, \a1, \b1, ror #brot + add \a2, \a2, \b2, ror #brot + eor \d1, \a1, \d1, ror #drot + eor \d2, \a2, \d2, ror #drot + // drot == 32 - 16 == 16 + + // c += d; b ^= c; b = rol(b, 12); + add \c1, \c1, \d1, ror #16 + add \c2, \c2, \d2, ror #16 + eor \b1, \c1, \b1, ror #brot + eor \b2, \c2, \b2, ror #brot + // brot == 32 - 12 == 20 + + // a += b; d ^= a; d = rol(d, 8); + add \a1, \a1, \b1, ror #20 + add \a2, \a2, \b2, ror #20 + eor \d1, \a1, \d1, ror #16 + eor \d2, \a2, \d2, ror #16 + // drot == 32 - 8 == 24 + + // c += d; b ^= c; b = rol(b, 7); + add \c1, \c1, \d1, ror #24 + add \c2, \c2, \d2, ror #24 + eor \b1, \c1, \b1, ror #20 + eor \b2, \c2, \b2, ror #20 + // brot == 32 - 7 == 25 +.endm + +.macro _doubleround + + // column round + + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 + + // save (x8, x9); restore (x10, x11) + __strd X8_X10, X9_X11, sp, 0 + __ldrd X8_X10, X9_X11, sp, 8 + + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 + + .set brot, 25 + .set drot, 24 + + // diagonal round + + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 + + // save (x10, x11); restore (x8, x9) + __strd X8_X10, X9_X11, sp, 8 + __ldrd X8_X10, X9_X11, sp, 0 + + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 +.endm + +.macro _chacha_permute nrounds + .set brot, 0 + .set drot, 0 + .rept \nrounds / 2 + _doubleround + .endr +.endm + +.macro _chacha nrounds + +.Lnext_block\@: + // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + + // Do the core ChaCha permutation to update x0-x15. + _chacha_permute \nrounds + + add sp, #8 + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). + push {X8_X10, X9_X11, X12, X13, X14, X15} + + // Load (OUT, IN, LEN). + ldr r14, [sp, #96] + ldr r12, [sp, #100] + ldr r11, [sp, #104] + + orr r10, r14, r12 + + // Use slow path if fewer than 64 bytes remain. + cmp r11, #64 + blt .Lxor_slowpath\@ + + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on + // ARMv6+, since ldmia and stmia (used below) still require alignment. + tst r10, #3 + bne .Lxor_slowpath\@ + + // Fast path: XOR 64 bytes of aligned data. + + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // x0-x3 + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + ldmia r12!, {r8-r11} + eor X0, X0, r8 + eor X1, X1, r9 + eor X2, X2, r10 + eor X3, X3, r11 + stmia r14!, {X0-X3} + + // x4-x7 + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + ldmia r12!, {X0-X3} + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + eor X4, X4, X0 + eor X5, X5, X1 + eor X6, X6, X2 + eor X7, X7, X3 + stmia r14!, {X4-X7} + + // x8-x15 + pop {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + ldmia r12!, {r8-r11} + eor r0, r0, r8 // x8 + eor r1, r1, r9 // x9 + eor r6, r6, r10 // x10 + eor r7, r7, r11 // x11 + stmia r14!, {r0,r1,r6,r7} + ldmia r12!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + ldr r9, [sp, #72] // load LEN + eor r2, r2, r0 // x12 + eor r3, r3, r1 // x13 + eor r4, r4, r6 // x14 + eor r5, r5, r7 // x15 + subs r9, #64 // decrement and check LEN + stmia r14!, {r2-r5} + + beq .Ldone\@ + +.Lprepare_for_next_block\@: + + // Stack: x0-x15 OUT IN LEN + + // Increment block counter (x12) + add r8, #1 + + // Store updated (OUT, IN, LEN) + str r14, [sp, #64] + str r12, [sp, #68] + str r9, [sp, #72] + + mov r14, sp + + // Store updated block counter (x12) + str r8, [sp, #48] + + sub sp, #16 + + // Reload state and do next block + ldmia r14!, {r0-r11} // load x0-x11 + __strd r10, r11, sp, 8 // store x10-x11 before state + ldmia r14, {r10-r12,r14} // load x12-x15 + b .Lnext_block\@ + +.Lxor_slowpath\@: + // Slow path: < 64 bytes remaining, or unaligned input or output buffer. + // We handle it by storing the 64 bytes of keystream to the stack, then + // XOR-ing the needed portion with the data. + + // Allocate keystream buffer + sub sp, #64 + mov r14, sp + + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Save keystream for x0-x3 + __ldrd r8, r9, sp, 96 + __ldrd r10, r11, sp, 104 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + stmia r14!, {X0-X3} + + // Save keystream for x4-x7 + __ldrd r8, r9, sp, 112 + __ldrd r10, r11, sp, 120 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + add r8, sp, #64 + stmia r14!, {X4-X7} + + // Save keystream for x8-x15 + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 128 + __ldrd r10, r11, sp, 136 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + stmia r14!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 144 + __ldrd r10, r11, sp, 152 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + stmia r14, {r2-r5} + + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN + // Registers: r8 is block counter, r12 is IN. + + ldr r9, [sp, #168] // LEN + ldr r14, [sp, #160] // OUT + cmp r9, #64 + mov r0, sp + movle r1, r9 + movgt r1, #64 + // r1 is number of bytes to XOR, in range [1, 64] + +.if __LINUX_ARM_ARCH__ < 6 + orr r2, r12, r14 + tst r2, #3 // IN or OUT misaligned? + bne .Lxor_next_byte\@ +.endif + + // XOR a word at a time +.rept 16 + subs r1, #4 + blt .Lxor_words_done\@ + ldr r2, [r12], #4 + ldr r3, [r0], #4 + eor r2, r2, r3 + str r2, [r14], #4 +.endr + b .Lxor_slowpath_done\@ +.Lxor_words_done\@: + ands r1, r1, #3 + beq .Lxor_slowpath_done\@ + + // XOR a byte at a time +.Lxor_next_byte\@: + ldrb r2, [r12], #1 + ldrb r3, [r0], #1 + eor r2, r2, r3 + strb r2, [r14], #1 + subs r1, #1 + bne .Lxor_next_byte\@ + +.Lxor_slowpath_done\@: + subs r9, #64 + add sp, #96 + bgt .Lprepare_for_next_block\@ + +.Ldone\@: +.endm // _chacha + +/* + * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + * const u32 *state, int nrounds); + */ +ENTRY(chacha_doarm) + cmp r2, #0 // len == 0? + reteq lr + + ldr ip, [sp] + cmp ip, #12 + + push {r0-r2,r4-r11,lr} + + // Push state x0-x15 onto stack. + // Also store an extra copy of x10-x11 just before the state. + + add X12, r3, #48 + ldm X12, {X12,X13,X14,X15} + push {X12,X13,X14,X15} + sub sp, sp, #64 + + __ldrd X8_X10, X9_X11, r3, 40 + __strd X8_X10, X9_X11, sp, 8 + __strd X8_X10, X9_X11, sp, 56 + ldm r3, {X0-X9_X11} + __strd X0, X1, sp, 16 + __strd X2, X3, sp, 24 + __strd X4, X5, sp, 32 + __strd X6, X7, sp, 40 + __strd X8_X10, X9_X11, sp, 48 + + beq 1f + _chacha 20 + +0: add sp, #76 + pop {r4-r11, pc} + +1: _chacha 12 + b 0b +ENDPROC(chacha_doarm) + +/* + * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds); + */ +ENTRY(hchacha_block_arm) + push {r1,r4-r11,lr} + + cmp r2, #12 // ChaCha12 ? + + mov r14, r0 + ldmia r14!, {r0-r11} // load x0-x11 + push {r10-r11} // store x10-x11 to stack + ldm r14, {r10-r12,r14} // load x12-x15 + sub sp, #8 + + beq 1f + _chacha_permute 20 + + // Skip over (unused0-unused1, x10-x11) +0: add sp, #16 + + // Fix up rotations of x12-x15 + ror X12, X12, #drot + ror X13, X13, #drot + pop {r4} // load 'out' + ror X14, X14, #drot + ror X15, X15, #drot + + // Store (x0-x3,x12-x15) to 'out' + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} + + pop {r4-r11,pc} + +1: _chacha_permute 12 + b 0b +ENDPROC(hchacha_block_arm) diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 86be258a803f..46c02c518a30 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -72,7 +72,7 @@ #endif .text - .arch armv7-a + .arch armv8-a .fpu crypto-neon-fp-armv8 init_crc .req r0 diff --git a/arch/arm/crypto/curve25519-core.S b/arch/arm/crypto/curve25519-core.S new file mode 100644 index 000000000000..be18af52e7dc --- /dev/null +++ b/arch/arm/crypto/curve25519-core.S @@ -0,0 +1,2062 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This + * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been + * manually reworked for use in kernel space. + */ + +#include <linux/linkage.h> + +.text +.fpu neon +.arch armv7-a +.align 4 + +ENTRY(curve25519_neon) + push {r4-r11, lr} + mov ip, sp + sub r3, sp, #704 + and r3, r3, #0xfffffff0 + mov sp, r3 + movw r4, #0 + movw r5, #254 + vmov.i32 q0, #1 + vshr.u64 q1, q0, #7 + vshr.u64 q0, q0, #8 + vmov.i32 d4, #19 + vmov.i32 d5, #38 + add r6, sp, #480 + vst1.8 {d2-d3}, [r6, : 128]! + vst1.8 {d0-d1}, [r6, : 128]! + vst1.8 {d4-d5}, [r6, : 128] + add r6, r3, #0 + vmov.i32 q2, #0 + vst1.8 {d4-d5}, [r6, : 128]! + vst1.8 {d4-d5}, [r6, : 128]! + vst1.8 d4, [r6, : 64] + add r6, r3, #0 + movw r7, #960 + sub r7, r7, #2 + neg r7, r7 + sub r7, r7, r7, LSL #7 + str r7, [r6] + add r6, sp, #672 + vld1.8 {d4-d5}, [r1]! + vld1.8 {d6-d7}, [r1] + vst1.8 {d4-d5}, [r6, : 128]! + vst1.8 {d6-d7}, [r6, : 128] + sub r1, r6, #16 + ldrb r6, [r1] + and r6, r6, #248 + strb r6, [r1] + ldrb r6, [r1, #31] + and r6, r6, #127 + orr r6, r6, #64 + strb r6, [r1, #31] + vmov.i64 q2, #0xffffffff + vshr.u64 q3, q2, #7 + vshr.u64 q2, q2, #6 + vld1.8 {d8}, [r2] + vld1.8 {d10}, [r2] + add r2, r2, #6 + vld1.8 {d12}, [r2] + vld1.8 {d14}, [r2] + add r2, r2, #6 + vld1.8 {d16}, [r2] + add r2, r2, #4 + vld1.8 {d18}, [r2] + vld1.8 {d20}, [r2] + add r2, r2, #6 + vld1.8 {d22}, [r2] + add r2, r2, #2 + vld1.8 {d24}, [r2] + vld1.8 {d26}, [r2] + vshr.u64 q5, q5, #26 + vshr.u64 q6, q6, #3 + vshr.u64 q7, q7, #29 + vshr.u64 q8, q8, #6 + vshr.u64 q10, q10, #25 + vshr.u64 q11, q11, #3 + vshr.u64 q12, q12, #12 + vshr.u64 q13, q13, #38 + vand q4, q4, q2 + vand q6, q6, q2 + vand q8, q8, q2 + vand q10, q10, q2 + vand q2, q12, q2 + vand q5, q5, q3 + vand q7, q7, q3 + vand q9, q9, q3 + vand q11, q11, q3 + vand q3, q13, q3 + add r2, r3, #48 + vadd.i64 q12, q4, q1 + vadd.i64 q13, q10, q1 + vshr.s64 q12, q12, #26 + vshr.s64 q13, q13, #26 + vadd.i64 q5, q5, q12 + vshl.i64 q12, q12, #26 + vadd.i64 q14, q5, q0 + vadd.i64 q11, q11, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q11, q0 + vsub.i64 q4, q4, q12 + vshr.s64 q12, q14, #25 + vsub.i64 q10, q10, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q12 + vshl.i64 q12, q12, #25 + vadd.i64 q14, q6, q1 + vadd.i64 q2, q2, q13 + vsub.i64 q5, q5, q12 + vshr.s64 q12, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q1 + vadd.i64 q7, q7, q12 + vshl.i64 q12, q12, #26 + vadd.i64 q15, q7, q0 + vsub.i64 q11, q11, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q12 + vshr.s64 q12, q15, #25 + vadd.i64 q3, q3, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q3, q0 + vadd.i64 q8, q8, q12 + vshl.i64 q12, q12, #25 + vadd.i64 q15, q8, q1 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q7, q12 + vshr.s64 q12, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q9, q9, q12 + vtrn.32 d12, d14 + vshl.i64 q12, q12, #26 + vtrn.32 d13, d15 + vadd.i64 q0, q9, q0 + vadd.i64 q4, q4, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q6, q13, #4 + vsub.i64 q7, q8, q12 + vshr.s64 q0, q0, #25 + vadd.i64 q4, q4, q6 + vadd.i64 q6, q10, q0 + vshl.i64 q0, q0, #25 + vadd.i64 q8, q6, q1 + vadd.i64 q4, q4, q13 + vshl.i64 q10, q13, #25 + vadd.i64 q1, q4, q1 + vsub.i64 q0, q9, q0 + vshr.s64 q8, q8, #26 + vsub.i64 q3, q3, q10 + vtrn.32 d14, d0 + vshr.s64 q1, q1, #26 + vtrn.32 d15, d1 + vadd.i64 q0, q11, q8 + vst1.8 d14, [r2, : 64] + vshl.i64 q7, q8, #26 + vadd.i64 q5, q5, q1 + vtrn.32 d4, d6 + vshl.i64 q1, q1, #26 + vtrn.32 d5, d7 + vsub.i64 q3, q6, q7 + add r2, r2, #16 + vsub.i64 q1, q4, q1 + vst1.8 d4, [r2, : 64] + vtrn.32 d6, d0 + vtrn.32 d7, d1 + sub r2, r2, #8 + vtrn.32 d2, d10 + vtrn.32 d3, d11 + vst1.8 d6, [r2, : 64] + sub r2, r2, #24 + vst1.8 d2, [r2, : 64] + add r2, r3, #96 + vmov.i32 q0, #0 + vmov.i64 d2, #0xff + vmov.i64 d3, #0 + vshr.u32 q1, q1, #7 + vst1.8 {d2-d3}, [r2, : 128]! + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 d0, [r2, : 64] + add r2, r3, #144 + vmov.i32 q0, #0 + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 d0, [r2, : 64] + add r2, r3, #240 + vmov.i32 q0, #0 + vmov.i64 d2, #0xff + vmov.i64 d3, #0 + vshr.u32 q1, q1, #7 + vst1.8 {d2-d3}, [r2, : 128]! + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 d0, [r2, : 64] + add r2, r3, #48 + add r6, r3, #192 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4}, [r2, : 64] + vst1.8 {d0-d1}, [r6, : 128]! + vst1.8 {d2-d3}, [r6, : 128]! + vst1.8 d4, [r6, : 64] +.Lmainloop: + mov r2, r5, LSR #3 + and r6, r5, #7 + ldrb r2, [r1, r2] + mov r2, r2, LSR r6 + and r2, r2, #1 + str r5, [sp, #456] + eor r4, r4, r2 + str r2, [sp, #460] + neg r2, r4 + add r4, r3, #96 + add r5, r3, #192 + add r6, r3, #144 + vld1.8 {d8-d9}, [r4, : 128]! + add r7, r3, #240 + vld1.8 {d10-d11}, [r5, : 128]! + veor q6, q4, q5 + vld1.8 {d14-d15}, [r6, : 128]! + vdup.i32 q8, r2 + vld1.8 {d18-d19}, [r7, : 128]! + veor q10, q7, q9 + vld1.8 {d22-d23}, [r4, : 128]! + vand q6, q6, q8 + vld1.8 {d24-d25}, [r5, : 128]! + vand q10, q10, q8 + vld1.8 {d26-d27}, [r6, : 128]! + veor q4, q4, q6 + vld1.8 {d28-d29}, [r7, : 128]! + veor q5, q5, q6 + vld1.8 {d0}, [r4, : 64] + veor q6, q7, q10 + vld1.8 {d2}, [r5, : 64] + veor q7, q9, q10 + vld1.8 {d4}, [r6, : 64] + veor q9, q11, q12 + vld1.8 {d6}, [r7, : 64] + veor q10, q0, q1 + sub r2, r4, #32 + vand q9, q9, q8 + sub r4, r5, #32 + vand q10, q10, q8 + sub r5, r6, #32 + veor q11, q11, q9 + sub r6, r7, #32 + veor q0, q0, q10 + veor q9, q12, q9 + veor q1, q1, q10 + veor q10, q13, q14 + veor q12, q2, q3 + vand q10, q10, q8 + vand q8, q12, q8 + veor q12, q13, q10 + veor q2, q2, q8 + veor q10, q14, q10 + veor q3, q3, q8 + vadd.i32 q8, q4, q6 + vsub.i32 q4, q4, q6 + vst1.8 {d16-d17}, [r2, : 128]! + vadd.i32 q6, q11, q12 + vst1.8 {d8-d9}, [r5, : 128]! + vsub.i32 q4, q11, q12 + vst1.8 {d12-d13}, [r2, : 128]! + vadd.i32 q6, q0, q2 + vst1.8 {d8-d9}, [r5, : 128]! + vsub.i32 q0, q0, q2 + vst1.8 d12, [r2, : 64] + vadd.i32 q2, q5, q7 + vst1.8 d0, [r5, : 64] + vsub.i32 q0, q5, q7 + vst1.8 {d4-d5}, [r4, : 128]! + vadd.i32 q2, q9, q10 + vst1.8 {d0-d1}, [r6, : 128]! + vsub.i32 q0, q9, q10 + vst1.8 {d4-d5}, [r4, : 128]! + vadd.i32 q2, q1, q3 + vst1.8 {d0-d1}, [r6, : 128]! + vsub.i32 q0, q1, q3 + vst1.8 d4, [r4, : 64] + vst1.8 d0, [r6, : 64] + add r2, sp, #512 + add r4, r3, #96 + add r5, r3, #144 + vld1.8 {d0-d1}, [r2, : 128] + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4-d5}, [r5, : 128]! + vzip.i32 q1, q2 + vld1.8 {d6-d7}, [r4, : 128]! + vld1.8 {d8-d9}, [r5, : 128]! + vshl.i32 q5, q1, #1 + vzip.i32 q3, q4 + vshl.i32 q6, q2, #1 + vld1.8 {d14}, [r4, : 64] + vshl.i32 q8, q3, #1 + vld1.8 {d15}, [r5, : 64] + vshl.i32 q9, q4, #1 + vmul.i32 d21, d7, d1 + vtrn.32 d14, d15 + vmul.i32 q11, q4, q0 + vmul.i32 q0, q7, q0 + vmull.s32 q12, d2, d2 + vmlal.s32 q12, d11, d1 + vmlal.s32 q12, d12, d0 + vmlal.s32 q12, d13, d23 + vmlal.s32 q12, d16, d22 + vmlal.s32 q12, d7, d21 + vmull.s32 q10, d2, d11 + vmlal.s32 q10, d4, d1 + vmlal.s32 q10, d13, d0 + vmlal.s32 q10, d6, d23 + vmlal.s32 q10, d17, d22 + vmull.s32 q13, d10, d4 + vmlal.s32 q13, d11, d3 + vmlal.s32 q13, d13, d1 + vmlal.s32 q13, d16, d0 + vmlal.s32 q13, d17, d23 + vmlal.s32 q13, d8, d22 + vmull.s32 q1, d10, d5 + vmlal.s32 q1, d11, d4 + vmlal.s32 q1, d6, d1 + vmlal.s32 q1, d17, d0 + vmlal.s32 q1, d8, d23 + vmull.s32 q14, d10, d6 + vmlal.s32 q14, d11, d13 + vmlal.s32 q14, d4, d4 + vmlal.s32 q14, d17, d1 + vmlal.s32 q14, d18, d0 + vmlal.s32 q14, d9, d23 + vmull.s32 q11, d10, d7 + vmlal.s32 q11, d11, d6 + vmlal.s32 q11, d12, d5 + vmlal.s32 q11, d8, d1 + vmlal.s32 q11, d19, d0 + vmull.s32 q15, d10, d8 + vmlal.s32 q15, d11, d17 + vmlal.s32 q15, d12, d6 + vmlal.s32 q15, d13, d5 + vmlal.s32 q15, d19, d1 + vmlal.s32 q15, d14, d0 + vmull.s32 q2, d10, d9 + vmlal.s32 q2, d11, d8 + vmlal.s32 q2, d12, d7 + vmlal.s32 q2, d13, d6 + vmlal.s32 q2, d14, d1 + vmull.s32 q0, d15, d1 + vmlal.s32 q0, d10, d14 + vmlal.s32 q0, d11, d19 + vmlal.s32 q0, d12, d8 + vmlal.s32 q0, d13, d17 + vmlal.s32 q0, d6, d6 + add r2, sp, #480 + vld1.8 {d18-d19}, [r2, : 128]! + vmull.s32 q3, d16, d7 + vmlal.s32 q3, d10, d15 + vmlal.s32 q3, d11, d14 + vmlal.s32 q3, d12, d9 + vmlal.s32 q3, d13, d8 + vld1.8 {d8-d9}, [r2, : 128] + vadd.i64 q5, q12, q9 + vadd.i64 q6, q15, q9 + vshr.s64 q5, q5, #26 + vshr.s64 q6, q6, #26 + vadd.i64 q7, q10, q5 + vshl.i64 q5, q5, #26 + vadd.i64 q8, q7, q4 + vadd.i64 q2, q2, q6 + vshl.i64 q6, q6, #26 + vadd.i64 q10, q2, q4 + vsub.i64 q5, q12, q5 + vshr.s64 q8, q8, #25 + vsub.i64 q6, q15, q6 + vshr.s64 q10, q10, #25 + vadd.i64 q12, q13, q8 + vshl.i64 q8, q8, #25 + vadd.i64 q13, q12, q9 + vadd.i64 q0, q0, q10 + vsub.i64 q7, q7, q8 + vshr.s64 q8, q13, #26 + vshl.i64 q10, q10, #25 + vadd.i64 q13, q0, q9 + vadd.i64 q1, q1, q8 + vshl.i64 q8, q8, #26 + vadd.i64 q15, q1, q4 + vsub.i64 q2, q2, q10 + vshr.s64 q10, q13, #26 + vsub.i64 q8, q12, q8 + vshr.s64 q12, q15, #25 + vadd.i64 q3, q3, q10 + vshl.i64 q10, q10, #26 + vadd.i64 q13, q3, q4 + vadd.i64 q14, q14, q12 + add r2, r3, #288 + vshl.i64 q12, q12, #25 + add r4, r3, #336 + vadd.i64 q15, q14, q9 + add r2, r2, #8 + vsub.i64 q0, q0, q10 + add r4, r4, #8 + vshr.s64 q10, q13, #25 + vsub.i64 q1, q1, q12 + vshr.s64 q12, q15, #26 + vadd.i64 q13, q10, q10 + vadd.i64 q11, q11, q12 + vtrn.32 d16, d2 + vshl.i64 q12, q12, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q11, q4 + vadd.i64 q4, q5, q13 + vst1.8 d16, [r2, : 64]! + vshl.i64 q5, q10, #4 + vst1.8 d17, [r4, : 64]! + vsub.i64 q8, q14, q12 + vshr.s64 q1, q1, #25 + vadd.i64 q4, q4, q5 + vadd.i64 q5, q6, q1 + vshl.i64 q1, q1, #25 + vadd.i64 q6, q5, q9 + vadd.i64 q4, q4, q10 + vshl.i64 q10, q10, #25 + vadd.i64 q9, q4, q9 + vsub.i64 q1, q11, q1 + vshr.s64 q6, q6, #26 + vsub.i64 q3, q3, q10 + vtrn.32 d16, d2 + vshr.s64 q9, q9, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q2, q6 + vst1.8 d16, [r2, : 64] + vshl.i64 q2, q6, #26 + vst1.8 d17, [r4, : 64] + vadd.i64 q6, q7, q9 + vtrn.32 d0, d6 + vshl.i64 q7, q9, #26 + vtrn.32 d1, d7 + vsub.i64 q2, q5, q2 + add r2, r2, #16 + vsub.i64 q3, q4, q7 + vst1.8 d0, [r2, : 64] + add r4, r4, #16 + vst1.8 d1, [r4, : 64] + vtrn.32 d4, d2 + vtrn.32 d5, d3 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d6, d12 + vtrn.32 d7, d13 + vst1.8 d4, [r2, : 64] + vst1.8 d5, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d6, [r2, : 64] + vst1.8 d7, [r4, : 64] + add r2, r3, #240 + add r4, r3, #96 + vld1.8 {d0-d1}, [r4, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4}, [r4, : 64] + add r4, r3, #144 + vld1.8 {d6-d7}, [r4, : 128]! + vtrn.32 q0, q3 + vld1.8 {d8-d9}, [r4, : 128]! + vshl.i32 q5, q0, #4 + vtrn.32 q1, q4 + vshl.i32 q6, q3, #4 + vadd.i32 q5, q5, q0 + vadd.i32 q6, q6, q3 + vshl.i32 q7, q1, #4 + vld1.8 {d5}, [r4, : 64] + vshl.i32 q8, q4, #4 + vtrn.32 d4, d5 + vadd.i32 q7, q7, q1 + vadd.i32 q8, q8, q4 + vld1.8 {d18-d19}, [r2, : 128]! + vshl.i32 q10, q2, #4 + vld1.8 {d22-d23}, [r2, : 128]! + vadd.i32 q10, q10, q2 + vld1.8 {d24}, [r2, : 64] + vadd.i32 q5, q5, q0 + add r2, r3, #192 + vld1.8 {d26-d27}, [r2, : 128]! + vadd.i32 q6, q6, q3 + vld1.8 {d28-d29}, [r2, : 128]! + vadd.i32 q8, q8, q4 + vld1.8 {d25}, [r2, : 64] + vadd.i32 q10, q10, q2 + vtrn.32 q9, q13 + vadd.i32 q7, q7, q1 + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 + add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 + vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 + vmlal.s32 q8, d27, d3 + vmlal.s32 q8, d22, d8 + vmlal.s32 q8, d28, d2 + vmlal.s32 q8, d23, d7 + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 + vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 + vmlal.s32 q2, d19, d3 + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 + vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 + vmlal.s32 q7, d27, d2 + vmlal.s32 q7, d22, d7 + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 + vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 + vst1.8 {d16-d17}, [r2, : 128] + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 + vmlal.s32 q4, d27, d1 + vmlal.s32 q4, d22, d6 + vmlal.s32 q4, d28, d0 + vmull.s32 q8, d18, d7 + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 + add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 + vmlal.s32 q4, d23, d21 + vmlal.s32 q4, d29, d20 + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 + add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 + add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 + add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 + vst1.8 {d8-d9}, [r2, : 128] + add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 + vmull.s32 q1, d18, d2 + vmlal.s32 q1, d19, d1 + vmlal.s32 q1, d22, d0 + vmlal.s32 q1, d24, d27 + vmlal.s32 q1, d23, d20 + vmlal.s32 q1, d12, d7 + vmlal.s32 q1, d13, d6 + vmull.s32 q6, d18, d1 + vmlal.s32 q6, d19, d0 + vmlal.s32 q6, d23, d27 + vmlal.s32 q6, d22, d20 + vmlal.s32 q6, d24, d26 + vmull.s32 q0, d18, d0 + vmlal.s32 q0, d22, d27 + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 + add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 + vmlal.s32 q5, d18, d6 + vmlal.s32 q1, d18, d21 + vmlal.s32 q0, d18, d28 + vmlal.s32 q6, d18, d29 + vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 + add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] + add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 + add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 + add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 + add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 + add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 + vadd.i64 q14, q5, q11 + vmlal.s32 q6, d30, d9 + vshr.s64 q4, q13, #26 + vshr.s64 q13, q14, #26 + vadd.i64 q7, q7, q4 + vshl.i64 q4, q4, #26 + vadd.i64 q14, q7, q3 + vadd.i64 q9, q9, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q9, q3 + vsub.i64 q0, q0, q4 + vshr.s64 q4, q14, #25 + vsub.i64 q5, q5, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q4 + vshl.i64 q4, q4, #25 + vadd.i64 q14, q6, q11 + vadd.i64 q2, q2, q13 + vsub.i64 q4, q7, q4 + vshr.s64 q7, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q11 + vadd.i64 q8, q8, q7 + vshl.i64 q7, q7, #26 + vadd.i64 q15, q8, q3 + vsub.i64 q9, q9, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q7 + vshr.s64 q7, q15, #25 + vadd.i64 q10, q10, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q10, q3 + vadd.i64 q1, q1, q7 + add r2, r3, #144 + vshl.i64 q7, q7, #25 + add r4, r3, #96 + vadd.i64 q15, q1, q11 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + add r4, r4, #8 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q8, q7 + vshr.s64 q8, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q12, q12, q8 + vtrn.32 d12, d14 + vshl.i64 q8, q8, #26 + vtrn.32 d13, d15 + vadd.i64 q3, q12, q3 + vadd.i64 q0, q0, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q7, q13, #4 + vst1.8 d13, [r4, : 64]! + vsub.i64 q1, q1, q8 + vshr.s64 q3, q3, #25 + vadd.i64 q0, q0, q7 + vadd.i64 q5, q5, q3 + vshl.i64 q3, q3, #25 + vadd.i64 q6, q5, q11 + vadd.i64 q0, q0, q13 + vshl.i64 q7, q13, #25 + vadd.i64 q8, q0, q11 + vsub.i64 q3, q12, q3 + vshr.s64 q6, q6, #26 + vsub.i64 q7, q10, q7 + vtrn.32 d2, d6 + vshr.s64 q8, q8, #26 + vtrn.32 d3, d7 + vadd.i64 q3, q9, q6 + vst1.8 d2, [r2, : 64] + vshl.i64 q6, q6, #26 + vst1.8 d3, [r4, : 64] + vadd.i64 q1, q4, q8 + vtrn.32 d4, d14 + vshl.i64 q4, q8, #26 + vtrn.32 d5, d15 + vsub.i64 q5, q5, q6 + add r2, r2, #16 + vsub.i64 q0, q0, q4 + vst1.8 d4, [r2, : 64] + add r4, r4, #16 + vst1.8 d5, [r4, : 64] + vtrn.32 d10, d6 + vtrn.32 d11, d7 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vst1.8 d10, [r2, : 64] + vst1.8 d11, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] + add r2, r3, #288 + add r4, r3, #336 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vsub.i32 q0, q0, q1 + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4-d5}, [r4, : 128]! + vsub.i32 q1, q1, q2 + add r5, r3, #240 + vld1.8 {d4}, [r2, : 64] + vld1.8 {d6}, [r4, : 64] + vsub.i32 q2, q2, q3 + vst1.8 {d0-d1}, [r5, : 128]! + vst1.8 {d2-d3}, [r5, : 128]! + vst1.8 d4, [r5, : 64] + add r2, r3, #144 + add r4, r3, #96 + add r5, r3, #144 + add r6, r3, #192 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vsub.i32 q2, q0, q1 + vadd.i32 q0, q0, q1 + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d6-d7}, [r4, : 128]! + vsub.i32 q4, q1, q3 + vadd.i32 q1, q1, q3 + vld1.8 {d6}, [r2, : 64] + vld1.8 {d10}, [r4, : 64] + vsub.i32 q6, q3, q5 + vadd.i32 q3, q3, q5 + vst1.8 {d4-d5}, [r5, : 128]! + vst1.8 {d0-d1}, [r6, : 128]! + vst1.8 {d8-d9}, [r5, : 128]! + vst1.8 {d2-d3}, [r6, : 128]! + vst1.8 d12, [r5, : 64] + vst1.8 d6, [r6, : 64] + add r2, r3, #0 + add r4, r3, #240 + vld1.8 {d0-d1}, [r4, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4}, [r4, : 64] + add r4, r3, #336 + vld1.8 {d6-d7}, [r4, : 128]! + vtrn.32 q0, q3 + vld1.8 {d8-d9}, [r4, : 128]! + vshl.i32 q5, q0, #4 + vtrn.32 q1, q4 + vshl.i32 q6, q3, #4 + vadd.i32 q5, q5, q0 + vadd.i32 q6, q6, q3 + vshl.i32 q7, q1, #4 + vld1.8 {d5}, [r4, : 64] + vshl.i32 q8, q4, #4 + vtrn.32 d4, d5 + vadd.i32 q7, q7, q1 + vadd.i32 q8, q8, q4 + vld1.8 {d18-d19}, [r2, : 128]! + vshl.i32 q10, q2, #4 + vld1.8 {d22-d23}, [r2, : 128]! + vadd.i32 q10, q10, q2 + vld1.8 {d24}, [r2, : 64] + vadd.i32 q5, q5, q0 + add r2, r3, #288 + vld1.8 {d26-d27}, [r2, : 128]! + vadd.i32 q6, q6, q3 + vld1.8 {d28-d29}, [r2, : 128]! + vadd.i32 q8, q8, q4 + vld1.8 {d25}, [r2, : 64] + vadd.i32 q10, q10, q2 + vtrn.32 q9, q13 + vadd.i32 q7, q7, q1 + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 + add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 + vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 + vmlal.s32 q8, d27, d3 + vmlal.s32 q8, d22, d8 + vmlal.s32 q8, d28, d2 + vmlal.s32 q8, d23, d7 + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 + vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 + vmlal.s32 q2, d19, d3 + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 + vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 + vmlal.s32 q7, d27, d2 + vmlal.s32 q7, d22, d7 + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 + vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 + vmlal.s32 q4, d27, d1 + vmlal.s32 q4, d22, d6 + vmlal.s32 q4, d28, d0 + vmull.s32 q8, d18, d7 + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 + add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 + vmlal.s32 q4, d23, d21 + vmlal.s32 q4, d29, d20 + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 + add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 + add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 + add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 + vst1.8 {d8-d9}, [r2, : 128] + add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 + vmull.s32 q1, d18, d2 + vmlal.s32 q1, d19, d1 + vmlal.s32 q1, d22, d0 + vmlal.s32 q1, d24, d27 + vmlal.s32 q1, d23, d20 + vmlal.s32 q1, d12, d7 + vmlal.s32 q1, d13, d6 + vmull.s32 q6, d18, d1 + vmlal.s32 q6, d19, d0 + vmlal.s32 q6, d23, d27 + vmlal.s32 q6, d22, d20 + vmlal.s32 q6, d24, d26 + vmull.s32 q0, d18, d0 + vmlal.s32 q0, d22, d27 + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 + add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 + vmlal.s32 q5, d18, d6 + vmlal.s32 q1, d18, d21 + vmlal.s32 q0, d18, d28 + vmlal.s32 q6, d18, d29 + vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 + add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] + add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 + add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 + add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 + add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 + add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 + vadd.i64 q14, q5, q11 + vmlal.s32 q6, d30, d9 + vshr.s64 q4, q13, #26 + vshr.s64 q13, q14, #26 + vadd.i64 q7, q7, q4 + vshl.i64 q4, q4, #26 + vadd.i64 q14, q7, q3 + vadd.i64 q9, q9, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q9, q3 + vsub.i64 q0, q0, q4 + vshr.s64 q4, q14, #25 + vsub.i64 q5, q5, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q4 + vshl.i64 q4, q4, #25 + vadd.i64 q14, q6, q11 + vadd.i64 q2, q2, q13 + vsub.i64 q4, q7, q4 + vshr.s64 q7, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q11 + vadd.i64 q8, q8, q7 + vshl.i64 q7, q7, #26 + vadd.i64 q15, q8, q3 + vsub.i64 q9, q9, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q7 + vshr.s64 q7, q15, #25 + vadd.i64 q10, q10, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q10, q3 + vadd.i64 q1, q1, q7 + add r2, r3, #288 + vshl.i64 q7, q7, #25 + add r4, r3, #96 + vadd.i64 q15, q1, q11 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + add r4, r4, #8 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q8, q7 + vshr.s64 q8, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q12, q12, q8 + vtrn.32 d12, d14 + vshl.i64 q8, q8, #26 + vtrn.32 d13, d15 + vadd.i64 q3, q12, q3 + vadd.i64 q0, q0, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q7, q13, #4 + vst1.8 d13, [r4, : 64]! + vsub.i64 q1, q1, q8 + vshr.s64 q3, q3, #25 + vadd.i64 q0, q0, q7 + vadd.i64 q5, q5, q3 + vshl.i64 q3, q3, #25 + vadd.i64 q6, q5, q11 + vadd.i64 q0, q0, q13 + vshl.i64 q7, q13, #25 + vadd.i64 q8, q0, q11 + vsub.i64 q3, q12, q3 + vshr.s64 q6, q6, #26 + vsub.i64 q7, q10, q7 + vtrn.32 d2, d6 + vshr.s64 q8, q8, #26 + vtrn.32 d3, d7 + vadd.i64 q3, q9, q6 + vst1.8 d2, [r2, : 64] + vshl.i64 q6, q6, #26 + vst1.8 d3, [r4, : 64] + vadd.i64 q1, q4, q8 + vtrn.32 d4, d14 + vshl.i64 q4, q8, #26 + vtrn.32 d5, d15 + vsub.i64 q5, q5, q6 + add r2, r2, #16 + vsub.i64 q0, q0, q4 + vst1.8 d4, [r2, : 64] + add r4, r4, #16 + vst1.8 d5, [r4, : 64] + vtrn.32 d10, d6 + vtrn.32 d11, d7 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vst1.8 d10, [r2, : 64] + vst1.8 d11, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] + add r2, sp, #512 + add r4, r3, #144 + add r5, r3, #192 + vld1.8 {d0-d1}, [r2, : 128] + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4-d5}, [r5, : 128]! + vzip.i32 q1, q2 + vld1.8 {d6-d7}, [r4, : 128]! + vld1.8 {d8-d9}, [r5, : 128]! + vshl.i32 q5, q1, #1 + vzip.i32 q3, q4 + vshl.i32 q6, q2, #1 + vld1.8 {d14}, [r4, : 64] + vshl.i32 q8, q3, #1 + vld1.8 {d15}, [r5, : 64] + vshl.i32 q9, q4, #1 + vmul.i32 d21, d7, d1 + vtrn.32 d14, d15 + vmul.i32 q11, q4, q0 + vmul.i32 q0, q7, q0 + vmull.s32 q12, d2, d2 + vmlal.s32 q12, d11, d1 + vmlal.s32 q12, d12, d0 + vmlal.s32 q12, d13, d23 + vmlal.s32 q12, d16, d22 + vmlal.s32 q12, d7, d21 + vmull.s32 q10, d2, d11 + vmlal.s32 q10, d4, d1 + vmlal.s32 q10, d13, d0 + vmlal.s32 q10, d6, d23 + vmlal.s32 q10, d17, d22 + vmull.s32 q13, d10, d4 + vmlal.s32 q13, d11, d3 + vmlal.s32 q13, d13, d1 + vmlal.s32 q13, d16, d0 + vmlal.s32 q13, d17, d23 + vmlal.s32 q13, d8, d22 + vmull.s32 q1, d10, d5 + vmlal.s32 q1, d11, d4 + vmlal.s32 q1, d6, d1 + vmlal.s32 q1, d17, d0 + vmlal.s32 q1, d8, d23 + vmull.s32 q14, d10, d6 + vmlal.s32 q14, d11, d13 + vmlal.s32 q14, d4, d4 + vmlal.s32 q14, d17, d1 + vmlal.s32 q14, d18, d0 + vmlal.s32 q14, d9, d23 + vmull.s32 q11, d10, d7 + vmlal.s32 q11, d11, d6 + vmlal.s32 q11, d12, d5 + vmlal.s32 q11, d8, d1 + vmlal.s32 q11, d19, d0 + vmull.s32 q15, d10, d8 + vmlal.s32 q15, d11, d17 + vmlal.s32 q15, d12, d6 + vmlal.s32 q15, d13, d5 + vmlal.s32 q15, d19, d1 + vmlal.s32 q15, d14, d0 + vmull.s32 q2, d10, d9 + vmlal.s32 q2, d11, d8 + vmlal.s32 q2, d12, d7 + vmlal.s32 q2, d13, d6 + vmlal.s32 q2, d14, d1 + vmull.s32 q0, d15, d1 + vmlal.s32 q0, d10, d14 + vmlal.s32 q0, d11, d19 + vmlal.s32 q0, d12, d8 + vmlal.s32 q0, d13, d17 + vmlal.s32 q0, d6, d6 + add r2, sp, #480 + vld1.8 {d18-d19}, [r2, : 128]! + vmull.s32 q3, d16, d7 + vmlal.s32 q3, d10, d15 + vmlal.s32 q3, d11, d14 + vmlal.s32 q3, d12, d9 + vmlal.s32 q3, d13, d8 + vld1.8 {d8-d9}, [r2, : 128] + vadd.i64 q5, q12, q9 + vadd.i64 q6, q15, q9 + vshr.s64 q5, q5, #26 + vshr.s64 q6, q6, #26 + vadd.i64 q7, q10, q5 + vshl.i64 q5, q5, #26 + vadd.i64 q8, q7, q4 + vadd.i64 q2, q2, q6 + vshl.i64 q6, q6, #26 + vadd.i64 q10, q2, q4 + vsub.i64 q5, q12, q5 + vshr.s64 q8, q8, #25 + vsub.i64 q6, q15, q6 + vshr.s64 q10, q10, #25 + vadd.i64 q12, q13, q8 + vshl.i64 q8, q8, #25 + vadd.i64 q13, q12, q9 + vadd.i64 q0, q0, q10 + vsub.i64 q7, q7, q8 + vshr.s64 q8, q13, #26 + vshl.i64 q10, q10, #25 + vadd.i64 q13, q0, q9 + vadd.i64 q1, q1, q8 + vshl.i64 q8, q8, #26 + vadd.i64 q15, q1, q4 + vsub.i64 q2, q2, q10 + vshr.s64 q10, q13, #26 + vsub.i64 q8, q12, q8 + vshr.s64 q12, q15, #25 + vadd.i64 q3, q3, q10 + vshl.i64 q10, q10, #26 + vadd.i64 q13, q3, q4 + vadd.i64 q14, q14, q12 + add r2, r3, #144 + vshl.i64 q12, q12, #25 + add r4, r3, #192 + vadd.i64 q15, q14, q9 + add r2, r2, #8 + vsub.i64 q0, q0, q10 + add r4, r4, #8 + vshr.s64 q10, q13, #25 + vsub.i64 q1, q1, q12 + vshr.s64 q12, q15, #26 + vadd.i64 q13, q10, q10 + vadd.i64 q11, q11, q12 + vtrn.32 d16, d2 + vshl.i64 q12, q12, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q11, q4 + vadd.i64 q4, q5, q13 + vst1.8 d16, [r2, : 64]! + vshl.i64 q5, q10, #4 + vst1.8 d17, [r4, : 64]! + vsub.i64 q8, q14, q12 + vshr.s64 q1, q1, #25 + vadd.i64 q4, q4, q5 + vadd.i64 q5, q6, q1 + vshl.i64 q1, q1, #25 + vadd.i64 q6, q5, q9 + vadd.i64 q4, q4, q10 + vshl.i64 q10, q10, #25 + vadd.i64 q9, q4, q9 + vsub.i64 q1, q11, q1 + vshr.s64 q6, q6, #26 + vsub.i64 q3, q3, q10 + vtrn.32 d16, d2 + vshr.s64 q9, q9, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q2, q6 + vst1.8 d16, [r2, : 64] + vshl.i64 q2, q6, #26 + vst1.8 d17, [r4, : 64] + vadd.i64 q6, q7, q9 + vtrn.32 d0, d6 + vshl.i64 q7, q9, #26 + vtrn.32 d1, d7 + vsub.i64 q2, q5, q2 + add r2, r2, #16 + vsub.i64 q3, q4, q7 + vst1.8 d0, [r2, : 64] + add r4, r4, #16 + vst1.8 d1, [r4, : 64] + vtrn.32 d4, d2 + vtrn.32 d5, d3 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d6, d12 + vtrn.32 d7, d13 + vst1.8 d4, [r2, : 64] + vst1.8 d5, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d6, [r2, : 64] + vst1.8 d7, [r4, : 64] + add r2, r3, #336 + add r4, r3, #288 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vadd.i32 q0, q0, q1 + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4-d5}, [r4, : 128]! + vadd.i32 q1, q1, q2 + add r5, r3, #288 + vld1.8 {d4}, [r2, : 64] + vld1.8 {d6}, [r4, : 64] + vadd.i32 q2, q2, q3 + vst1.8 {d0-d1}, [r5, : 128]! + vst1.8 {d2-d3}, [r5, : 128]! + vst1.8 d4, [r5, : 64] + add r2, r3, #48 + add r4, r3, #144 + vld1.8 {d0-d1}, [r4, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4}, [r4, : 64] + add r4, r3, #288 + vld1.8 {d6-d7}, [r4, : 128]! + vtrn.32 q0, q3 + vld1.8 {d8-d9}, [r4, : 128]! + vshl.i32 q5, q0, #4 + vtrn.32 q1, q4 + vshl.i32 q6, q3, #4 + vadd.i32 q5, q5, q0 + vadd.i32 q6, q6, q3 + vshl.i32 q7, q1, #4 + vld1.8 {d5}, [r4, : 64] + vshl.i32 q8, q4, #4 + vtrn.32 d4, d5 + vadd.i32 q7, q7, q1 + vadd.i32 q8, q8, q4 + vld1.8 {d18-d19}, [r2, : 128]! + vshl.i32 q10, q2, #4 + vld1.8 {d22-d23}, [r2, : 128]! + vadd.i32 q10, q10, q2 + vld1.8 {d24}, [r2, : 64] + vadd.i32 q5, q5, q0 + add r2, r3, #240 + vld1.8 {d26-d27}, [r2, : 128]! + vadd.i32 q6, q6, q3 + vld1.8 {d28-d29}, [r2, : 128]! + vadd.i32 q8, q8, q4 + vld1.8 {d25}, [r2, : 64] + vadd.i32 q10, q10, q2 + vtrn.32 q9, q13 + vadd.i32 q7, q7, q1 + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 + add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 + vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 + vmlal.s32 q8, d27, d3 + vmlal.s32 q8, d22, d8 + vmlal.s32 q8, d28, d2 + vmlal.s32 q8, d23, d7 + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 + vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 + vmlal.s32 q2, d19, d3 + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 + vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 + vmlal.s32 q7, d27, d2 + vmlal.s32 q7, d22, d7 + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 + vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 + vmlal.s32 q4, d27, d1 + vmlal.s32 q4, d22, d6 + vmlal.s32 q4, d28, d0 + vmull.s32 q8, d18, d7 + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 + add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 + vmlal.s32 q4, d23, d21 + vmlal.s32 q4, d29, d20 + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 + add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 + add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 + add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 + vst1.8 {d8-d9}, [r2, : 128] + add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 + vmull.s32 q1, d18, d2 + vmlal.s32 q1, d19, d1 + vmlal.s32 q1, d22, d0 + vmlal.s32 q1, d24, d27 + vmlal.s32 q1, d23, d20 + vmlal.s32 q1, d12, d7 + vmlal.s32 q1, d13, d6 + vmull.s32 q6, d18, d1 + vmlal.s32 q6, d19, d0 + vmlal.s32 q6, d23, d27 + vmlal.s32 q6, d22, d20 + vmlal.s32 q6, d24, d26 + vmull.s32 q0, d18, d0 + vmlal.s32 q0, d22, d27 + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 + add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 + vmlal.s32 q5, d18, d6 + vmlal.s32 q1, d18, d21 + vmlal.s32 q0, d18, d28 + vmlal.s32 q6, d18, d29 + vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 + add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] + add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 + add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 + add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 + add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 + add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 + vadd.i64 q14, q5, q11 + vmlal.s32 q6, d30, d9 + vshr.s64 q4, q13, #26 + vshr.s64 q13, q14, #26 + vadd.i64 q7, q7, q4 + vshl.i64 q4, q4, #26 + vadd.i64 q14, q7, q3 + vadd.i64 q9, q9, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q9, q3 + vsub.i64 q0, q0, q4 + vshr.s64 q4, q14, #25 + vsub.i64 q5, q5, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q4 + vshl.i64 q4, q4, #25 + vadd.i64 q14, q6, q11 + vadd.i64 q2, q2, q13 + vsub.i64 q4, q7, q4 + vshr.s64 q7, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q11 + vadd.i64 q8, q8, q7 + vshl.i64 q7, q7, #26 + vadd.i64 q15, q8, q3 + vsub.i64 q9, q9, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q7 + vshr.s64 q7, q15, #25 + vadd.i64 q10, q10, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q10, q3 + vadd.i64 q1, q1, q7 + add r2, r3, #240 + vshl.i64 q7, q7, #25 + add r4, r3, #144 + vadd.i64 q15, q1, q11 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + add r4, r4, #8 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q8, q7 + vshr.s64 q8, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q12, q12, q8 + vtrn.32 d12, d14 + vshl.i64 q8, q8, #26 + vtrn.32 d13, d15 + vadd.i64 q3, q12, q3 + vadd.i64 q0, q0, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q7, q13, #4 + vst1.8 d13, [r4, : 64]! + vsub.i64 q1, q1, q8 + vshr.s64 q3, q3, #25 + vadd.i64 q0, q0, q7 + vadd.i64 q5, q5, q3 + vshl.i64 q3, q3, #25 + vadd.i64 q6, q5, q11 + vadd.i64 q0, q0, q13 + vshl.i64 q7, q13, #25 + vadd.i64 q8, q0, q11 + vsub.i64 q3, q12, q3 + vshr.s64 q6, q6, #26 + vsub.i64 q7, q10, q7 + vtrn.32 d2, d6 + vshr.s64 q8, q8, #26 + vtrn.32 d3, d7 + vadd.i64 q3, q9, q6 + vst1.8 d2, [r2, : 64] + vshl.i64 q6, q6, #26 + vst1.8 d3, [r4, : 64] + vadd.i64 q1, q4, q8 + vtrn.32 d4, d14 + vshl.i64 q4, q8, #26 + vtrn.32 d5, d15 + vsub.i64 q5, q5, q6 + add r2, r2, #16 + vsub.i64 q0, q0, q4 + vst1.8 d4, [r2, : 64] + add r4, r4, #16 + vst1.8 d5, [r4, : 64] + vtrn.32 d10, d6 + vtrn.32 d11, d7 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vst1.8 d10, [r2, : 64] + vst1.8 d11, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] + ldr r2, [sp, #456] + ldr r4, [sp, #460] + subs r5, r2, #1 + bge .Lmainloop + add r1, r3, #144 + add r2, r3, #336 + vld1.8 {d0-d1}, [r1, : 128]! + vld1.8 {d2-d3}, [r1, : 128]! + vld1.8 {d4}, [r1, : 64] + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 {d2-d3}, [r2, : 128]! + vst1.8 d4, [r2, : 64] + movw r1, #0 +.Linvertloop: + add r2, r3, #144 + movw r4, #0 + movw r5, #2 + cmp r1, #1 + moveq r5, #1 + addeq r2, r3, #336 + addeq r4, r3, #48 + cmp r1, #2 + moveq r5, #1 + addeq r2, r3, #48 + cmp r1, #3 + moveq r5, #5 + addeq r4, r3, #336 + cmp r1, #4 + moveq r5, #10 + cmp r1, #5 + moveq r5, #20 + cmp r1, #6 + moveq r5, #10 + addeq r2, r3, #336 + addeq r4, r3, #336 + cmp r1, #7 + moveq r5, #50 + cmp r1, #8 + moveq r5, #100 + cmp r1, #9 + moveq r5, #50 + addeq r2, r3, #336 + cmp r1, #10 + moveq r5, #5 + addeq r2, r3, #48 + cmp r1, #11 + moveq r5, #0 + addeq r2, r3, #96 + add r6, r3, #144 + add r7, r3, #288 + vld1.8 {d0-d1}, [r6, : 128]! + vld1.8 {d2-d3}, [r6, : 128]! + vld1.8 {d4}, [r6, : 64] + vst1.8 {d0-d1}, [r7, : 128]! + vst1.8 {d2-d3}, [r7, : 128]! + vst1.8 d4, [r7, : 64] + cmp r5, #0 + beq .Lskipsquaringloop +.Lsquaringloop: + add r6, r3, #288 + add r7, r3, #288 + add r8, r3, #288 + vmov.i32 q0, #19 + vmov.i32 q1, #0 + vmov.i32 q2, #1 + vzip.i32 q1, q2 + vld1.8 {d4-d5}, [r7, : 128]! + vld1.8 {d6-d7}, [r7, : 128]! + vld1.8 {d9}, [r7, : 64] + vld1.8 {d10-d11}, [r6, : 128]! + add r7, sp, #384 + vld1.8 {d12-d13}, [r6, : 128]! + vmul.i32 q7, q2, q0 + vld1.8 {d8}, [r6, : 64] + vext.32 d17, d11, d10, #1 + vmul.i32 q9, q3, q0 + vext.32 d16, d10, d8, #1 + vshl.u32 q10, q5, q1 + vext.32 d22, d14, d4, #1 + vext.32 d24, d18, d6, #1 + vshl.u32 q13, q6, q1 + vshl.u32 d28, d8, d2 + vrev64.i32 d22, d22 + vmul.i32 d1, d9, d1 + vrev64.i32 d24, d24 + vext.32 d29, d8, d13, #1 + vext.32 d0, d1, d9, #1 + vrev64.i32 d0, d0 + vext.32 d2, d9, d1, #1 + vext.32 d23, d15, d5, #1 + vmull.s32 q4, d20, d4 + vrev64.i32 d23, d23 + vmlal.s32 q4, d21, d1 + vrev64.i32 d2, d2 + vmlal.s32 q4, d26, d19 + vext.32 d3, d5, d15, #1 + vmlal.s32 q4, d27, d18 + vrev64.i32 d3, d3 + vmlal.s32 q4, d28, d15 + vext.32 d14, d12, d11, #1 + vmull.s32 q5, d16, d23 + vext.32 d15, d13, d12, #1 + vmlal.s32 q5, d17, d4 + vst1.8 d8, [r7, : 64]! + vmlal.s32 q5, d14, d1 + vext.32 d12, d9, d8, #0 + vmlal.s32 q5, d15, d19 + vmov.i64 d13, #0 + vmlal.s32 q5, d29, d18 + vext.32 d25, d19, d7, #1 + vmlal.s32 q6, d20, d5 + vrev64.i32 d25, d25 + vmlal.s32 q6, d21, d4 + vst1.8 d11, [r7, : 64]! + vmlal.s32 q6, d26, d1 + vext.32 d9, d10, d10, #0 + vmlal.s32 q6, d27, d19 + vmov.i64 d8, #0 + vmlal.s32 q6, d28, d18 + vmlal.s32 q4, d16, d24 + vmlal.s32 q4, d17, d5 + vmlal.s32 q4, d14, d4 + vst1.8 d12, [r7, : 64]! + vmlal.s32 q4, d15, d1 + vext.32 d10, d13, d12, #0 + vmlal.s32 q4, d29, d19 + vmov.i64 d11, #0 + vmlal.s32 q5, d20, d6 + vmlal.s32 q5, d21, d5 + vmlal.s32 q5, d26, d4 + vext.32 d13, d8, d8, #0 + vmlal.s32 q5, d27, d1 + vmov.i64 d12, #0 + vmlal.s32 q5, d28, d19 + vst1.8 d9, [r7, : 64]! + vmlal.s32 q6, d16, d25 + vmlal.s32 q6, d17, d6 + vst1.8 d10, [r7, : 64] + vmlal.s32 q6, d14, d5 + vext.32 d8, d11, d10, #0 + vmlal.s32 q6, d15, d4 + vmov.i64 d9, #0 + vmlal.s32 q6, d29, d1 + vmlal.s32 q4, d20, d7 + vmlal.s32 q4, d21, d6 + vmlal.s32 q4, d26, d5 + vext.32 d11, d12, d12, #0 + vmlal.s32 q4, d27, d4 + vmov.i64 d10, #0 + vmlal.s32 q4, d28, d1 + vmlal.s32 q5, d16, d0 + sub r6, r7, #32 + vmlal.s32 q5, d17, d7 + vmlal.s32 q5, d14, d6 + vext.32 d30, d9, d8, #0 + vmlal.s32 q5, d15, d5 + vld1.8 {d31}, [r6, : 64]! + vmlal.s32 q5, d29, d4 + vmlal.s32 q15, d20, d0 + vext.32 d0, d6, d18, #1 + vmlal.s32 q15, d21, d25 + vrev64.i32 d0, d0 + vmlal.s32 q15, d26, d24 + vext.32 d1, d7, d19, #1 + vext.32 d7, d10, d10, #0 + vmlal.s32 q15, d27, d23 + vrev64.i32 d1, d1 + vld1.8 {d6}, [r6, : 64] + vmlal.s32 q15, d28, d22 + vmlal.s32 q3, d16, d4 + add r6, r6, #24 + vmlal.s32 q3, d17, d2 + vext.32 d4, d31, d30, #0 + vmov d17, d11 + vmlal.s32 q3, d14, d1 + vext.32 d11, d13, d13, #0 + vext.32 d13, d30, d30, #0 + vmlal.s32 q3, d15, d0 + vext.32 d1, d8, d8, #0 + vmlal.s32 q3, d29, d3 + vld1.8 {d5}, [r6, : 64] + sub r6, r6, #16 + vext.32 d10, d6, d6, #0 + vmov.i32 q1, #0xffffffff + vshl.i64 q4, q1, #25 + add r7, sp, #480 + vld1.8 {d14-d15}, [r7, : 128] + vadd.i64 q9, q2, q7 + vshl.i64 q1, q1, #26 + vshr.s64 q10, q9, #26 + vld1.8 {d0}, [r6, : 64]! + vadd.i64 q5, q5, q10 + vand q9, q9, q1 + vld1.8 {d16}, [r6, : 64]! + add r6, sp, #496 + vld1.8 {d20-d21}, [r6, : 128] + vadd.i64 q11, q5, q10 + vsub.i64 q2, q2, q9 + vshr.s64 q9, q11, #25 + vext.32 d12, d5, d4, #0 + vand q11, q11, q4 + vadd.i64 q0, q0, q9 + vmov d19, d7 + vadd.i64 q3, q0, q7 + vsub.i64 q5, q5, q11 + vshr.s64 q11, q3, #26 + vext.32 d18, d11, d10, #0 + vand q3, q3, q1 + vadd.i64 q8, q8, q11 + vadd.i64 q11, q8, q10 + vsub.i64 q0, q0, q3 + vshr.s64 q3, q11, #25 + vand q11, q11, q4 + vadd.i64 q3, q6, q3 + vadd.i64 q6, q3, q7 + vsub.i64 q8, q8, q11 + vshr.s64 q11, q6, #26 + vand q6, q6, q1 + vadd.i64 q9, q9, q11 + vadd.i64 d25, d19, d21 + vsub.i64 q3, q3, q6 + vshr.s64 d23, d25, #25 + vand q4, q12, q4 + vadd.i64 d21, d23, d23 + vshl.i64 d25, d23, #4 + vadd.i64 d21, d21, d23 + vadd.i64 d25, d25, d21 + vadd.i64 d4, d4, d25 + vzip.i32 q0, q8 + vadd.i64 d12, d4, d14 + add r6, r8, #8 + vst1.8 d0, [r6, : 64] + vsub.i64 d19, d19, d9 + add r6, r6, #16 + vst1.8 d16, [r6, : 64] + vshr.s64 d22, d12, #26 + vand q0, q6, q1 + vadd.i64 d10, d10, d22 + vzip.i32 q3, q9 + vsub.i64 d4, d4, d0 + sub r6, r6, #8 + vst1.8 d6, [r6, : 64] + add r6, r6, #16 + vst1.8 d18, [r6, : 64] + vzip.i32 q2, q5 + sub r6, r6, #32 + vst1.8 d4, [r6, : 64] + subs r5, r5, #1 + bhi .Lsquaringloop +.Lskipsquaringloop: + mov r2, r2 + add r5, r3, #288 + add r6, r3, #144 + vmov.i32 q0, #19 + vmov.i32 q1, #0 + vmov.i32 q2, #1 + vzip.i32 q1, q2 + vld1.8 {d4-d5}, [r5, : 128]! + vld1.8 {d6-d7}, [r5, : 128]! + vld1.8 {d9}, [r5, : 64] + vld1.8 {d10-d11}, [r2, : 128]! + add r5, sp, #384 + vld1.8 {d12-d13}, [r2, : 128]! + vmul.i32 q7, q2, q0 + vld1.8 {d8}, [r2, : 64] + vext.32 d17, d11, d10, #1 + vmul.i32 q9, q3, q0 + vext.32 d16, d10, d8, #1 + vshl.u32 q10, q5, q1 + vext.32 d22, d14, d4, #1 + vext.32 d24, d18, d6, #1 + vshl.u32 q13, q6, q1 + vshl.u32 d28, d8, d2 + vrev64.i32 d22, d22 + vmul.i32 d1, d9, d1 + vrev64.i32 d24, d24 + vext.32 d29, d8, d13, #1 + vext.32 d0, d1, d9, #1 + vrev64.i32 d0, d0 + vext.32 d2, d9, d1, #1 + vext.32 d23, d15, d5, #1 + vmull.s32 q4, d20, d4 + vrev64.i32 d23, d23 + vmlal.s32 q4, d21, d1 + vrev64.i32 d2, d2 + vmlal.s32 q4, d26, d19 + vext.32 d3, d5, d15, #1 + vmlal.s32 q4, d27, d18 + vrev64.i32 d3, d3 + vmlal.s32 q4, d28, d15 + vext.32 d14, d12, d11, #1 + vmull.s32 q5, d16, d23 + vext.32 d15, d13, d12, #1 + vmlal.s32 q5, d17, d4 + vst1.8 d8, [r5, : 64]! + vmlal.s32 q5, d14, d1 + vext.32 d12, d9, d8, #0 + vmlal.s32 q5, d15, d19 + vmov.i64 d13, #0 + vmlal.s32 q5, d29, d18 + vext.32 d25, d19, d7, #1 + vmlal.s32 q6, d20, d5 + vrev64.i32 d25, d25 + vmlal.s32 q6, d21, d4 + vst1.8 d11, [r5, : 64]! + vmlal.s32 q6, d26, d1 + vext.32 d9, d10, d10, #0 + vmlal.s32 q6, d27, d19 + vmov.i64 d8, #0 + vmlal.s32 q6, d28, d18 + vmlal.s32 q4, d16, d24 + vmlal.s32 q4, d17, d5 + vmlal.s32 q4, d14, d4 + vst1.8 d12, [r5, : 64]! + vmlal.s32 q4, d15, d1 + vext.32 d10, d13, d12, #0 + vmlal.s32 q4, d29, d19 + vmov.i64 d11, #0 + vmlal.s32 q5, d20, d6 + vmlal.s32 q5, d21, d5 + vmlal.s32 q5, d26, d4 + vext.32 d13, d8, d8, #0 + vmlal.s32 q5, d27, d1 + vmov.i64 d12, #0 + vmlal.s32 q5, d28, d19 + vst1.8 d9, [r5, : 64]! + vmlal.s32 q6, d16, d25 + vmlal.s32 q6, d17, d6 + vst1.8 d10, [r5, : 64] + vmlal.s32 q6, d14, d5 + vext.32 d8, d11, d10, #0 + vmlal.s32 q6, d15, d4 + vmov.i64 d9, #0 + vmlal.s32 q6, d29, d1 + vmlal.s32 q4, d20, d7 + vmlal.s32 q4, d21, d6 + vmlal.s32 q4, d26, d5 + vext.32 d11, d12, d12, #0 + vmlal.s32 q4, d27, d4 + vmov.i64 d10, #0 + vmlal.s32 q4, d28, d1 + vmlal.s32 q5, d16, d0 + sub r2, r5, #32 + vmlal.s32 q5, d17, d7 + vmlal.s32 q5, d14, d6 + vext.32 d30, d9, d8, #0 + vmlal.s32 q5, d15, d5 + vld1.8 {d31}, [r2, : 64]! + vmlal.s32 q5, d29, d4 + vmlal.s32 q15, d20, d0 + vext.32 d0, d6, d18, #1 + vmlal.s32 q15, d21, d25 + vrev64.i32 d0, d0 + vmlal.s32 q15, d26, d24 + vext.32 d1, d7, d19, #1 + vext.32 d7, d10, d10, #0 + vmlal.s32 q15, d27, d23 + vrev64.i32 d1, d1 + vld1.8 {d6}, [r2, : 64] + vmlal.s32 q15, d28, d22 + vmlal.s32 q3, d16, d4 + add r2, r2, #24 + vmlal.s32 q3, d17, d2 + vext.32 d4, d31, d30, #0 + vmov d17, d11 + vmlal.s32 q3, d14, d1 + vext.32 d11, d13, d13, #0 + vext.32 d13, d30, d30, #0 + vmlal.s32 q3, d15, d0 + vext.32 d1, d8, d8, #0 + vmlal.s32 q3, d29, d3 + vld1.8 {d5}, [r2, : 64] + sub r2, r2, #16 + vext.32 d10, d6, d6, #0 + vmov.i32 q1, #0xffffffff + vshl.i64 q4, q1, #25 + add r5, sp, #480 + vld1.8 {d14-d15}, [r5, : 128] + vadd.i64 q9, q2, q7 + vshl.i64 q1, q1, #26 + vshr.s64 q10, q9, #26 + vld1.8 {d0}, [r2, : 64]! + vadd.i64 q5, q5, q10 + vand q9, q9, q1 + vld1.8 {d16}, [r2, : 64]! + add r2, sp, #496 + vld1.8 {d20-d21}, [r2, : 128] + vadd.i64 q11, q5, q10 + vsub.i64 q2, q2, q9 + vshr.s64 q9, q11, #25 + vext.32 d12, d5, d4, #0 + vand q11, q11, q4 + vadd.i64 q0, q0, q9 + vmov d19, d7 + vadd.i64 q3, q0, q7 + vsub.i64 q5, q5, q11 + vshr.s64 q11, q3, #26 + vext.32 d18, d11, d10, #0 + vand q3, q3, q1 + vadd.i64 q8, q8, q11 + vadd.i64 q11, q8, q10 + vsub.i64 q0, q0, q3 + vshr.s64 q3, q11, #25 + vand q11, q11, q4 + vadd.i64 q3, q6, q3 + vadd.i64 q6, q3, q7 + vsub.i64 q8, q8, q11 + vshr.s64 q11, q6, #26 + vand q6, q6, q1 + vadd.i64 q9, q9, q11 + vadd.i64 d25, d19, d21 + vsub.i64 q3, q3, q6 + vshr.s64 d23, d25, #25 + vand q4, q12, q4 + vadd.i64 d21, d23, d23 + vshl.i64 d25, d23, #4 + vadd.i64 d21, d21, d23 + vadd.i64 d25, d25, d21 + vadd.i64 d4, d4, d25 + vzip.i32 q0, q8 + vadd.i64 d12, d4, d14 + add r2, r6, #8 + vst1.8 d0, [r2, : 64] + vsub.i64 d19, d19, d9 + add r2, r2, #16 + vst1.8 d16, [r2, : 64] + vshr.s64 d22, d12, #26 + vand q0, q6, q1 + vadd.i64 d10, d10, d22 + vzip.i32 q3, q9 + vsub.i64 d4, d4, d0 + sub r2, r2, #8 + vst1.8 d6, [r2, : 64] + add r2, r2, #16 + vst1.8 d18, [r2, : 64] + vzip.i32 q2, q5 + sub r2, r2, #32 + vst1.8 d4, [r2, : 64] + cmp r4, #0 + beq .Lskippostcopy + add r2, r3, #144 + mov r4, r4 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4}, [r2, : 64] + vst1.8 {d0-d1}, [r4, : 128]! + vst1.8 {d2-d3}, [r4, : 128]! + vst1.8 d4, [r4, : 64] +.Lskippostcopy: + cmp r1, #1 + bne .Lskipfinalcopy + add r2, r3, #288 + add r4, r3, #144 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4}, [r2, : 64] + vst1.8 {d0-d1}, [r4, : 128]! + vst1.8 {d2-d3}, [r4, : 128]! + vst1.8 d4, [r4, : 64] +.Lskipfinalcopy: + add r1, r1, #1 + cmp r1, #12 + blo .Linvertloop + add r1, r3, #144 + ldr r2, [r1], #4 + ldr r3, [r1], #4 + ldr r4, [r1], #4 + ldr r5, [r1], #4 + ldr r6, [r1], #4 + ldr r7, [r1], #4 + ldr r8, [r1], #4 + ldr r9, [r1], #4 + ldr r10, [r1], #4 + ldr r1, [r1] + add r11, r1, r1, LSL #4 + add r11, r11, r1, LSL #1 + add r11, r11, #16777216 + mov r11, r11, ASR #25 + add r11, r11, r2 + mov r11, r11, ASR #26 + add r11, r11, r3 + mov r11, r11, ASR #25 + add r11, r11, r4 + mov r11, r11, ASR #26 + add r11, r11, r5 + mov r11, r11, ASR #25 + add r11, r11, r6 + mov r11, r11, ASR #26 + add r11, r11, r7 + mov r11, r11, ASR #25 + add r11, r11, r8 + mov r11, r11, ASR #26 + add r11, r11, r9 + mov r11, r11, ASR #25 + add r11, r11, r10 + mov r11, r11, ASR #26 + add r11, r11, r1 + mov r11, r11, ASR #25 + add r2, r2, r11 + add r2, r2, r11, LSL #1 + add r2, r2, r11, LSL #4 + mov r11, r2, ASR #26 + add r3, r3, r11 + sub r2, r2, r11, LSL #26 + mov r11, r3, ASR #25 + add r4, r4, r11 + sub r3, r3, r11, LSL #25 + mov r11, r4, ASR #26 + add r5, r5, r11 + sub r4, r4, r11, LSL #26 + mov r11, r5, ASR #25 + add r6, r6, r11 + sub r5, r5, r11, LSL #25 + mov r11, r6, ASR #26 + add r7, r7, r11 + sub r6, r6, r11, LSL #26 + mov r11, r7, ASR #25 + add r8, r8, r11 + sub r7, r7, r11, LSL #25 + mov r11, r8, ASR #26 + add r9, r9, r11 + sub r8, r8, r11, LSL #26 + mov r11, r9, ASR #25 + add r10, r10, r11 + sub r9, r9, r11, LSL #25 + mov r11, r10, ASR #26 + add r1, r1, r11 + sub r10, r10, r11, LSL #26 + mov r11, r1, ASR #25 + sub r1, r1, r11, LSL #25 + add r2, r2, r3, LSL #26 + mov r3, r3, LSR #6 + add r3, r3, r4, LSL #19 + mov r4, r4, LSR #13 + add r4, r4, r5, LSL #13 + mov r5, r5, LSR #19 + add r5, r5, r6, LSL #6 + add r6, r7, r8, LSL #25 + mov r7, r8, LSR #7 + add r7, r7, r9, LSL #19 + mov r8, r9, LSR #13 + add r8, r8, r10, LSL #12 + mov r9, r10, LSR #20 + add r1, r9, r1, LSL #6 + str r2, [r0] + str r3, [r0, #4] + str r4, [r0, #8] + str r5, [r0, #12] + str r6, [r0, #16] + str r7, [r0, #20] + str r8, [r0, #24] + str r1, [r0, #28] + movw r0, #0 + mov sp, ip + pop {r4-r11, pc} +ENDPROC(curve25519_neon) diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c new file mode 100644 index 000000000000..2e9e12d2f642 --- /dev/null +++ b/arch/arm/crypto/curve25519-glue.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This + * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been + * manually reworked for use in kernel space. + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/kpp.h> +#include <crypto/internal/simd.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/jump_label.h> +#include <crypto/curve25519.h> + +asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], + const u8 scalar[CURVE25519_KEY_SIZE], + const u8 point[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + curve25519_neon(out, scalar, point); + kernel_neon_end(); + } else { + curve25519_generic(out, scalar, point); + } +} +EXPORT_SYMBOL(curve25519_arch); + +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) +{ + u8 *secret = kpp_tfm_ctx(tfm); + + if (!len) + curve25519_generate_secret(secret); + else if (len == CURVE25519_KEY_SIZE && + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) + memcpy(secret, buf, CURVE25519_KEY_SIZE); + else + return -EINVAL; + return 0; +} + +static int curve25519_compute_value(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 public_key[CURVE25519_KEY_SIZE]; + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + u8 const *bp; + + if (req->src) { + copied = sg_copy_to_buffer(req->src, + sg_nents_for_len(req->src, + CURVE25519_KEY_SIZE), + public_key, CURVE25519_KEY_SIZE); + if (copied != CURVE25519_KEY_SIZE) + return -EINVAL; + bp = public_key; + } else { + bp = curve25519_base_point; + } + + curve25519_arch(buf, secret, bp); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) +{ + return CURVE25519_KEY_SIZE; +} + +static struct kpp_alg curve25519_alg = { + .base.cra_name = "curve25519", + .base.cra_driver_name = "curve25519-neon", + .base.cra_priority = 200, + .base.cra_module = THIS_MODULE, + .base.cra_ctxsize = CURVE25519_KEY_SIZE, + + .set_secret = curve25519_set_secret, + .generate_public_key = curve25519_compute_value, + .compute_shared_secret = curve25519_compute_value, + .max_size = curve25519_max_size, +}; + +static int __init mod_init(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + return crypto_register_kpp(&curve25519_alg); + } + return 0; +} + +static void __exit mod_exit(void) +{ + if (elf_hwcap & HWCAP_NEON) + crypto_unregister_kpp(&curve25519_alg); +} + +module_init(mod_init); +module_exit(mod_exit); + +MODULE_ALIAS_CRYPTO("curve25519"); +MODULE_ALIAS_CRYPTO("curve25519-neon"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S index c47fe81abcb0..534c9647726d 100644 --- a/arch/arm/crypto/ghash-ce-core.S +++ b/arch/arm/crypto/ghash-ce-core.S @@ -88,6 +88,7 @@ T3_H .req d17 .text + .arch armv8-a .fpu crypto-neon-fp-armv8 .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4 diff --git a/arch/arm/crypto/poly1305-armv4.pl b/arch/arm/crypto/poly1305-armv4.pl new file mode 100644 index 000000000000..6d79498d3115 --- /dev/null +++ b/arch/arm/crypto/poly1305-armv4.pl @@ -0,0 +1,1236 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define poly1305_init poly1305_init_arm +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arm +.globl poly1305_blocks_neon +#endif + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.text + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str r3,[$ctx,#36] @ clear is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + mov r3,#-1 + str r3,[$ctx,#28] @ impossible key power value +# ifndef __KERNEL__ + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +# endif +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +# if !defined(_WIN32) + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# endif +# if defined(__APPLE__) || defined(_WIN32) + ldr r12,[r12] +# endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + it ne + movne r11,r9 + adr r12,.Lpoly1305_emit + orr r11,r11,#1 @ thumb-ify addresses + orr r12,r12,#1 +# else + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) + ite eq + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + +#if __ARM_ARCH__<7 + ldmia $ctx,{$h0-$r3} @ load context + add $ctx,$ctx,#20 + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] +#else + ldr lr,[$ctx,#36] @ is_base2_26 + ldmia $ctx!,{$h0-$h4} @ load hash value + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] + + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $r1,$h1,lsr#6 + adcs $r1,$r1,$h2,lsl#20 + mov $r2,$h2,lsr#12 + adcs $r2,$r2,$h3,lsl#14 + mov $r3,$h3,lsr#18 + adcs $r3,$r3,$h4,lsl#8 + mov $len,#0 + teq lr,#0 + str $len,[$ctx,#16] @ clear is_base2_26 + adc $len,$len,$h4,lsr#24 + + itttt ne + movne $h0,$r0 @ choose between radixes + movne $h1,$r1 + movne $h2,$r2 + movne $h3,$r3 + ldmia $ctx,{$r0-$r3} @ load key + it ne + movne $h4,$len +#endif + + mov lr,$inp + cmp $padbit,#0 + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.align 4 +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input + it hi + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds $h0,$h0,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] @ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmdb $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$ctx; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + stmdb sp!,{r4-r11} + + ldmia $ctx,{$h0-$h4} + +#if __ARM_ARCH__>=7 + ldr ip,[$ctx,#36] @ is_base2_26 + + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $g1,$h1,lsr#6 + adcs $g1,$g1,$h2,lsl#20 + mov $g2,$h2,lsr#12 + adcs $g2,$g2,$h3,lsl#14 + mov $g3,$h3,lsr#18 + adcs $g3,$g3,$h4,lsl#8 + mov $g4,#0 + adc $g4,$g4,$h4,lsr#24 + + tst ip,ip + itttt ne + movne $h0,$g0 + movne $h1,$g1 + movne $h2,$g2 + movne $h3,$g3 + it ne + movne $h4,$g4 +#endif + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? + +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r3,[$ctx,#48] @ first table element + cmp r3,#-1 @ is value impossible? + bne .Lno_init_neon + + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + +.Lno_init_neon: + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + cmp $len,#64 + blo .Lpoly1305_blocks + + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? + bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ set is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lhash_loaded + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lhash_loaded: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! + vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 $D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else +.word OPENSSL_armcap_P-.Lpoly1305_init +# endif +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +#endif +___ +} } +$code.=<<___; +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/arch/arm/crypto/poly1305-core.S_shipped b/arch/arm/crypto/poly1305-core.S_shipped new file mode 100644 index 000000000000..37b71d990293 --- /dev/null +++ b/arch/arm/crypto/poly1305-core.S_shipped @@ -0,0 +1,1158 @@ +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define poly1305_init poly1305_init_arm +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arm +.globl poly1305_blocks_neon +#endif + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.text + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp r1,#0 + str r3,[r0,#0] @ zero hash value + str r3,[r0,#4] + str r3,[r0,#8] + str r3,[r0,#12] + str r3,[r0,#16] + str r3,[r0,#36] @ clear is_base2_26 + add r0,r0,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + mov r3,#-1 + str r3,[r0,#28] @ impossible key power value +# ifndef __KERNEL__ + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +# endif +#endif + ldrb r4,[r1,#0] + mov r10,#0x0fffffff + ldrb r5,[r1,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[r1,#2] + ldrb r7,[r1,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[r1,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[r1,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[r1,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +# if !defined(_WIN32) + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# endif +# if defined(__APPLE__) || defined(_WIN32) + ldr r12,[r12] +# endif +#endif + ldrb r8,[r1,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[r1,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[r1,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[r1,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + it ne + movne r11,r9 + adr r12,.Lpoly1305_emit + orr r11,r11,#1 @ thumb-ify addresses + orr r12,r12,#1 +# else + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) + ite eq + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) +# endif +#endif + ldrb r9,[r1,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[r1,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[r1,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[r1,#14] + and r6,r6,r3 + + ldrb r10,[r1,#15] + orr r7,r7,r8,lsl#8 + str r4,[r0,#0] + orr r7,r7,r9,lsl#16 + str r5,[r0,#4] + orr r7,r7,r10,lsl#24 + str r6,[r0,#8] + and r7,r7,r3 + str r7,[r0,#12] +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands r2,r2,#-16 + beq .Lno_data + + add r2,r2,r1 @ end pointer + sub sp,sp,#32 + +#if __ARM_ARCH__<7 + ldmia r0,{r4-r12} @ load context + add r0,r0,#20 + str r2,[sp,#16] @ offload stuff + str r0,[sp,#12] +#else + ldr lr,[r0,#36] @ is_base2_26 + ldmia r0!,{r4-r8} @ load hash value + str r2,[sp,#16] @ offload stuff + str r0,[sp,#12] + + adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32 + mov r10,r5,lsr#6 + adcs r10,r10,r6,lsl#20 + mov r11,r6,lsr#12 + adcs r11,r11,r7,lsl#14 + mov r12,r7,lsr#18 + adcs r12,r12,r8,lsl#8 + mov r2,#0 + teq lr,#0 + str r2,[r0,#16] @ clear is_base2_26 + adc r2,r2,r8,lsr#24 + + itttt ne + movne r4,r9 @ choose between radixes + movne r5,r10 + movne r6,r11 + movne r7,r12 + ldmia r0,{r9-r12} @ load key + it ne + movne r8,r2 +#endif + + mov lr,r1 + cmp r3,#0 + str r10,[sp,#20] + str r11,[sp,#24] + str r12,[sp,#28] + b .Loop + +.align 4 +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi r8,r8,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds r4,r4,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs r5,r5,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs r6,r6,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add r10,r10,r10,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input + it hi + addhi r8,r8,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds r4,r4,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs r5,r5,r1 + add r10,r10,r10,lsr#2 + adcs r6,r6,r2 +#endif + add r11,r11,r11,lsr#2 + adcs r7,r7,r3 + add r12,r12,r12,lsr#2 + + umull r2,r3,r5,r9 + adc r8,r8,#0 + umull r0,r1,r4,r9 + umlal r2,r3,r8,r10 + umlal r0,r1,r7,r10 + ldr r10,[sp,#20] @ reload r10 + umlal r2,r3,r6,r12 + umlal r0,r1,r5,r12 + umlal r2,r3,r7,r11 + umlal r0,r1,r6,r11 + umlal r2,r3,r4,r10 + str r0,[sp,#0] @ future r4 + mul r0,r11,r8 + ldr r11,[sp,#24] @ reload r11 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future r6 + str r2,[sp,#4] @ future r5 + + mul r2,r12,r8 + eor r3,r3,r3 + umlal r0,r1,r7,r12 + ldr r12,[sp,#28] @ reload r12 + umlal r2,r3,r7,r9 + umlal r0,r1,r6,r9 + umlal r2,r3,r6,r10 + umlal r0,r1,r5,r10 + umlal r2,r3,r5,r11 + umlal r0,r1,r4,r11 + umlal r2,r3,r4,r12 + ldr r4,[sp,#0] + mul r8,r9,r8 + ldr r5,[sp,#4] + + adds r6,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds r7,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add r8,r8,r3 @ h4+=d3>>32 + + and r1,r8,#-4 + and r8,r8,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds r4,r4,r1 + adcs r5,r5,#0 + adcs r6,r6,#0 + adcs r7,r7,#0 + adc r8,r8,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr r0,[sp,#12] + add sp,sp,#32 + stmdb r0,{r4-r8} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + stmdb sp!,{r4-r11} + + ldmia r0,{r3-r7} + +#if __ARM_ARCH__>=7 + ldr ip,[r0,#36] @ is_base2_26 + + adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32 + mov r9,r4,lsr#6 + adcs r9,r9,r5,lsl#20 + mov r10,r5,lsr#12 + adcs r10,r10,r6,lsl#14 + mov r11,r6,lsr#18 + adcs r11,r11,r7,lsl#8 + mov r0,#0 + adc r0,r0,r7,lsr#24 + + tst ip,ip + itttt ne + movne r3,r8 + movne r4,r9 + movne r5,r10 + movne r6,r11 + it ne + movne r7,r0 +#endif + + adds r8,r3,#5 @ compare to modulus + adcs r9,r4,#0 + adcs r10,r5,#0 + adcs r11,r6,#0 + adc r0,r7,#0 + tst r0,#4 @ did it carry/borrow? + +#ifdef __thumb2__ + it ne +#endif + movne r3,r8 + ldr r8,[r2,#0] +#ifdef __thumb2__ + it ne +#endif + movne r4,r9 + ldr r9,[r2,#4] +#ifdef __thumb2__ + it ne +#endif + movne r5,r10 + ldr r10,[r2,#8] +#ifdef __thumb2__ + it ne +#endif + movne r6,r11 + ldr r11,[r2,#12] + + adds r3,r3,r8 + adcs r4,r4,r9 + adcs r5,r5,r10 + adc r6,r6,r11 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 +# endif + str r3,[r1,#0] + str r4,[r1,#4] + str r5,[r1,#8] + str r6,[r1,#12] +#else + strb r3,[r1,#0] + mov r3,r3,lsr#8 + strb r4,[r1,#4] + mov r4,r4,lsr#8 + strb r5,[r1,#8] + mov r5,r5,lsr#8 + strb r6,[r1,#12] + mov r6,r6,lsr#8 + + strb r3,[r1,#1] + mov r3,r3,lsr#8 + strb r4,[r1,#5] + mov r4,r4,lsr#8 + strb r5,[r1,#9] + mov r5,r5,lsr#8 + strb r6,[r1,#13] + mov r6,r6,lsr#8 + + strb r3,[r1,#2] + mov r3,r3,lsr#8 + strb r4,[r1,#6] + mov r4,r4,lsr#8 + strb r5,[r1,#10] + mov r5,r5,lsr#8 + strb r6,[r1,#14] + mov r6,r6,lsr#8 + + strb r3,[r1,#3] + strb r4,[r1,#7] + strb r5,[r1,#11] + strb r6,[r1,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r3,[r0,#48] @ first table element + cmp r3,#-1 @ is value impossible? + bne .Lno_init_neon + + ldr r4,[r0,#20] @ load key base 2^32 + ldr r5,[r0,#24] + ldr r6,[r0,#28] + ldr r7,[r0,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 d0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 d1,r3 + add r3,r4,r4,lsl#2 + vdup.32 d2,r2 + vdup.32 d3,r4 + add r4,r5,r5,lsl#2 + vdup.32 d4,r3 + vdup.32 d5,r5 + add r5,r6,r6,lsl#2 + vdup.32 d6,r4 + vdup.32 d7,r6 + vdup.32 d8,r5 + + mov r5,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 q5,d0,d0[1] + vmull.u32 q6,d1,d0[1] + vmull.u32 q7,d3,d0[1] + vmull.u32 q8,d5,d0[1] + vmull.u32 q9,d7,d0[1] + + vmlal.u32 q5,d7,d2[1] + vmlal.u32 q6,d0,d1[1] + vmlal.u32 q7,d1,d1[1] + vmlal.u32 q8,d3,d1[1] + vmlal.u32 q9,d5,d1[1] + + vmlal.u32 q5,d5,d4[1] + vmlal.u32 q6,d7,d4[1] + vmlal.u32 q8,d1,d3[1] + vmlal.u32 q7,d0,d3[1] + vmlal.u32 q9,d3,d3[1] + + vmlal.u32 q5,d3,d6[1] + vmlal.u32 q8,d0,d5[1] + vmlal.u32 q6,d5,d6[1] + vmlal.u32 q7,d7,d6[1] + vmlal.u32 q9,d1,d5[1] + + vmlal.u32 q8,d7,d8[1] + vmlal.u32 q5,d1,d8[1] + vmlal.u32 q6,d3,d8[1] + vmlal.u32 q7,d5,d8[1] + vmlal.u32 q9,d0,d7[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 q15,q8,#26 + vmovn.i64 d16,q8 + vshr.u64 q4,q5,#26 + vmovn.i64 d10,q5 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vbic.i32 d16,#0xfc000000 @ &=0x03ffffff + vadd.i64 q6,q6,q4 @ h0 -> h1 + vbic.i32 d10,#0xfc000000 + + vshrn.u64 d30,q9,#26 + vmovn.i64 d18,q9 + vshr.u64 q4,q6,#26 + vmovn.i64 d12,q6 + vadd.i64 q7,q7,q4 @ h1 -> h2 + vbic.i32 d18,#0xfc000000 + vbic.i32 d12,#0xfc000000 + + vadd.i32 d10,d10,d30 + vshl.u32 d30,d30,#2 + vshrn.u64 d8,q7,#26 + vmovn.i64 d14,q7 + vadd.i32 d10,d10,d30 @ h4 -> h0 + vadd.i32 d16,d16,d8 @ h2 -> h3 + vbic.i32 d14,#0xfc000000 + + vshr.u32 d30,d10,#26 + vbic.i32 d10,#0xfc000000 + vshr.u32 d8,d16,#26 + vbic.i32 d16,#0xfc000000 + vadd.i32 d12,d12,d30 @ h0 -> h1 + vadd.i32 d18,d18,d8 @ h3 -> h4 + + subs r5,r5,#1 + beq .Lsquare_break_neon + + add r6,r0,#(48+0*9*4) + add r7,r0,#(48+1*9*4) + + vtrn.32 d0,d10 @ r^2:r^1 + vtrn.32 d3,d14 + vtrn.32 d5,d16 + vtrn.32 d1,d12 + vtrn.32 d7,d18 + + vshl.u32 d4,d3,#2 @ *5 + vshl.u32 d6,d5,#2 + vshl.u32 d2,d1,#2 + vshl.u32 d8,d7,#2 + vadd.i32 d4,d4,d3 + vadd.i32 d2,d2,d1 + vadd.i32 d6,d6,d5 + vadd.i32 d8,d8,d7 + + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vst1.32 {d8[0]},[r6,:32] + vst1.32 {d8[1]},[r7,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add r6,r0,#(48+2*4*9) + add r7,r0,#(48+3*4*9) + + vmov d0,d10 @ r^4:r^3 + vshl.u32 d2,d12,#2 @ *5 + vmov d1,d12 + vshl.u32 d4,d14,#2 + vmov d3,d14 + vshl.u32 d6,d16,#2 + vmov d5,d16 + vshl.u32 d8,d18,#2 + vmov d7,d18 + vadd.i32 d2,d2,d12 + vadd.i32 d4,d4,d14 + vadd.i32 d6,d6,d16 + vadd.i32 d8,d8,d18 + + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vst1.32 {d8[0]},[r6] + vst1.32 {d8[1]},[r7] + +.Lno_init_neon: + bx lr @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr ip,[r0,#36] @ is_base2_26 + + cmp r2,#64 + blo .Lpoly1305_blocks + + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? + bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[r0,#0] @ load hash value base 2^32 + ldr r5,[r0,#4] + ldr r6,[r0,#8] + ldr r7,[r0,#12] + ldr ip,[r0,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor d10,d10,d10 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor d12,d12,d12 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor d14,d14,d14 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor d16,d16,d16 + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor d18,d18,d18 + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[r0,#36] @ set is_base2_26 + + vmov.32 d10[0],r2 + vmov.32 d12[0],r3 + vmov.32 d14[0],r4 + vmov.32 d16[0],r5 + vmov.32 d18[0],r6 + adr r5,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lhash_loaded + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor d10,d10,d10 + veor d12,d12,d12 + veor d14,d14,d14 + veor d16,d16,d16 + veor d18,d18,d18 + vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! + adr r5,.Lzeros + vld1.32 {d18[0]},[r0] + sub r0,r0,#16 @ rewind + +.Lhash_loaded: + add r4,r1,#32 + mov r3,r3,lsl#24 + tst r2,#31 + beq .Leven + + vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]! + vmov.32 d28[0],r3 + sub r2,r2,#16 + add r4,r1,#32 + +# ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q13,q13 + vrev32.8 q11,q11 + vrev32.8 q12,q12 +# endif + vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26 + vshl.u32 d26,d26,#18 + + vsri.u32 d26,d24,#14 + vshl.u32 d24,d24,#12 + vadd.i32 d29,d28,d18 @ add hash value and move to #hi + + vbic.i32 d26,#0xfc000000 + vsri.u32 d24,d22,#20 + vshl.u32 d22,d22,#6 + + vbic.i32 d24,#0xfc000000 + vsri.u32 d22,d20,#26 + vadd.i32 d27,d26,d16 + + vbic.i32 d20,#0xfc000000 + vbic.i32 d22,#0xfc000000 + vadd.i32 d25,d24,d14 + + vadd.i32 d21,d20,d10 + vadd.i32 d23,d22,d12 + + mov r7,r5 + add r6,r0,#48 + + cmp r2,r2 + b .Long_tail + +.align 4 +.Leven: + subs r2,r2,#64 + it lo + movlo r4,r5 + + vmov.i32 q14,#1<<24 @ padbit, yes, always + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] + add r1,r1,#64 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) + add r4,r4,#64 + itt hi + addhi r7,r0,#(48+1*9*4) + addhi r6,r0,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q13,q13 + vrev32.8 q11,q11 + vrev32.8 q12,q12 +# endif + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 + vshl.u32 q13,q13,#18 + + vsri.u32 q13,q12,#14 + vshl.u32 q12,q12,#12 + + vbic.i32 q13,#0xfc000000 + vsri.u32 q12,q11,#20 + vshl.u32 q11,q11,#6 + + vbic.i32 q12,#0xfc000000 + vsri.u32 q11,q10,#26 + + vbic.i32 q10,#0xfc000000 + vbic.i32 q11,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ ___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ ___________________/ ____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 d24,d24,d14 @ accumulate inp[0:1] + vmull.u32 q7,d25,d0[1] + vadd.i32 d20,d20,d10 + vmull.u32 q5,d21,d0[1] + vadd.i32 d26,d26,d16 + vmull.u32 q8,d27,d0[1] + vmlal.u32 q7,d23,d1[1] + vadd.i32 d22,d22,d12 + vmull.u32 q6,d23,d0[1] + + vadd.i32 d28,d28,d18 + vmull.u32 q9,d29,d0[1] + subs r2,r2,#64 + vmlal.u32 q5,d29,d2[1] + it lo + movlo r4,r5 + vmlal.u32 q8,d25,d1[1] + vld1.32 d8[1],[r7,:32] + vmlal.u32 q6,d21,d1[1] + vmlal.u32 q9,d27,d1[1] + + vmlal.u32 q5,d27,d4[1] + vmlal.u32 q8,d23,d3[1] + vmlal.u32 q9,d25,d3[1] + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d21,d3[1] + + vmlal.u32 q8,d21,d5[1] + vmlal.u32 q5,d25,d6[1] + vmlal.u32 q9,d23,d5[1] + vmlal.u32 q6,d27,d6[1] + vmlal.u32 q7,d29,d6[1] + + vmlal.u32 q8,d29,d8[1] + vmlal.u32 q5,d23,d8[1] + vmlal.u32 q9,d21,d7[1] + vmlal.u32 q6,d25,d8[1] + vmlal.u32 q7,d27,d8[1] + + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) + add r4,r4,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 q8,d26,d0[0] + vmlal.u32 q5,d20,d0[0] + vmlal.u32 q9,d28,d0[0] + vmlal.u32 q6,d22,d0[0] + vmlal.u32 q7,d24,d0[0] + vld1.32 d8[0],[r6,:32] + + vmlal.u32 q8,d24,d1[0] + vmlal.u32 q5,d28,d2[0] + vmlal.u32 q9,d26,d1[0] + vmlal.u32 q6,d20,d1[0] + vmlal.u32 q7,d22,d1[0] + + vmlal.u32 q8,d22,d3[0] + vmlal.u32 q5,d26,d4[0] + vmlal.u32 q9,d24,d3[0] + vmlal.u32 q6,d28,d4[0] + vmlal.u32 q7,d20,d3[0] + + vmlal.u32 q8,d20,d5[0] + vmlal.u32 q5,d24,d6[0] + vmlal.u32 q9,d22,d5[0] + vmlal.u32 q6,d26,d6[0] + vmlal.u32 q8,d28,d8[0] + + vmlal.u32 q7,d28,d6[0] + vmlal.u32 q5,d22,d8[0] + vmlal.u32 q9,d20,d7[0] + vmov.i32 q14,#1<<24 @ padbit, yes, always + vmlal.u32 q6,d24,d8[0] + vmlal.u32 q7,d26,d8[0] + + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] + add r1,r1,#64 +# ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vrev32.8 q12,q12 + vrev32.8 q13,q13 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14. + + vshr.u64 q15,q8,#26 + vmovn.i64 d16,q8 + vshr.u64 q4,q5,#26 + vmovn.i64 d10,q5 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vbic.i32 d16,#0xfc000000 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 + vadd.i64 q6,q6,q4 @ h0 -> h1 + vshl.u32 q13,q13,#18 + vbic.i32 d10,#0xfc000000 + + vshrn.u64 d30,q9,#26 + vmovn.i64 d18,q9 + vshr.u64 q4,q6,#26 + vmovn.i64 d12,q6 + vadd.i64 q7,q7,q4 @ h1 -> h2 + vsri.u32 q13,q12,#14 + vbic.i32 d18,#0xfc000000 + vshl.u32 q12,q12,#12 + vbic.i32 d12,#0xfc000000 + + vadd.i32 d10,d10,d30 + vshl.u32 d30,d30,#2 + vbic.i32 q13,#0xfc000000 + vshrn.u64 d8,q7,#26 + vmovn.i64 d14,q7 + vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec] + vsri.u32 q12,q11,#20 + vadd.i32 d16,d16,d8 @ h2 -> h3 + vshl.u32 q11,q11,#6 + vbic.i32 d14,#0xfc000000 + vbic.i32 q12,#0xfc000000 + + vshrn.u64 d30,q5,#26 @ re-narrow + vmovn.i64 d10,q5 + vsri.u32 q11,q10,#26 + vbic.i32 q10,#0xfc000000 + vshr.u32 d8,d16,#26 + vbic.i32 d16,#0xfc000000 + vbic.i32 d10,#0xfc000000 + vadd.i32 d12,d12,d30 @ h0 -> h1 + vadd.i32 d18,d18,d8 @ h3 -> h4 + vbic.i32 q11,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add r7,r0,#(48+0*9*4) + add r6,r0,#(48+1*9*4) + adds r2,r2,#32 + it ne + movne r2,#0 + bne .Long_tail + + vadd.i32 d25,d24,d14 @ add hash value and move to #hi + vadd.i32 d21,d20,d10 + vadd.i32 d27,d26,d16 + vadd.i32 d23,d22,d12 + vadd.i32 d29,d28,d18 + +.Long_tail: + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2 + + vadd.i32 d24,d24,d14 @ can be redundant + vmull.u32 q7,d25,d0 + vadd.i32 d20,d20,d10 + vmull.u32 q5,d21,d0 + vadd.i32 d26,d26,d16 + vmull.u32 q8,d27,d0 + vadd.i32 d22,d22,d12 + vmull.u32 q6,d23,d0 + vadd.i32 d28,d28,d18 + vmull.u32 q9,d29,d0 + + vmlal.u32 q5,d29,d2 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vmlal.u32 q8,d25,d1 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vmlal.u32 q6,d21,d1 + vmlal.u32 q9,d27,d1 + vmlal.u32 q7,d23,d1 + + vmlal.u32 q8,d23,d3 + vld1.32 d8[1],[r7,:32] + vmlal.u32 q5,d27,d4 + vld1.32 d8[0],[r6,:32] + vmlal.u32 q9,d25,d3 + vmlal.u32 q6,d29,d4 + vmlal.u32 q7,d21,d3 + + vmlal.u32 q8,d21,d5 + it ne + addne r7,r0,#(48+2*9*4) + vmlal.u32 q5,d25,d6 + it ne + addne r6,r0,#(48+3*9*4) + vmlal.u32 q9,d23,d5 + vmlal.u32 q6,d27,d6 + vmlal.u32 q7,d29,d6 + + vmlal.u32 q8,d29,d8 + vorn q0,q0,q0 @ all-ones, can be redundant + vmlal.u32 q5,d23,d8 + vshr.u64 q0,q0,#38 + vmlal.u32 q9,d21,d7 + vmlal.u32 q6,d25,d8 + vmlal.u32 q7,d27,d8 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 + + vmlal.u32 q7,d24,d0 + vmlal.u32 q5,d20,d0 + vmlal.u32 q8,d26,d0 + vmlal.u32 q6,d22,d0 + vmlal.u32 q9,d28,d0 + + vmlal.u32 q5,d28,d2 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vmlal.u32 q8,d24,d1 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vmlal.u32 q6,d20,d1 + vmlal.u32 q9,d26,d1 + vmlal.u32 q7,d22,d1 + + vmlal.u32 q8,d22,d3 + vld1.32 d8[1],[r7,:32] + vmlal.u32 q5,d26,d4 + vld1.32 d8[0],[r6,:32] + vmlal.u32 q9,d24,d3 + vmlal.u32 q6,d28,d4 + vmlal.u32 q7,d20,d3 + + vmlal.u32 q8,d20,d5 + vmlal.u32 q5,d24,d6 + vmlal.u32 q9,d22,d5 + vmlal.u32 q6,d26,d6 + vmlal.u32 q7,d28,d6 + + vmlal.u32 q8,d28,d8 + vorn q0,q0,q0 @ all-ones + vmlal.u32 q5,d22,d8 + vshr.u64 q0,q0,#38 + vmlal.u32 q9,d20,d7 + vmlal.u32 q6,d24,d8 + vmlal.u32 q7,d26,d8 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 d16,d16,d17 + vadd.i64 d10,d10,d11 + vadd.i64 d18,d18,d19 + vadd.i64 d12,d12,d13 + vadd.i64 d14,d14,d15 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 q15,q8,#26 + vand.i64 q8,q8,q0 + vshr.u64 q4,q5,#26 + vand.i64 q5,q5,q0 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vadd.i64 q6,q6,q4 @ h0 -> h1 + + vshr.u64 q15,q9,#26 + vand.i64 q9,q9,q0 + vshr.u64 q4,q6,#26 + vand.i64 q6,q6,q0 + vadd.i64 q7,q7,q4 @ h1 -> h2 + + vadd.i64 q5,q5,q15 + vshl.u64 q15,q15,#2 + vshr.u64 q4,q7,#26 + vand.i64 q7,q7,q0 + vadd.i64 q5,q5,q15 @ h4 -> h0 + vadd.i64 q8,q8,q4 @ h2 -> h3 + + vshr.u64 q15,q5,#26 + vand.i64 q5,q5,q0 + vshr.u64 q4,q8,#26 + vand.i64 q8,q8,q0 + vadd.i64 q6,q6,q15 @ h0 -> h1 + vadd.i64 q9,q9,q4 @ h3 -> h4 + + cmp r2,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! + vst1.32 {d18[0]},[r0] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} + bx lr @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else +.word OPENSSL_armcap_P-.Lpoly1305_init +# endif +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +#endif +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm" +.align 2 diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c new file mode 100644 index 000000000000..74a725ac89c9 --- /dev/null +++ b/arch/arm/crypto/poly1305-glue.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/jump_label.h> +#include <linux/module.h> + +void poly1305_init_arm(void *state, const u8 *key); +void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit); +void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce); + +void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit) +{ +} + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +{ + poly1305_init_arm(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(key + 16); + dctx->s[1] = get_unaligned_le32(key + 20); + dctx->s[2] = get_unaligned_le32(key + 24); + dctx->s[3] = get_unaligned_le32(key + 28); + dctx->buflen = 0; +} +EXPORT_SYMBOL(poly1305_init_arch); + +static int arm_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, + u32 len, u32 hibit, bool do_neon) +{ + if (unlikely(!dctx->sset)) { + if (!dctx->rset) { + poly1305_init_arm(&dctx->h, src); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + if (len < POLY1305_BLOCK_SIZE) + return; + } + + len &= ~(POLY1305_BLOCK_SIZE - 1); + + if (static_branch_likely(&have_neon) && likely(do_neon)) + poly1305_blocks_neon(&dctx->h, src, len, hibit); + else + poly1305_blocks_arm(&dctx->h, src, len, hibit); +} + +static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx, + const u8 *src, u32 len, bool do_neon) +{ + if (unlikely(dctx->buflen)) { + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + len -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + arm_poly1305_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE, 1, false); + dctx->buflen = 0; + } + } + + if (likely(len >= POLY1305_BLOCK_SIZE)) { + arm_poly1305_blocks(dctx, src, len, 1, do_neon); + src += round_down(len, POLY1305_BLOCK_SIZE); + len %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(len)) { + dctx->buflen = len; + memcpy(dctx->buf, src, len); + } +} + +static int arm_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + arm_poly1305_do_update(dctx, src, srclen, false); + return 0; +} + +static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc, + const u8 *src, + unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + bool do_neon = crypto_simd_usable() && srclen > 128; + + if (static_branch_likely(&have_neon) && do_neon) + kernel_neon_begin(); + arm_poly1305_do_update(dctx, src, srclen, do_neon); + if (static_branch_likely(&have_neon) && do_neon) + kernel_neon_end(); + return 0; +} + +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int nbytes) +{ + bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + crypto_simd_usable(); + + if (unlikely(dctx->buflen)) { + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + nbytes -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_blocks_arm(&dctx->h, dctx->buf, + POLY1305_BLOCK_SIZE, 1); + dctx->buflen = 0; + } + } + + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); + + if (static_branch_likely(&have_neon) && do_neon) { + kernel_neon_begin(); + poly1305_blocks_neon(&dctx->h, src, len, 1); + kernel_neon_end(); + } else { + poly1305_blocks_arm(&dctx->h, src, len, 1); + } + src += len; + nbytes %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(nbytes)) { + dctx->buflen = nbytes; + memcpy(dctx->buf, src, nbytes); + } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) +{ + __le32 digest[4]; + u64 f = 0; + + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); + } + + poly1305_emit_arm(&dctx->h, digest, dctx->s); + + /* mac = (h + s) % (2^128) */ + f = (f >> 32) + le32_to_cpu(digest[0]); + put_unaligned_le32(f, dst); + f = (f >> 32) + le32_to_cpu(digest[1]); + put_unaligned_le32(f, dst + 4); + f = (f >> 32) + le32_to_cpu(digest[2]); + put_unaligned_le32(f, dst + 8); + f = (f >> 32) + le32_to_cpu(digest[3]); + put_unaligned_le32(f, dst + 12); + + *dctx = (struct poly1305_desc_ctx){}; +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int arm_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_arch(dctx, dst); + return 0; +} + +static struct shash_alg arm_poly1305_algs[] = {{ + .init = arm_poly1305_init, + .update = arm_poly1305_update, + .final = arm_poly1305_final, + .digestsize = POLY1305_DIGEST_SIZE, + .descsize = sizeof(struct poly1305_desc_ctx), + + .base.cra_name = "poly1305", + .base.cra_driver_name = "poly1305-arm", + .base.cra_priority = 150, + .base.cra_blocksize = POLY1305_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +#ifdef CONFIG_KERNEL_MODE_NEON +}, { + .init = arm_poly1305_init, + .update = arm_poly1305_update_neon, + .final = arm_poly1305_final, + .digestsize = POLY1305_DIGEST_SIZE, + .descsize = sizeof(struct poly1305_desc_ctx), + + .base.cra_name = "poly1305", + .base.cra_driver_name = "poly1305-neon", + .base.cra_priority = 200, + .base.cra_blocksize = POLY1305_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +#endif +}}; + +static int __init arm_poly1305_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + (elf_hwcap & HWCAP_NEON)) + static_branch_enable(&have_neon); + else + /* register only the first entry */ + return crypto_register_shash(&arm_poly1305_algs[0]); + + return crypto_register_shashes(arm_poly1305_algs, + ARRAY_SIZE(arm_poly1305_algs)); +} + +static void __exit arm_poly1305_mod_exit(void) +{ + if (!static_branch_likely(&have_neon)) { + crypto_unregister_shash(&arm_poly1305_algs[0]); + return; + } + crypto_unregister_shashes(arm_poly1305_algs, + ARRAY_SIZE(arm_poly1305_algs)); +} + +module_init(arm_poly1305_mod_init); +module_exit(arm_poly1305_mod_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-arm"); +MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S index 49a74a441aec..8a702e051738 100644 --- a/arch/arm/crypto/sha1-ce-core.S +++ b/arch/arm/crypto/sha1-ce-core.S @@ -10,6 +10,7 @@ #include <asm/assembler.h> .text + .arch armv8-a .fpu crypto-neon-fp-armv8 k0 .req q0 diff --git a/arch/arm/crypto/sha2-ce-core.S b/arch/arm/crypto/sha2-ce-core.S index 4ad517577e23..b6369d2440a1 100644 --- a/arch/arm/crypto/sha2-ce-core.S +++ b/arch/arm/crypto/sha2-ce-core.S @@ -10,6 +10,7 @@ #include <asm/assembler.h> .text + .arch armv8-a .fpu crypto-neon-fp-armv8 k0 .req q7 diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index 8c74037ade22..21b8b271c80d 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -70,8 +70,6 @@ SECTIONS ARM_UNWIND_SECTIONS #endif - NOTES - _etext = .; /* End of text and rodata section */ ARM_VECTORS @@ -114,7 +112,7 @@ SECTIONS . = ALIGN(THREAD_SIZE); _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) .data.ro_after_init : AT(ADDR(.data.ro_after_init) - LOAD_OFFSET) { *(.data..ro_after_init) } diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 23150c0f0f4d..319ccb10846a 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -81,8 +81,6 @@ SECTIONS ARM_UNWIND_SECTIONS #endif - NOTES - #ifdef CONFIG_STRICT_KERNEL_RWX . = ALIGN(1<<SECTION_SHIFT); #else @@ -143,7 +141,7 @@ SECTIONS __init_end = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; BSS_SECTION(0, 0, 0) diff --git a/arch/arm/mach-imx/cpuidle-imx6q.c b/arch/arm/mach-imx/cpuidle-imx6q.c index 39a7d9393641..24dd5bbe60e4 100644 --- a/arch/arm/mach-imx/cpuidle-imx6q.c +++ b/arch/arm/mach-imx/cpuidle-imx6q.c @@ -62,13 +62,13 @@ static struct cpuidle_driver imx6q_cpuidle_driver = { */ void imx6q_cpuidle_fec_irqs_used(void) { - imx6q_cpuidle_driver.states[1].disabled = true; + cpuidle_driver_state_disabled(&imx6q_cpuidle_driver, 1, true); } EXPORT_SYMBOL_GPL(imx6q_cpuidle_fec_irqs_used); void imx6q_cpuidle_fec_irqs_unused(void) { - imx6q_cpuidle_driver.states[1].disabled = false; + cpuidle_driver_state_disabled(&imx6q_cpuidle_driver, 1, false); } EXPORT_SYMBOL_GPL(imx6q_cpuidle_fec_irqs_unused); diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 8f208197988f..1e1e86d17fc5 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -216,9 +216,6 @@ obj-$(CONFIG_MACH_NOKIA_N8X0) += board-n8x0.o # Platform specific device init code -omap-hsmmc-$(CONFIG_MMC_OMAP_HS) := hsmmc.o -obj-y += $(omap-hsmmc-m) $(omap-hsmmc-y) - obj-y += omap_phy_internal.o obj-$(CONFIG_MACH_OMAP2_TUSB6010) += usb-tusb6010.o diff --git a/arch/arm/mach-omap2/common.h b/arch/arm/mach-omap2/common.h index 6316da3623b3..223b37c48389 100644 --- a/arch/arm/mach-omap2/common.h +++ b/arch/arm/mach-omap2/common.h @@ -352,7 +352,6 @@ void omap_pcs_legacy_init(int irq, void (*rearm)(void)); struct omap_sdrc_params; extern void omap_sdrc_init(struct omap_sdrc_params *sdrc_cs0, struct omap_sdrc_params *sdrc_cs1); -struct omap2_hsmmc_info; extern void omap_reserve(void); struct omap_hwmod; diff --git a/arch/arm/mach-omap2/hsmmc.c b/arch/arm/mach-omap2/hsmmc.c deleted file mode 100644 index 63423ea6a240..000000000000 --- a/arch/arm/mach-omap2/hsmmc.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/arm/mach-omap2/hsmmc.c - * - * Copyright (C) 2007-2008 Texas Instruments - * Copyright (C) 2008 Nokia Corporation - * Author: Texas Instruments - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/mmc/host.h> -#include <linux/platform_data/hsmmc-omap.h> - -#include "soc.h" -#include "omap_device.h" - -#include "hsmmc.h" -#include "control.h" - -#if IS_ENABLED(CONFIG_MMC_OMAP_HS) - -static u16 control_pbias_offset; -static u16 control_devconf1_offset; - -#define HSMMC_NAME_LEN 9 - -static int __init omap_hsmmc_pdata_init(struct omap2_hsmmc_info *c, - struct omap_hsmmc_platform_data *mmc) -{ - char *hc_name; - - hc_name = kzalloc(HSMMC_NAME_LEN + 1, GFP_KERNEL); - if (!hc_name) - return -ENOMEM; - - snprintf(hc_name, (HSMMC_NAME_LEN + 1), "mmc%islot%i", c->mmc, 1); - mmc->name = hc_name; - mmc->caps = c->caps; - mmc->reg_offset = 0; - - return 0; -} - -static int omap_hsmmc_done; - -void omap_hsmmc_late_init(struct omap2_hsmmc_info *c) -{ - struct platform_device *pdev; - int res; - - if (omap_hsmmc_done) - return; - - omap_hsmmc_done = 1; - - for (; c->mmc; c++) { - pdev = c->pdev; - if (!pdev) - continue; - res = omap_device_register(pdev); - if (res) - pr_err("Could not late init MMC\n"); - } -} - -#define MAX_OMAP_MMC_HWMOD_NAME_LEN 16 - -static void __init omap_hsmmc_init_one(struct omap2_hsmmc_info *hsmmcinfo, - int ctrl_nr) -{ - struct omap_hwmod *oh; - struct omap_hwmod *ohs[1]; - struct omap_device *od; - struct platform_device *pdev; - char oh_name[MAX_OMAP_MMC_HWMOD_NAME_LEN]; - struct omap_hsmmc_platform_data *mmc_data; - struct omap_hsmmc_dev_attr *mmc_dev_attr; - char *name; - int res; - - mmc_data = kzalloc(sizeof(*mmc_data), GFP_KERNEL); - if (!mmc_data) - return; - - res = omap_hsmmc_pdata_init(hsmmcinfo, mmc_data); - if (res < 0) - goto free_mmc; - - name = "omap_hsmmc"; - res = snprintf(oh_name, MAX_OMAP_MMC_HWMOD_NAME_LEN, - "mmc%d", ctrl_nr); - WARN(res >= MAX_OMAP_MMC_HWMOD_NAME_LEN, - "String buffer overflow in MMC%d device setup\n", ctrl_nr); - - oh = omap_hwmod_lookup(oh_name); - if (!oh) { - pr_err("Could not look up %s\n", oh_name); - goto free_name; - } - ohs[0] = oh; - if (oh->dev_attr != NULL) { - mmc_dev_attr = oh->dev_attr; - mmc_data->controller_flags = mmc_dev_attr->flags; - } - - pdev = platform_device_alloc(name, ctrl_nr - 1); - if (!pdev) { - pr_err("Could not allocate pdev for %s\n", name); - goto free_name; - } - dev_set_name(&pdev->dev, "%s.%d", pdev->name, pdev->id); - - od = omap_device_alloc(pdev, ohs, 1); - if (IS_ERR(od)) { - pr_err("Could not allocate od for %s\n", name); - goto put_pdev; - } - - res = platform_device_add_data(pdev, mmc_data, - sizeof(struct omap_hsmmc_platform_data)); - if (res) { - pr_err("Could not add pdata for %s\n", name); - goto put_pdev; - } - - hsmmcinfo->pdev = pdev; - - res = omap_device_register(pdev); - if (res) { - pr_err("Could not register od for %s\n", name); - goto free_od; - } - - goto free_mmc; - -free_od: - omap_device_delete(od); - -put_pdev: - platform_device_put(pdev); - -free_name: - kfree(mmc_data->name); - -free_mmc: - kfree(mmc_data); -} - -void __init omap_hsmmc_init(struct omap2_hsmmc_info *controllers) -{ - if (omap_hsmmc_done) - return; - - omap_hsmmc_done = 1; - - if (cpu_is_omap2430()) { - control_pbias_offset = OMAP243X_CONTROL_PBIAS_LITE; - control_devconf1_offset = OMAP243X_CONTROL_DEVCONF1; - } else { - control_pbias_offset = OMAP343X_CONTROL_PBIAS_LITE; - control_devconf1_offset = OMAP343X_CONTROL_DEVCONF1; - } - - for (; controllers->mmc; controllers++) - omap_hsmmc_init_one(controllers, controllers->mmc); - -} - -#endif diff --git a/arch/arm/mach-omap2/hsmmc.h b/arch/arm/mach-omap2/hsmmc.h deleted file mode 100644 index 76c5ed2afa72..000000000000 --- a/arch/arm/mach-omap2/hsmmc.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * MMC definitions for OMAP2 - */ - -struct mmc_card; - -struct omap2_hsmmc_info { - u8 mmc; /* controller 1/2/3 */ - u32 caps; /* 4/8 wires and any additional host - * capabilities OR'd (ref. linux/mmc/host.h) */ - struct platform_device *pdev; /* mmc controller instance */ - /* init some special card */ - void (*init_card)(struct mmc_card *card); -}; - -#if IS_ENABLED(CONFIG_MMC_OMAP_HS) - -void omap_hsmmc_init(struct omap2_hsmmc_info *); -void omap_hsmmc_late_init(struct omap2_hsmmc_info *); - -#else - -static inline void omap_hsmmc_init(struct omap2_hsmmc_info *info) -{ -} - -static inline void omap_hsmmc_late_init(struct omap2_hsmmc_info *info) -{ -} - -#endif diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 2efd18e8824c..c47a2afc91e5 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -7,7 +7,6 @@ #include <linux/clk.h> #include <linux/davinci_emac.h> #include <linux/gpio.h> -#include <linux/gpio/machine.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/of_platform.h> @@ -33,7 +32,6 @@ #include "omap_device.h" #include "omap-secure.h" #include "soc.h" -#include "hsmmc.h" static struct omap_hsmmc_platform_data __maybe_unused mmc_pdata[2]; @@ -269,21 +267,13 @@ static void __init am3517_evm_legacy_init(void) am35xx_emac_reset(); } -static struct platform_device omap3_rom_rng_device = { - .name = "omap3-rom-rng", - .id = -1, - .dev = { - .platform_data = rx51_secure_rng_call, - }, -}; - static void __init nokia_n900_legacy_init(void) { hsmmc2_internal_input_clk(); mmc_pdata[0].name = "external"; mmc_pdata[1].name = "internal"; - if (omap_type() == OMAP2_DEVICE_TYPE_SEC) { + if (omap_type() != OMAP2_DEVICE_TYPE_GP) { if (IS_ENABLED(CONFIG_ARM_ERRATA_430973)) { pr_info("RX-51: Enabling ARM errata 430973 workaround\n"); /* set IBE to 1 */ @@ -292,9 +282,6 @@ static void __init nokia_n900_legacy_init(void) pr_warn("RX-51: Not enabling ARM errata 430973 workaround\n"); pr_warn("Thumb binaries may crash randomly without this workaround\n"); } - - pr_info("RX-51: Registering OMAP3 HWRNG device\n"); - platform_device_register(&omap3_rom_rng_device); } } @@ -311,118 +298,15 @@ static void __init omap3_logicpd_torpedo_init(void) } /* omap3pandora legacy devices */ -#define PANDORA_WIFI_IRQ_GPIO 21 -#define PANDORA_WIFI_NRESET_GPIO 23 static struct platform_device pandora_backlight = { .name = "pandora-backlight", .id = -1, }; -static struct regulator_consumer_supply pandora_vmmc3_supply[] = { - REGULATOR_SUPPLY("vmmc", "omap_hsmmc.2"), -}; - -static struct regulator_init_data pandora_vmmc3 = { - .constraints = { - .valid_ops_mask = REGULATOR_CHANGE_STATUS, - }, - .num_consumer_supplies = ARRAY_SIZE(pandora_vmmc3_supply), - .consumer_supplies = pandora_vmmc3_supply, -}; - -static struct fixed_voltage_config pandora_vwlan = { - .supply_name = "vwlan", - .microvolts = 1800000, /* 1.8V */ - .startup_delay = 50000, /* 50ms */ - .init_data = &pandora_vmmc3, -}; - -static struct platform_device pandora_vwlan_device = { - .name = "reg-fixed-voltage", - .id = 1, - .dev = { - .platform_data = &pandora_vwlan, - }, -}; - -static struct gpiod_lookup_table pandora_vwlan_gpiod_table = { - .dev_id = "reg-fixed-voltage.1", - .table = { - /* - * As this is a low GPIO number it should be at the first - * GPIO bank. - */ - GPIO_LOOKUP("gpio-0-31", PANDORA_WIFI_NRESET_GPIO, - NULL, GPIO_ACTIVE_HIGH), - { }, - }, -}; - -static void pandora_wl1251_init_card(struct mmc_card *card) -{ - /* - * We have TI wl1251 attached to MMC3. Pass this information to - * SDIO core because it can't be probed by normal methods. - */ - if (card->type == MMC_TYPE_SDIO || card->type == MMC_TYPE_SD_COMBO) { - card->quirks |= MMC_QUIRK_NONSTD_SDIO; - card->cccr.wide_bus = 1; - card->cis.vendor = 0x104c; - card->cis.device = 0x9066; - card->cis.blksize = 512; - card->cis.max_dtr = 24000000; - card->ocr = 0x80; - } -} - -static struct omap2_hsmmc_info pandora_mmc3[] = { - { - .mmc = 3, - .caps = MMC_CAP_4_BIT_DATA | MMC_CAP_POWER_OFF_CARD, - .init_card = pandora_wl1251_init_card, - }, - {} /* Terminator */ -}; - -static void __init pandora_wl1251_init(void) -{ - struct wl1251_platform_data pandora_wl1251_pdata; - int ret; - - memset(&pandora_wl1251_pdata, 0, sizeof(pandora_wl1251_pdata)); - - pandora_wl1251_pdata.power_gpio = -1; - - ret = gpio_request_one(PANDORA_WIFI_IRQ_GPIO, GPIOF_IN, "wl1251 irq"); - if (ret < 0) - goto fail; - - pandora_wl1251_pdata.irq = gpio_to_irq(PANDORA_WIFI_IRQ_GPIO); - if (pandora_wl1251_pdata.irq < 0) - goto fail_irq; - - pandora_wl1251_pdata.use_eeprom = true; - ret = wl1251_set_platform_data(&pandora_wl1251_pdata); - if (ret < 0) - goto fail_irq; - - return; - -fail_irq: - gpio_free(PANDORA_WIFI_IRQ_GPIO); -fail: - pr_err("wl1251 board initialisation failed\n"); -} - static void __init omap3_pandora_legacy_init(void) { platform_device_register(&pandora_backlight); - gpiod_add_lookup_table(&pandora_vwlan_gpiod_table); - platform_device_register(&pandora_vwlan_device); - omap_hsmmc_init(pandora_mmc3); - omap_hsmmc_late_init(pandora_mmc3); - pandora_wl1251_init(); } #endif /* CONFIG_ARCH_OMAP3 */ @@ -638,6 +522,7 @@ static struct of_dev_auxdata omap_auxdata_lookup[] = { OF_DEV_AUXDATA("ti,davinci_mdio", 0x5c030000, "davinci_mdio.0", NULL), OF_DEV_AUXDATA("ti,am3517-emac", 0x5c000000, "davinci_emac.0", &am35xx_emac_pdata), + OF_DEV_AUXDATA("nokia,n900-rom-rng", 0, NULL, rx51_secure_rng_call), /* McBSP modules with sidetone core */ #if IS_ENABLED(CONFIG_SND_SOC_OMAP_MCBSP) OF_DEV_AUXDATA("ti,omap3-mcbsp", 0x49022000, "49022000.mcbsp", &mcbsp_pdata), diff --git a/arch/arm/mach-pxa/icontrol.c b/arch/arm/mach-pxa/icontrol.c index 865b10344ea2..0474a4b1394d 100644 --- a/arch/arm/mach-pxa/icontrol.c +++ b/arch/arm/mach-pxa/icontrol.c @@ -12,6 +12,7 @@ #include <linux/irq.h> #include <linux/platform_device.h> +#include <linux/property.h> #include <linux/gpio.h> #include <asm/mach-types.h> @@ -22,7 +23,6 @@ #include <linux/spi/spi.h> #include <linux/spi/pxa2xx_spi.h> -#include <linux/can/platform/mcp251x.h> #include <linux/regulator/machine.h> #include "generic.h" @@ -69,8 +69,9 @@ static struct pxa2xx_spi_chip mcp251x_chip_info4 = { .gpio_cs = ICONTROL_MCP251x_nCS4 }; -static struct mcp251x_platform_data mcp251x_info = { - .oscillator_frequency = 16E6, +static const struct property_entry mcp251x_properties[] = { + PROPERTY_ENTRY_U32("clock-frequency", 16000000), + {} }; static struct spi_board_info mcp251x_board_info[] = { @@ -79,7 +80,7 @@ static struct spi_board_info mcp251x_board_info[] = { .max_speed_hz = 6500000, .bus_num = 3, .chip_select = 0, - .platform_data = &mcp251x_info, + .properties = mcp251x_properties, .controller_data = &mcp251x_chip_info1, .irq = PXA_GPIO_TO_IRQ(ICONTROL_MCP251x_nIRQ1) }, diff --git a/arch/arm/mach-pxa/zeus.c b/arch/arm/mach-pxa/zeus.c index da113c8eefbf..b27fc7ac9cea 100644 --- a/arch/arm/mach-pxa/zeus.c +++ b/arch/arm/mach-pxa/zeus.c @@ -13,6 +13,7 @@ #include <linux/leds.h> #include <linux/irq.h> #include <linux/pm.h> +#include <linux/property.h> #include <linux/gpio.h> #include <linux/gpio/machine.h> #include <linux/serial_8250.h> @@ -27,7 +28,6 @@ #include <linux/platform_data/i2c-pxa.h> #include <linux/platform_data/pca953x.h> #include <linux/apm-emulation.h> -#include <linux/can/platform/mcp251x.h> #include <linux/regulator/fixed.h> #include <linux/regulator/machine.h> @@ -428,14 +428,15 @@ static struct gpiod_lookup_table can_regulator_gpiod_table = { }, }; -static struct mcp251x_platform_data zeus_mcp2515_pdata = { - .oscillator_frequency = 16*1000*1000, +static const struct property_entry mcp251x_properties[] = { + PROPERTY_ENTRY_U32("clock-frequency", 16000000), + {} }; static struct spi_board_info zeus_spi_board_info[] = { [0] = { .modalias = "mcp2515", - .platform_data = &zeus_mcp2515_pdata, + .properties = mcp251x_properties, .irq = PXA_GPIO_TO_IRQ(ZEUS_CAN_GPIO), .max_speed_hz = 1*1000*1000, .bus_num = 3, diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c index 2447427cb4a8..69f3fa270fbe 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra20.c +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c @@ -203,7 +203,7 @@ void tegra20_cpuidle_pcie_irqs_in_use(void) { pr_info_once( "Disabling cpuidle LP2 state, since PCIe IRQs are in use\n"); - tegra_idle_driver.states[1].disabled = true; + cpuidle_driver_state_disabled(&tegra_idle_driver, 1, true); } int __init tegra20_cpuidle_init(void) diff --git a/arch/arm/plat-pxa/ssp.c b/arch/arm/plat-pxa/ssp.c index 9a6e4923bd69..563440315acd 100644 --- a/arch/arm/plat-pxa/ssp.c +++ b/arch/arm/plat-pxa/ssp.c @@ -89,7 +89,7 @@ void pxa_ssp_free(struct ssp_device *ssp) ssp->use_count--; ssp->label = NULL; } else - dev_err(&ssp->pdev->dev, "device already free\n"); + dev_err(ssp->dev, "device already free\n"); mutex_unlock(&ssp_lock); } EXPORT_SYMBOL(pxa_ssp_free); @@ -118,7 +118,7 @@ static int pxa_ssp_probe(struct platform_device *pdev) if (ssp == NULL) return -ENOMEM; - ssp->pdev = pdev; + ssp->dev = dev; ssp->clk = devm_clk_get(dev, NULL); if (IS_ERR(ssp->clk)) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d66a9727344d..afe6412fe769 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -67,7 +67,7 @@ config ARM64 select ARCH_USE_QUEUED_SPINLOCKS select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @@ -182,7 +182,6 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY - select REFCOUNT_FULL select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts index 1d05d570142f..ce4b0679839d 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts @@ -252,6 +252,7 @@ }; &r_ir { + linux,rc-map-name = "rc-beelink-gs1"; status = "okay"; }; diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 4922c4451e7c..b8eb0453123d 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -86,7 +86,7 @@ config CRYPTO_AES_ARM64_CE_CCM config CRYPTO_AES_ARM64_CE_BLK tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER + select CRYPTO_SKCIPHER select CRYPTO_AES_ARM64_CE select CRYPTO_AES_ARM64 select CRYPTO_SIMD @@ -94,7 +94,7 @@ config CRYPTO_AES_ARM64_CE_BLK config CRYPTO_AES_ARM64_NEON_BLK tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER + select CRYPTO_SKCIPHER select CRYPTO_AES_ARM64 select CRYPTO_LIB_AES select CRYPTO_SIMD @@ -102,8 +102,15 @@ config CRYPTO_AES_ARM64_NEON_BLK config CRYPTO_CHACHA20_NEON tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions" depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER - select CRYPTO_CHACHA20 + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_NEON + tristate "Poly1305 hash function using scalar or NEON instructions" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_ARCH_HAVE_LIB_POLY1305 config CRYPTO_NHPOLY1305_NEON tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)" @@ -113,7 +120,7 @@ config CRYPTO_NHPOLY1305_NEON config CRYPTO_AES_ARM64_BS tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm" depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER + select CRYPTO_SKCIPHER select CRYPTO_AES_ARM64_NEON_BLK select CRYPTO_AES_ARM64 select CRYPTO_LIB_AES diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 0435f2a0610e..d0901e610df3 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -50,6 +50,10 @@ sha512-arm64-y := sha512-glue.o sha512-core.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o +poly1305-neon-y := poly1305-core.o poly1305-glue.o +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64 + obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o @@ -68,11 +72,15 @@ ifdef REGENERATE_ARM64_CRYPTO quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $(<) void $(@) +$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl + $(call cmd,perlasm) + $(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl $(call cmd,perlasm) $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl $(call cmd,perlasm) + endif -clean-files += sha256-core.S sha512-core.S +clean-files += poly1305-core.S sha256-core.S sha512-core.S diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index ea873b8904c4..e3e27349a9fe 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -384,7 +384,7 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, goto xts_tail; kernel_neon_end(); - skcipher_walk_done(&walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } if (err || likely(!tail)) diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 1495d2b18518..b08029d7bde6 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -1,5 +1,5 @@ /* - * ARM NEON accelerated ChaCha and XChaCha stream ciphers, + * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers, * including ChaCha20 (RFC7539) * * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> @@ -20,9 +20,10 @@ */ #include <crypto/algapi.h> -#include <crypto/chacha.h> +#include <crypto/internal/chacha.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> @@ -36,6 +37,8 @@ asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src, int nrounds, int bytes); asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, int bytes, int nrounds) { @@ -59,6 +62,37 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, } } +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { + hchacha_block_generic(state, stream, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, stream, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, bytes, nrounds); + kernel_neon_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + static int chacha_neon_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { @@ -68,7 +102,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req, err = skcipher_walk_virt(&walk, req, false); - crypto_chacha_init(state, ctx, iv); + chacha_init_generic(state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -76,10 +110,17 @@ static int chacha_neon_stream_xor(struct skcipher_request *req, if (nbytes < walk.total) nbytes = rounddown(nbytes, walk.stride); - kernel_neon_begin(); - chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes, ctx->nrounds); - kernel_neon_end(); + if (!static_branch_likely(&have_neon) || + !crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { + kernel_neon_begin(); + chacha_doneon(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, ctx->nrounds); + kernel_neon_end(); + } err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } @@ -91,9 +132,6 @@ static int chacha_neon(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_chacha_crypt(req); - return chacha_neon_stream_xor(req, ctx, req->iv); } @@ -105,14 +143,8 @@ static int xchacha_neon(struct skcipher_request *req) u32 state[16]; u8 real_iv[16]; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_xchacha_crypt(req); - - crypto_chacha_init(state, ctx, req->iv); - - kernel_neon_begin(); - hchacha_block_neon(state, subctx.key, ctx->nrounds); - kernel_neon_end(); + chacha_init_generic(state, ctx->key, req->iv); + hchacha_block_arch(state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; memcpy(&real_iv[0], req->iv + 24, 8); @@ -134,7 +166,7 @@ static struct skcipher_alg algs[] = { .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = chacha_neon, .decrypt = chacha_neon, }, { @@ -150,7 +182,7 @@ static struct skcipher_alg algs[] = { .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, }, { @@ -166,7 +198,7 @@ static struct skcipher_alg algs[] = { .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha12_setkey, + .setkey = chacha12_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, } @@ -175,14 +207,17 @@ static struct skcipher_alg algs[] = { static int __init chacha_simd_mod_init(void) { if (!cpu_have_named_feature(ASIMD)) - return -ENODEV; + return 0; + + static_branch_enable(&have_neon); return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha_simd_mod_fini(void) { - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); + if (cpu_have_named_feature(ASIMD)) + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha_simd_mod_init); diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index 410e8afcf5a7..a791c4adf8e6 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -13,8 +13,8 @@ T1 .req v2 T2 .req v3 MASK .req v4 - XL .req v5 - XM .req v6 + XM .req v5 + XL .req v6 XH .req v7 IN1 .req v7 @@ -358,20 +358,37 @@ ENTRY(pmull_ghash_update_p8) __pmull_ghash p8 ENDPROC(pmull_ghash_update_p8) - KS0 .req v12 - KS1 .req v13 - INP0 .req v14 - INP1 .req v15 - - .macro load_round_keys, rounds, rk - cmp \rounds, #12 - blo 2222f /* 128 bits */ - beq 1111f /* 192 bits */ - ld1 {v17.4s-v18.4s}, [\rk], #32 -1111: ld1 {v19.4s-v20.4s}, [\rk], #32 -2222: ld1 {v21.4s-v24.4s}, [\rk], #64 - ld1 {v25.4s-v28.4s}, [\rk], #64 - ld1 {v29.4s-v31.4s}, [\rk] + KS0 .req v8 + KS1 .req v9 + KS2 .req v10 + KS3 .req v11 + + INP0 .req v21 + INP1 .req v22 + INP2 .req v23 + INP3 .req v24 + + K0 .req v25 + K1 .req v26 + K2 .req v27 + K3 .req v28 + K4 .req v12 + K5 .req v13 + K6 .req v4 + K7 .req v5 + K8 .req v14 + K9 .req v15 + KK .req v29 + KL .req v30 + KM .req v31 + + .macro load_round_keys, rounds, rk, tmp + add \tmp, \rk, #64 + ld1 {K0.4s-K3.4s}, [\rk] + ld1 {K4.4s-K5.4s}, [\tmp] + add \tmp, \rk, \rounds, lsl #4 + sub \tmp, \tmp, #32 + ld1 {KK.4s-KM.4s}, [\tmp] .endm .macro enc_round, state, key @@ -379,197 +396,367 @@ ENDPROC(pmull_ghash_update_p8) aesmc \state\().16b, \state\().16b .endm - .macro enc_block, state, rounds - cmp \rounds, #12 - b.lo 2222f /* 128 bits */ - b.eq 1111f /* 192 bits */ - enc_round \state, v17 - enc_round \state, v18 -1111: enc_round \state, v19 - enc_round \state, v20 -2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 + .macro enc_qround, s0, s1, s2, s3, key + enc_round \s0, \key + enc_round \s1, \key + enc_round \s2, \key + enc_round \s3, \key + .endm + + .macro enc_block, state, rounds, rk, tmp + add \tmp, \rk, #96 + ld1 {K6.4s-K7.4s}, [\tmp], #32 + .irp key, K0, K1, K2, K3, K4 K5 enc_round \state, \key .endr - aese \state\().16b, v30.16b - eor \state\().16b, \state\().16b, v31.16b + + tbnz \rounds, #2, .Lnot128_\@ +.Lout256_\@: + enc_round \state, K6 + enc_round \state, K7 + +.Lout192_\@: + enc_round \state, KK + aese \state\().16b, KL.16b + eor \state\().16b, \state\().16b, KM.16b + + .subsection 1 +.Lnot128_\@: + ld1 {K8.4s-K9.4s}, [\tmp], #32 + enc_round \state, K6 + enc_round \state, K7 + ld1 {K6.4s-K7.4s}, [\tmp] + enc_round \state, K8 + enc_round \state, K9 + tbz \rounds, #1, .Lout192_\@ + b .Lout256_\@ + .previous .endm + .align 6 .macro pmull_gcm_do_crypt, enc - ld1 {SHASH.2d}, [x4], #16 - ld1 {HH.2d}, [x4] - ld1 {XL.2d}, [x1] - ldr x8, [x5, #8] // load lower counter + stp x29, x30, [sp, #-32]! + mov x29, sp + str x19, [sp, #24] + + load_round_keys x7, x6, x8 + + ld1 {SHASH.2d}, [x3], #16 + ld1 {HH.2d-HH4.2d}, [x3] - movi MASK.16b, #0xe1 trn1 SHASH2.2d, SHASH.2d, HH.2d trn2 T1.2d, SHASH.2d, HH.2d -CPU_LE( rev x8, x8 ) - shl MASK.2d, MASK.2d, #57 eor SHASH2.16b, SHASH2.16b, T1.16b - .if \enc == 1 - ldr x10, [sp] - ld1 {KS0.16b-KS1.16b}, [x10] - .endif + trn1 HH34.2d, HH3.2d, HH4.2d + trn2 T1.2d, HH3.2d, HH4.2d + eor HH34.16b, HH34.16b, T1.16b - cbnz x6, 4f + ld1 {XL.2d}, [x4] -0: ld1 {INP0.16b-INP1.16b}, [x3], #32 + cbz x0, 3f // tag only? - rev x9, x8 - add x11, x8, #1 - add x8, x8, #2 + ldr w8, [x5, #12] // load lower counter +CPU_LE( rev w8, w8 ) - .if \enc == 1 - eor INP0.16b, INP0.16b, KS0.16b // encrypt input - eor INP1.16b, INP1.16b, KS1.16b +0: mov w9, #4 // max blocks per round + add x10, x0, #0xf + lsr x10, x10, #4 // remaining blocks + + subs x0, x0, #64 + csel w9, w10, w9, mi + add w8, w8, w9 + + bmi 1f + ld1 {INP0.16b-INP3.16b}, [x2], #64 + .subsection 1 + /* + * Populate the four input registers right to left with up to 63 bytes + * of data, using overlapping loads to avoid branches. + * + * INP0 INP1 INP2 INP3 + * 1 byte | | | |x | + * 16 bytes | | | |xxxxxxxx| + * 17 bytes | | |xxxxxxxx|x | + * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx | + * etc etc + * + * Note that this code may read up to 15 bytes before the start of + * the input. It is up to the calling code to ensure this is safe if + * this happens in the first iteration of the loop (i.e., when the + * input size is < 16 bytes) + */ +1: mov x15, #16 + ands x19, x0, #0xf + csel x19, x19, x15, ne + adr_l x17, .Lpermute_table + 16 + + sub x11, x15, x19 + add x12, x17, x11 + sub x17, x17, x11 + ld1 {T1.16b}, [x12] + sub x10, x1, x11 + sub x11, x2, x11 + + cmp x0, #-16 + csel x14, x15, xzr, gt + cmp x0, #-32 + csel x15, x15, xzr, gt + cmp x0, #-48 + csel x16, x19, xzr, gt + csel x1, x1, x10, gt + csel x2, x2, x11, gt + + ld1 {INP0.16b}, [x2], x14 + ld1 {INP1.16b}, [x2], x15 + ld1 {INP2.16b}, [x2], x16 + ld1 {INP3.16b}, [x2] + tbl INP3.16b, {INP3.16b}, T1.16b + b 2f + .previous + +2: .if \enc == 0 + bl pmull_gcm_ghash_4x .endif - ld1 {KS0.8b}, [x5] // load upper counter - rev x11, x11 - sub w0, w0, #2 - mov KS1.8b, KS0.8b - ins KS0.d[1], x9 // set lower counter - ins KS1.d[1], x11 + bl pmull_gcm_enc_4x - rev64 T1.16b, INP1.16b + tbnz x0, #63, 6f + st1 {INP0.16b-INP3.16b}, [x1], #64 + .if \enc == 1 + bl pmull_gcm_ghash_4x + .endif + bne 0b - cmp w7, #12 - b.ge 2f // AES-192/256? +3: ldp x19, x10, [sp, #24] + cbz x10, 5f // output tag? -1: enc_round KS0, v21 - ext IN1.16b, T1.16b, T1.16b, #8 + ld1 {INP3.16b}, [x10] // load lengths[] + mov w9, #1 + bl pmull_gcm_ghash_4x - enc_round KS1, v21 - pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + mov w11, #(0x1 << 24) // BE '1U' + ld1 {KS0.16b}, [x5] + mov KS0.s[3], w11 - enc_round KS0, v22 - eor T1.16b, T1.16b, IN1.16b + enc_block KS0, x7, x6, x12 - enc_round KS1, v22 - pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + ext XL.16b, XL.16b, XL.16b, #8 + rev64 XL.16b, XL.16b + eor XL.16b, XL.16b, KS0.16b + st1 {XL.16b}, [x10] // store tag - enc_round KS0, v23 - pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) +4: ldp x29, x30, [sp], #32 + ret - enc_round KS1, v23 - rev64 T1.16b, INP0.16b - ext T2.16b, XL.16b, XL.16b, #8 +5: +CPU_LE( rev w8, w8 ) + str w8, [x5, #12] // store lower counter + st1 {XL.2d}, [x4] + b 4b + +6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors + sub x17, x17, x19, lsl #1 + + cmp w9, #1 + beq 7f + .subsection 1 +7: ld1 {INP2.16b}, [x1] + tbx INP2.16b, {INP3.16b}, T1.16b + mov INP3.16b, INP2.16b + b 8f + .previous + + st1 {INP0.16b}, [x1], x14 + st1 {INP1.16b}, [x1], x15 + st1 {INP2.16b}, [x1], x16 + tbl INP3.16b, {INP3.16b}, T1.16b + tbx INP3.16b, {INP2.16b}, T2.16b +8: st1 {INP3.16b}, [x1] - enc_round KS0, v24 - ext IN1.16b, T1.16b, T1.16b, #8 - eor T1.16b, T1.16b, T2.16b + .if \enc == 1 + ld1 {T1.16b}, [x17] + tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits + bl pmull_gcm_ghash_4x + .endif + b 3b + .endm - enc_round KS1, v24 - eor XL.16b, XL.16b, IN1.16b + /* + * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[], + * struct ghash_key const *k, u64 dg[], u8 ctr[], + * int rounds, u8 tag) + */ +ENTRY(pmull_gcm_encrypt) + pmull_gcm_do_crypt 1 +ENDPROC(pmull_gcm_encrypt) - enc_round KS0, v25 - eor T1.16b, T1.16b, XL.16b + /* + * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[], + * struct ghash_key const *k, u64 dg[], u8 ctr[], + * int rounds, u8 tag) + */ +ENTRY(pmull_gcm_decrypt) + pmull_gcm_do_crypt 0 +ENDPROC(pmull_gcm_decrypt) - enc_round KS1, v25 - pmull2 XH.1q, HH.2d, XL.2d // a1 * b1 +pmull_gcm_ghash_4x: + movi MASK.16b, #0xe1 + shl MASK.2d, MASK.2d, #57 - enc_round KS0, v26 - pmull XL.1q, HH.1d, XL.1d // a0 * b0 + rev64 T1.16b, INP0.16b + rev64 T2.16b, INP1.16b + rev64 TT3.16b, INP2.16b + rev64 TT4.16b, INP3.16b - enc_round KS1, v26 - pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0) + ext XL.16b, XL.16b, XL.16b, #8 - enc_round KS0, v27 - eor XL.16b, XL.16b, XL2.16b - eor XH.16b, XH.16b, XH2.16b + tbz w9, #2, 0f // <4 blocks? + .subsection 1 +0: movi XH2.16b, #0 + movi XM2.16b, #0 + movi XL2.16b, #0 - enc_round KS1, v27 - eor XM.16b, XM.16b, XM2.16b - ext T1.16b, XL.16b, XH.16b, #8 + tbz w9, #0, 1f // 2 blocks? + tbz w9, #1, 2f // 1 block? - enc_round KS0, v28 - eor T2.16b, XL.16b, XH.16b - eor XM.16b, XM.16b, T1.16b + eor T2.16b, T2.16b, XL.16b + ext T1.16b, T2.16b, T2.16b, #8 + b .Lgh3 - enc_round KS1, v28 - eor XM.16b, XM.16b, T2.16b +1: eor TT3.16b, TT3.16b, XL.16b + ext T2.16b, TT3.16b, TT3.16b, #8 + b .Lgh2 - enc_round KS0, v29 - pmull T2.1q, XL.1d, MASK.1d +2: eor TT4.16b, TT4.16b, XL.16b + ext IN1.16b, TT4.16b, TT4.16b, #8 + b .Lgh1 + .previous - enc_round KS1, v29 - mov XH.d[0], XM.d[1] - mov XM.d[1], XL.d[0] + eor T1.16b, T1.16b, XL.16b + ext IN1.16b, T1.16b, T1.16b, #8 - aese KS0.16b, v30.16b - eor XL.16b, XM.16b, T2.16b + pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1 + eor T1.16b, T1.16b, IN1.16b + pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0 + pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) - aese KS1.16b, v30.16b - ext T2.16b, XL.16b, XL.16b, #8 + ext T1.16b, T2.16b, T2.16b, #8 +.Lgh3: eor T2.16b, T2.16b, T1.16b + pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1 + pmull XL.1q, HH3.1d, T1.1d // a0 * b0 + pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) - eor KS0.16b, KS0.16b, v31.16b - pmull XL.1q, XL.1d, MASK.1d - eor T2.16b, T2.16b, XH.16b + eor XH2.16b, XH2.16b, XH.16b + eor XL2.16b, XL2.16b, XL.16b + eor XM2.16b, XM2.16b, XM.16b - eor KS1.16b, KS1.16b, v31.16b - eor XL.16b, XL.16b, T2.16b + ext T2.16b, TT3.16b, TT3.16b, #8 +.Lgh2: eor TT3.16b, TT3.16b, T2.16b + pmull2 XH.1q, HH.2d, T2.2d // a1 * b1 + pmull XL.1q, HH.1d, T2.1d // a0 * b0 + pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) - .if \enc == 0 - eor INP0.16b, INP0.16b, KS0.16b - eor INP1.16b, INP1.16b, KS1.16b - .endif + eor XH2.16b, XH2.16b, XH.16b + eor XL2.16b, XL2.16b, XL.16b + eor XM2.16b, XM2.16b, XM.16b - st1 {INP0.16b-INP1.16b}, [x2], #32 + ext IN1.16b, TT4.16b, TT4.16b, #8 +.Lgh1: eor TT4.16b, TT4.16b, IN1.16b + pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0 + pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1 + pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) - cbnz w0, 0b + eor XH.16b, XH.16b, XH2.16b + eor XL.16b, XL.16b, XL2.16b + eor XM.16b, XM.16b, XM2.16b -CPU_LE( rev x8, x8 ) - st1 {XL.2d}, [x1] - str x8, [x5, #8] // store lower counter + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b - .if \enc == 1 - st1 {KS0.16b-KS1.16b}, [x10] - .endif + __pmull_reduce_p64 + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b ret +ENDPROC(pmull_gcm_ghash_4x) + +pmull_gcm_enc_4x: + ld1 {KS0.16b}, [x5] // load upper counter + sub w10, w8, #4 + sub w11, w8, #3 + sub w12, w8, #2 + sub w13, w8, #1 + rev w10, w10 + rev w11, w11 + rev w12, w12 + rev w13, w13 + mov KS1.16b, KS0.16b + mov KS2.16b, KS0.16b + mov KS3.16b, KS0.16b + ins KS0.s[3], w10 // set lower counter + ins KS1.s[3], w11 + ins KS2.s[3], w12 + ins KS3.s[3], w13 + + add x10, x6, #96 // round key pointer + ld1 {K6.4s-K7.4s}, [x10], #32 + .irp key, K0, K1, K2, K3, K4, K5 + enc_qround KS0, KS1, KS2, KS3, \key + .endr -2: b.eq 3f // AES-192? - enc_round KS0, v17 - enc_round KS1, v17 - enc_round KS0, v18 - enc_round KS1, v18 -3: enc_round KS0, v19 - enc_round KS1, v19 - enc_round KS0, v20 - enc_round KS1, v20 - b 1b + tbnz x7, #2, .Lnot128 + .subsection 1 +.Lnot128: + ld1 {K8.4s-K9.4s}, [x10], #32 + .irp key, K6, K7 + enc_qround KS0, KS1, KS2, KS3, \key + .endr + ld1 {K6.4s-K7.4s}, [x10] + .irp key, K8, K9 + enc_qround KS0, KS1, KS2, KS3, \key + .endr + tbz x7, #1, .Lout192 + b .Lout256 + .previous -4: load_round_keys w7, x6 - b 0b - .endm +.Lout256: + .irp key, K6, K7 + enc_qround KS0, KS1, KS2, KS3, \key + .endr - /* - * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], - * struct ghash_key const *k, u8 ctr[], - * int rounds, u8 ks[]) - */ -ENTRY(pmull_gcm_encrypt) - pmull_gcm_do_crypt 1 -ENDPROC(pmull_gcm_encrypt) +.Lout192: + enc_qround KS0, KS1, KS2, KS3, KK - /* - * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], - * struct ghash_key const *k, u8 ctr[], - * int rounds) - */ -ENTRY(pmull_gcm_decrypt) - pmull_gcm_do_crypt 0 -ENDPROC(pmull_gcm_decrypt) + aese KS0.16b, KL.16b + aese KS1.16b, KL.16b + aese KS2.16b, KL.16b + aese KS3.16b, KL.16b + + eor KS0.16b, KS0.16b, KM.16b + eor KS1.16b, KS1.16b, KM.16b + eor KS2.16b, KS2.16b, KM.16b + eor KS3.16b, KS3.16b, KM.16b + + eor INP0.16b, INP0.16b, KS0.16b + eor INP1.16b, INP1.16b, KS1.16b + eor INP2.16b, INP2.16b, KS2.16b + eor INP3.16b, INP3.16b, KS3.16b - /* - * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds) - */ -ENTRY(pmull_gcm_encrypt_block) - cbz x2, 0f - load_round_keys w3, x2 -0: ld1 {v0.16b}, [x1] - enc_block v0, w3 - st1 {v0.16b}, [x0] ret -ENDPROC(pmull_gcm_encrypt_block) +ENDPROC(pmull_gcm_enc_4x) + + .section ".rodata", "a" + .align 6 +.Lpermute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .previous diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 70b1469783f9..522cf004ce65 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -58,17 +58,15 @@ asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, struct ghash_key const *k, const char *head); -asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], struct ghash_key const *k, +asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[], + struct ghash_key const *k, u64 dg[], u8 ctr[], u32 const rk[], int rounds, - u8 ks[]); + u8 tag[]); -asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], struct ghash_key const *k, - u8 ctr[], u32 const rk[], int rounds); - -asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], - u32 const rk[], int rounds); +asmlinkage void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[], + struct ghash_key const *k, u64 dg[], + u8 ctr[], u32 const rk[], int rounds, + u8 tag[]); static int ghash_init(struct shash_desc *desc) { @@ -85,7 +83,7 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, struct ghash_key const *k, const char *head)) { - if (likely(crypto_simd_usable())) { + if (likely(crypto_simd_usable() && simd_update)) { kernel_neon_begin(); simd_update(blocks, dg, src, key, head); kernel_neon_end(); @@ -398,136 +396,112 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) } } -static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx, - u64 dg[], u8 tag[], int cryptlen) -{ - u8 mac[AES_BLOCK_SIZE]; - u128 lengths; - - lengths.a = cpu_to_be64(req->assoclen * 8); - lengths.b = cpu_to_be64(cryptlen * 8); - - ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL, - pmull_ghash_update_p64); - - put_unaligned_be64(dg[1], mac); - put_unaligned_be64(dg[0], mac + 8); - - crypto_xor(tag, mac, AES_BLOCK_SIZE); -} - static int gcm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + int nrounds = num_rounds(&ctx->aes_key); struct skcipher_walk walk; + u8 buf[AES_BLOCK_SIZE]; u8 iv[AES_BLOCK_SIZE]; - u8 ks[2 * AES_BLOCK_SIZE]; - u8 tag[AES_BLOCK_SIZE]; u64 dg[2] = {}; - int nrounds = num_rounds(&ctx->aes_key); + u128 lengths; + u8 *tag; int err; + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64(req->cryptlen * 8); + if (req->assoclen) gcm_calculate_auth_mac(req, dg); memcpy(iv, req->iv, GCM_IV_SIZE); - put_unaligned_be32(1, iv + GCM_IV_SIZE); + put_unaligned_be32(2, iv + GCM_IV_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, false); - if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) { - u32 const *rk = NULL; - - kernel_neon_begin(); - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); - put_unaligned_be32(2, iv + GCM_IV_SIZE); - pmull_gcm_encrypt_block(ks, iv, NULL, nrounds); - put_unaligned_be32(3, iv + GCM_IV_SIZE); - pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds); - put_unaligned_be32(4, iv + GCM_IV_SIZE); - + if (likely(crypto_simd_usable())) { do { - int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + int nbytes = walk.nbytes; + + tag = (u8 *)&lengths; - if (rk) - kernel_neon_begin(); + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) { + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + } else if (nbytes < walk.total) { + nbytes &= ~(AES_BLOCK_SIZE - 1); + tag = NULL; + } - pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, &ctx->ghash_key, - iv, rk, nrounds, ks); + kernel_neon_begin(); + pmull_gcm_encrypt(nbytes, dst, src, &ctx->ghash_key, dg, + iv, ctx->aes_key.key_enc, nrounds, + tag); kernel_neon_end(); - err = skcipher_walk_done(&walk, - walk.nbytes % (2 * AES_BLOCK_SIZE)); + if (unlikely(!nbytes)) + break; - rk = ctx->aes_key.key_enc; - } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); - } else { - aes_encrypt(&ctx->aes_key, tag, iv); - put_unaligned_be32(2, iv + GCM_IV_SIZE); + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); - while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) { - const int blocks = - walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } while (walk.nbytes); + } else { + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - u8 *src = walk.src.virt.addr; int remaining = blocks; do { - aes_encrypt(&ctx->aes_key, ks, iv); - crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); + aes_encrypt(&ctx->aes_key, buf, iv); + crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); dst += AES_BLOCK_SIZE; src += AES_BLOCK_SIZE; } while (--remaining > 0); - ghash_do_update(blocks, dg, - walk.dst.virt.addr, &ctx->ghash_key, - NULL, pmull_ghash_update_p64); + ghash_do_update(blocks, dg, walk.dst.virt.addr, + &ctx->ghash_key, NULL, NULL); err = skcipher_walk_done(&walk, - walk.nbytes % (2 * AES_BLOCK_SIZE)); - } - if (walk.nbytes) { - aes_encrypt(&ctx->aes_key, ks, iv); - if (walk.nbytes > AES_BLOCK_SIZE) { - crypto_inc(iv, AES_BLOCK_SIZE); - aes_encrypt(&ctx->aes_key, ks + AES_BLOCK_SIZE, iv); - } + walk.nbytes % AES_BLOCK_SIZE); } - } - /* handle the tail */ - if (walk.nbytes) { - u8 buf[GHASH_BLOCK_SIZE]; - unsigned int nbytes = walk.nbytes; - u8 *dst = walk.dst.virt.addr; - u8 *head = NULL; + /* handle the tail */ + if (walk.nbytes) { + aes_encrypt(&ctx->aes_key, buf, iv); - crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, - walk.nbytes); + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, + buf, walk.nbytes); - if (walk.nbytes > GHASH_BLOCK_SIZE) { - head = dst; - dst += GHASH_BLOCK_SIZE; - nbytes %= GHASH_BLOCK_SIZE; + memcpy(buf, walk.dst.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes); } - memcpy(buf, dst, nbytes); - memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); - ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head, - pmull_ghash_update_p64); + tag = (u8 *)&lengths; + ghash_do_update(1, dg, tag, &ctx->ghash_key, + walk.nbytes ? buf : NULL, NULL); - err = skcipher_walk_done(&walk, 0); + if (walk.nbytes) + err = skcipher_walk_done(&walk, 0); + + put_unaligned_be64(dg[1], tag); + put_unaligned_be64(dg[0], tag + 8); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + aes_encrypt(&ctx->aes_key, iv, iv); + crypto_xor(tag, iv, AES_BLOCK_SIZE); } if (err) return err; - gcm_final(req, ctx, dg, tag, req->cryptlen); - /* copy authtag to end of dst */ scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); @@ -540,75 +514,65 @@ static int gcm_decrypt(struct aead_request *req) struct crypto_aead *aead = crypto_aead_reqtfm(req); struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); unsigned int authsize = crypto_aead_authsize(aead); + int nrounds = num_rounds(&ctx->aes_key); struct skcipher_walk walk; - u8 iv[2 * AES_BLOCK_SIZE]; - u8 tag[AES_BLOCK_SIZE]; - u8 buf[2 * GHASH_BLOCK_SIZE]; + u8 buf[AES_BLOCK_SIZE]; + u8 iv[AES_BLOCK_SIZE]; u64 dg[2] = {}; - int nrounds = num_rounds(&ctx->aes_key); + u128 lengths; + u8 *tag; int err; + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8); + if (req->assoclen) gcm_calculate_auth_mac(req, dg); memcpy(iv, req->iv, GCM_IV_SIZE); - put_unaligned_be32(1, iv + GCM_IV_SIZE); + put_unaligned_be32(2, iv + GCM_IV_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, false); - if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) { - u32 const *rk = NULL; - - kernel_neon_begin(); - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); - put_unaligned_be32(2, iv + GCM_IV_SIZE); - + if (likely(crypto_simd_usable())) { do { - int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; - int rem = walk.total - blocks * AES_BLOCK_SIZE; - - if (rk) - kernel_neon_begin(); - - pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, &ctx->ghash_key, - iv, rk, nrounds); - - /* check if this is the final iteration of the loop */ - if (rem < (2 * AES_BLOCK_SIZE)) { - u8 *iv2 = iv + AES_BLOCK_SIZE; - - if (rem > AES_BLOCK_SIZE) { - memcpy(iv2, iv, AES_BLOCK_SIZE); - crypto_inc(iv2, AES_BLOCK_SIZE); - } + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + int nbytes = walk.nbytes; - pmull_gcm_encrypt_block(iv, iv, NULL, nrounds); + tag = (u8 *)&lengths; - if (rem > AES_BLOCK_SIZE) - pmull_gcm_encrypt_block(iv2, iv2, NULL, - nrounds); + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) { + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + } else if (nbytes < walk.total) { + nbytes &= ~(AES_BLOCK_SIZE - 1); + tag = NULL; } + kernel_neon_begin(); + pmull_gcm_decrypt(nbytes, dst, src, &ctx->ghash_key, dg, + iv, ctx->aes_key.key_enc, nrounds, + tag); kernel_neon_end(); - err = skcipher_walk_done(&walk, - walk.nbytes % (2 * AES_BLOCK_SIZE)); + if (unlikely(!nbytes)) + break; - rk = ctx->aes_key.key_enc; - } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); - } else { - aes_encrypt(&ctx->aes_key, tag, iv); - put_unaligned_be32(2, iv + GCM_IV_SIZE); + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); - while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) { - int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } while (walk.nbytes); + } else { + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - u8 *src = walk.src.virt.addr; ghash_do_update(blocks, dg, walk.src.virt.addr, - &ctx->ghash_key, NULL, - pmull_ghash_update_p64); + &ctx->ghash_key, NULL, NULL); do { aes_encrypt(&ctx->aes_key, buf, iv); @@ -620,49 +584,38 @@ static int gcm_decrypt(struct aead_request *req) } while (--blocks > 0); err = skcipher_walk_done(&walk, - walk.nbytes % (2 * AES_BLOCK_SIZE)); + walk.nbytes % AES_BLOCK_SIZE); } - if (walk.nbytes) { - if (walk.nbytes > AES_BLOCK_SIZE) { - u8 *iv2 = iv + AES_BLOCK_SIZE; - - memcpy(iv2, iv, AES_BLOCK_SIZE); - crypto_inc(iv2, AES_BLOCK_SIZE); - aes_encrypt(&ctx->aes_key, iv2, iv2); - } - aes_encrypt(&ctx->aes_key, iv, iv); + /* handle the tail */ + if (walk.nbytes) { + memcpy(buf, walk.src.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes); } - } - /* handle the tail */ - if (walk.nbytes) { - const u8 *src = walk.src.virt.addr; - const u8 *head = NULL; - unsigned int nbytes = walk.nbytes; + tag = (u8 *)&lengths; + ghash_do_update(1, dg, tag, &ctx->ghash_key, + walk.nbytes ? buf : NULL, NULL); - if (walk.nbytes > GHASH_BLOCK_SIZE) { - head = src; - src += GHASH_BLOCK_SIZE; - nbytes %= GHASH_BLOCK_SIZE; - } + if (walk.nbytes) { + aes_encrypt(&ctx->aes_key, buf, iv); - memcpy(buf, src, nbytes); - memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); - ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head, - pmull_ghash_update_p64); + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, + buf, walk.nbytes); - crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, - walk.nbytes); + err = skcipher_walk_done(&walk, 0); + } - err = skcipher_walk_done(&walk, 0); + put_unaligned_be64(dg[1], tag); + put_unaligned_be64(dg[0], tag + 8); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + aes_encrypt(&ctx->aes_key, iv, iv); + crypto_xor(tag, iv, AES_BLOCK_SIZE); } if (err) return err; - gcm_final(req, ctx, dg, tag, req->cryptlen - authsize); - /* compare calculated auth tag with the stored one */ scatterwalk_map_and_copy(buf, req->src, req->assoclen + req->cryptlen - authsize, @@ -675,7 +628,7 @@ static int gcm_decrypt(struct aead_request *req) static struct aead_alg gcm_aes_alg = { .ivsize = GCM_IV_SIZE, - .chunksize = 2 * AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, .maxauthsize = AES_BLOCK_SIZE, .setkey = gcm_setkey, .setauthsize = gcm_setauthsize, diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl new file mode 100644 index 000000000000..6e5576d19af8 --- /dev/null +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -0,0 +1,913 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# ThunderX2 1.17/+95% 1.36 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; +$output=shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif + +.text + +// forward "declarations" are required for Apple +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] +#endif + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 +#ifdef __AARCH64EB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + mov w#$s1,#-1 + stp $r0,$r1,[$ctx,#32] // save key value + str w#$s1,[$ctx,#48] // impossible key power value + +#ifndef __KERNEL__ + tst w17,#ARMV7_NEON + + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + + csel $d0,$d0,$r0,eq + +# ifdef __ILP32__ + stp w#$d0,w#$d1,[$len] +# else + stp $d0,$d1,[$len] +# endif +#endif + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] + ldp $r0,$r1,[$ctx,#32] // load key value + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp x17,#0 // is_base2_26? + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __AARCH64EB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] + ldp $t0,$t1,[$nonce] // load nonce + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp $r0,#0 // is_base2_26? + csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __AARCH64EB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __AARCH64EB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 4 +poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +#ifdef __KERNEL__ +.globl poly1305_blocks_neon +#endif +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.lo .Lpoly1305_blocks + + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + b .Leven_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + +.Linit_neon: + ldr w17,[$ctx,#48] // first table element + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cmp w17,#-1 // is value impossible? + b.ne .Leven_neon + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + sub $ctx,$ctx,#4 + bl poly1305_splat + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$inp,#32] // inp[2:3] + subs $len,$len,#64 + ldp x9,x13,[$inp,#48] + add $in2,$inp,#96 + adr $zeros,.Lzeros + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 {$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal $ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal $ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 $ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 $ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + ldr x30,[sp,#8] + .inst 0xd50323bf // autiasp + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + mov x4,#1 + st1 {$ACC4}[0],[$ctx] + str x4,[$ctx,#8] // set is_base2_26 + + ldr x29,[sp],#80 + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.align 2 +#if !defined(__KERNEL__) && !defined(_WIN64) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +___ + +foreach (split("\n",$code)) { + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + s/w#x([0-9]+)/w$1/g; + + print $_,"\n"; +} +close STDOUT; diff --git a/arch/arm64/crypto/poly1305-core.S_shipped b/arch/arm64/crypto/poly1305-core.S_shipped new file mode 100644 index 000000000000..8d1c4e420ccd --- /dev/null +++ b/arch/arm64/crypto/poly1305-core.S_shipped @@ -0,0 +1,835 @@ +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif + +.text + +// forward "declarations" are required for Apple +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp x1,xzr + stp xzr,xzr,[x0] // zero hash value + stp xzr,xzr,[x0,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] +#endif + + ldp x7,x8,[x1] // load key + mov x9,#0xfffffffc0fffffff + movk x9,#0x0fff,lsl#48 +#ifdef __AARCH64EB__ + rev x7,x7 // flip bytes + rev x8,x8 +#endif + and x7,x7,x9 // &=0ffffffc0fffffff + and x9,x9,#-4 + and x8,x8,x9 // &=0ffffffc0ffffffc + mov w9,#-1 + stp x7,x8,[x0,#32] // save key value + str w9,[x0,#48] // impossible key power value + +#ifndef __KERNEL__ + tst w17,#ARMV7_NEON + + adr x12,.Lpoly1305_blocks + adr x7,.Lpoly1305_blocks_neon + adr x13,.Lpoly1305_emit + + csel x12,x12,x7,eq + +# ifdef __ILP32__ + stp w12,w13,[x2] +# else + stp x12,x13,[x2] +# endif +#endif + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + ands x2,x2,#-16 + b.eq .Lno_data + + ldp x4,x5,[x0] // load hash value + ldp x6,x17,[x0,#16] // [along with is_base2_26] + ldp x7,x8,[x0,#32] // load key value + +#ifdef __AARCH64EB__ + lsr x12,x4,#32 + mov w13,w4 + lsr x14,x5,#32 + mov w15,w5 + lsr x16,x6,#32 +#else + mov w12,w4 + lsr x13,x4,#32 + mov w14,w5 + lsr x15,x5,#32 + mov w16,w6 +#endif + + add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64 + lsr x13,x14,#12 + adds x12,x12,x14,lsl#52 + add x13,x13,x15,lsl#14 + adc x13,x13,xzr + lsr x14,x16,#24 + adds x13,x13,x16,lsl#40 + adc x14,x14,xzr + + cmp x17,#0 // is_base2_26? + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + csel x4,x4,x12,eq // choose between radixes + csel x5,x5,x13,eq + csel x6,x6,x14,eq + +.Loop: + ldp x10,x11,[x1],#16 // load input + sub x2,x2,#16 +#ifdef __AARCH64EB__ + rev x10,x10 + rev x11,x11 +#endif + adds x4,x4,x10 // accumulate input + adcs x5,x5,x11 + + mul x12,x4,x7 // h0*r0 + adc x6,x6,x3 + umulh x13,x4,x7 + + mul x10,x5,x9 // h1*5*r1 + umulh x11,x5,x9 + + adds x12,x12,x10 + mul x10,x4,x8 // h0*r1 + adc x13,x13,x11 + umulh x14,x4,x8 + + adds x13,x13,x10 + mul x10,x5,x7 // h1*r0 + adc x14,x14,xzr + umulh x11,x5,x7 + + adds x13,x13,x10 + mul x10,x6,x9 // h2*5*r1 + adc x14,x14,x11 + mul x11,x6,x7 // h2*r0 + + adds x13,x13,x10 + adc x14,x14,x11 + + and x10,x14,#-4 // final reduction + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x12,x10 + adcs x5,x13,xzr + adc x6,x6,xzr + + cbnz x2,.Loop + + stp x4,x5,[x0] // store hash value + stp x6,xzr,[x0,#16] // [and clear is_base2_26] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + ldp x4,x5,[x0] // load hash base 2^64 + ldp x6,x7,[x0,#16] // [along with is_base2_26] + ldp x10,x11,[x2] // load nonce + +#ifdef __AARCH64EB__ + lsr x12,x4,#32 + mov w13,w4 + lsr x14,x5,#32 + mov w15,w5 + lsr x16,x6,#32 +#else + mov w12,w4 + lsr x13,x4,#32 + mov w14,w5 + lsr x15,x5,#32 + mov w16,w6 +#endif + + add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64 + lsr x13,x14,#12 + adds x12,x12,x14,lsl#52 + add x13,x13,x15,lsl#14 + adc x13,x13,xzr + lsr x14,x16,#24 + adds x13,x13,x16,lsl#40 + adc x14,x14,xzr + + cmp x7,#0 // is_base2_26? + csel x4,x4,x12,eq // choose between radixes + csel x5,x5,x13,eq + csel x6,x6,x14,eq + + adds x12,x4,#5 // compare to modulus + adcs x13,x5,xzr + adc x14,x6,xzr + + tst x14,#-4 // see if it's carried/borrowed + + csel x4,x4,x12,eq + csel x5,x5,x13,eq + +#ifdef __AARCH64EB__ + ror x10,x10,#32 // flip nonce words + ror x11,x11,#32 +#endif + adds x4,x4,x10 // accumulate nonce + adc x5,x5,x11 +#ifdef __AARCH64EB__ + rev x4,x4 // flip output bytes + rev x5,x5 +#endif + stp x4,x5,[x1] // write result + + ret +.size poly1305_emit,.-poly1305_emit +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul x12,x4,x7 // h0*r0 + umulh x13,x4,x7 + + mul x10,x5,x9 // h1*5*r1 + umulh x11,x5,x9 + + adds x12,x12,x10 + mul x10,x4,x8 // h0*r1 + adc x13,x13,x11 + umulh x14,x4,x8 + + adds x13,x13,x10 + mul x10,x5,x7 // h1*r0 + adc x14,x14,xzr + umulh x11,x5,x7 + + adds x13,x13,x10 + mul x10,x6,x9 // h2*5*r1 + adc x14,x14,x11 + mul x11,x6,x7 // h2*r0 + + adds x13,x13,x10 + adc x14,x14,x11 + + and x10,x14,#-4 // final reduction + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x12,x10 + adcs x5,x13,xzr + adc x6,x6,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 4 +poly1305_splat: + and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,x4,#26,#26 + extr x14,x5,x4,#52 + and x14,x14,#0x03ffffff + ubfx x15,x5,#14,#26 + extr x16,x6,x5,#40 + + str w12,[x0,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[x0,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[x0,#16*2] // s1 + str w14,[x0,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[x0,#16*4] // s2 + str w15,[x0,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[x0,#16*6] // s3 + str w16,[x0,#16*7] // r4 + str w15,[x0,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +#ifdef __KERNEL__ +.globl poly1305_blocks_neon +#endif +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr x17,[x0,#24] + cmp x2,#128 + b.lo .Lpoly1305_blocks + + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + cbz x17,.Lbase2_64_neon + + ldp w10,w11,[x0] // load hash value base 2^26 + ldp w12,w13,[x0,#8] + ldr w14,[x0,#16] + + tst x2,#31 + b.eq .Leven_neon + + ldp x7,x8,[x0,#32] // load key value + + add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr x5,x12,#12 + adds x4,x4,x12,lsl#52 + add x5,x5,x13,lsl#14 + adc x5,x5,xzr + lsr x6,x14,#24 + adds x5,x5,x14,lsl#40 + adc x14,x6,xzr // can be partially reduced... + + ldp x12,x13,[x1],#16 // load input + sub x2,x2,#16 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + +#ifdef __AARCH64EB__ + rev x12,x12 + rev x13,x13 +#endif + adds x4,x4,x12 // accumulate input + adcs x5,x5,x13 + adc x6,x6,x3 + + bl poly1305_mult + + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,x4,#26,#26 + extr x12,x5,x4,#52 + and x12,x12,#0x03ffffff + ubfx x13,x5,#14,#26 + extr x14,x6,x5,#40 + + b .Leven_neon + +.align 4 +.Lbase2_64_neon: + ldp x7,x8,[x0,#32] // load key value + + ldp x4,x5,[x0] // load hash value base 2^64 + ldr x6,[x0,#16] + + tst x2,#31 + b.eq .Linit_neon + + ldp x12,x13,[x1],#16 // load input + sub x2,x2,#16 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __AARCH64EB__ + rev x12,x12 + rev x13,x13 +#endif + adds x4,x4,x12 // accumulate input + adcs x5,x5,x13 + adc x6,x6,x3 + + bl poly1305_mult + +.Linit_neon: + ldr w17,[x0,#48] // first table element + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,x4,#26,#26 + extr x12,x5,x4,#52 + and x12,x12,#0x03ffffff + ubfx x13,x5,#14,#26 + extr x14,x6,x5,#40 + + cmp w17,#-1 // is value impossible? + b.ne .Leven_neon + + fmov d24,x10 + fmov d25,x11 + fmov d26,x12 + fmov d27,x13 + fmov d28,x14 + + ////////////////////////////////// initialize r^n table + mov x4,x7 // r^1 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + mov x5,x8 + mov x6,xzr + add x0,x0,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub x0,x0,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub x0,x0,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + sub x0,x0,#4 + bl poly1305_splat + sub x0,x0,#48 // restore original x0 + b .Ldo_neon + +.align 4 +.Leven_neon: + fmov d24,x10 + fmov d25,x11 + fmov d26,x12 + fmov d27,x13 + fmov d28,x14 + +.Ldo_neon: + ldp x8,x12,[x1,#32] // inp[2:3] + subs x2,x2,#64 + ldp x9,x13,[x1,#48] + add x16,x1,#96 + adr x17,.Lzeros + + lsl x3,x3,#24 + add x15,x0,#48 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov d14,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,x3,x12,lsr#40 + add x13,x3,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov d15,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov d16,x8 + fmov d17,x10 + fmov d18,x12 + + ldp x8,x12,[x1],#16 // inp[0:1] + ldp x9,x13,[x1],#48 + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 + ld1 {v8.4s},[x15] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov d9,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,x3,x12,lsr#40 + add x13,x3,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov d10,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi v31.2d,#-1 + fmov d11,x8 + fmov d12,x10 + fmov d13,x12 + ushr v31.2d,v31.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // ___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // ___________________/ ____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs x2,x2,#64 + umull v23.2d,v14.2s,v7.s[2] + csel x16,x17,x16,lo + umull v22.2d,v14.2s,v5.s[2] + umull v21.2d,v14.2s,v3.s[2] + ldp x8,x12,[x16],#16 // inp[2:3] (or zero) + umull v20.2d,v14.2s,v1.s[2] + ldp x9,x13,[x16],#48 + umull v19.2d,v14.2s,v0.s[2] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal v23.2d,v15.2s,v5.s[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal v22.2d,v15.2s,v3.s[2] + and x5,x9,#0x03ffffff + umlal v21.2d,v15.2s,v1.s[2] + ubfx x6,x8,#26,#26 + umlal v20.2d,v15.2s,v0.s[2] + ubfx x7,x9,#26,#26 + umlal v19.2d,v15.2s,v8.s[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal v23.2d,v16.2s,v3.s[2] + extr x8,x12,x8,#52 + umlal v22.2d,v16.2s,v1.s[2] + extr x9,x13,x9,#52 + umlal v21.2d,v16.2s,v0.s[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal v20.2d,v16.2s,v8.s[2] + fmov d14,x4 + umlal v19.2d,v16.2s,v6.s[2] + and x8,x8,#0x03ffffff + + umlal v23.2d,v17.2s,v1.s[2] + and x9,x9,#0x03ffffff + umlal v22.2d,v17.2s,v0.s[2] + ubfx x10,x12,#14,#26 + umlal v21.2d,v17.2s,v8.s[2] + ubfx x11,x13,#14,#26 + umlal v20.2d,v17.2s,v6.s[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal v19.2d,v17.2s,v4.s[2] + fmov d15,x6 + + add v11.2s,v11.2s,v26.2s + add x12,x3,x12,lsr#40 + umlal v23.2d,v18.2s,v0.s[2] + add x13,x3,x13,lsr#40 + umlal v22.2d,v18.2s,v8.s[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal v21.2d,v18.2s,v6.s[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal v20.2d,v18.2s,v4.s[2] + fmov d16,x8 + umlal v19.2d,v18.2s,v2.s[2] + fmov d17,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add v9.2s,v9.2s,v24.2s + fmov d18,x12 + umlal v22.2d,v11.2s,v1.s[0] + ldp x8,x12,[x1],#16 // inp[0:1] + umlal v19.2d,v11.2s,v6.s[0] + ldp x9,x13,[x1],#48 + umlal v23.2d,v11.2s,v3.s[0] + umlal v20.2d,v11.2s,v8.s[0] + umlal v21.2d,v11.2s,v0.s[0] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add v10.2s,v10.2s,v25.2s + umlal v22.2d,v9.2s,v5.s[0] + umlal v23.2d,v9.2s,v7.s[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal v21.2d,v9.2s,v3.s[0] + and x5,x9,#0x03ffffff + umlal v19.2d,v9.2s,v0.s[0] + ubfx x6,x8,#26,#26 + umlal v20.2d,v9.2s,v1.s[0] + ubfx x7,x9,#26,#26 + + add v12.2s,v12.2s,v27.2s + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal v22.2d,v10.2s,v3.s[0] + extr x8,x12,x8,#52 + umlal v23.2d,v10.2s,v5.s[0] + extr x9,x13,x9,#52 + umlal v19.2d,v10.2s,v8.s[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal v21.2d,v10.2s,v1.s[0] + fmov d9,x4 + umlal v20.2d,v10.2s,v0.s[0] + and x8,x8,#0x03ffffff + + add v13.2s,v13.2s,v28.2s + and x9,x9,#0x03ffffff + umlal v22.2d,v12.2s,v0.s[0] + ubfx x10,x12,#14,#26 + umlal v19.2d,v12.2s,v4.s[0] + ubfx x11,x13,#14,#26 + umlal v23.2d,v12.2s,v1.s[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal v20.2d,v12.2s,v6.s[0] + fmov d10,x6 + umlal v21.2d,v12.2s,v8.s[0] + add x12,x3,x12,lsr#40 + + umlal v22.2d,v13.2s,v8.s[0] + add x13,x3,x13,lsr#40 + umlal v19.2d,v13.2s,v2.s[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal v23.2d,v13.2s,v0.s[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal v20.2d,v13.2s,v4.s[0] + fmov d11,x8 + umlal v21.2d,v13.2s,v6.s[0] + fmov d12,x10 + fmov d13,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr v29.2d,v22.2d,#26 + xtn v27.2s,v22.2d + ushr v30.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + add v23.2d,v23.2d,v29.2d // h3 -> h4 + bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff + add v20.2d,v20.2d,v30.2d // h0 -> h1 + + ushr v29.2d,v23.2d,#26 + xtn v28.2s,v23.2d + ushr v30.2d,v20.2d,#26 + xtn v25.2s,v20.2d + bic v28.2s,#0xfc,lsl#24 + add v21.2d,v21.2d,v30.2d // h1 -> h2 + + add v19.2d,v19.2d,v29.2d + shl v29.2d,v29.2d,#2 + shrn v30.2s,v21.2d,#26 + xtn v26.2s,v21.2d + add v19.2d,v19.2d,v29.2d // h4 -> h0 + bic v25.2s,#0xfc,lsl#24 + add v27.2s,v27.2s,v30.2s // h2 -> h3 + bic v26.2s,#0xfc,lsl#24 + + shrn v29.2s,v19.2d,#26 + xtn v24.2s,v19.2d + ushr v30.2s,v27.2s,#26 + bic v27.2s,#0xfc,lsl#24 + bic v24.2s,#0xfc,lsl#24 + add v25.2s,v25.2s,v29.2s // h0 -> h1 + add v28.2s,v28.2s,v30.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup v16.2d,v16.d[0] + add v11.2s,v11.2s,v26.2s + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds x2,x2,#32 + b.ne .Long_tail + + dup v16.2d,v11.d[0] + add v14.2s,v9.2s,v24.2s + add v17.2s,v12.2s,v27.2s + add v15.2s,v10.2s,v25.2s + add v18.2s,v13.2s,v28.2s + +.Long_tail: + dup v14.2d,v14.d[0] + umull2 v19.2d,v16.4s,v6.4s + umull2 v22.2d,v16.4s,v1.4s + umull2 v23.2d,v16.4s,v3.4s + umull2 v21.2d,v16.4s,v0.4s + umull2 v20.2d,v16.4s,v8.4s + + dup v15.2d,v15.d[0] + umlal2 v19.2d,v14.4s,v0.4s + umlal2 v21.2d,v14.4s,v3.4s + umlal2 v22.2d,v14.4s,v5.4s + umlal2 v23.2d,v14.4s,v7.4s + umlal2 v20.2d,v14.4s,v1.4s + + dup v17.2d,v17.d[0] + umlal2 v19.2d,v15.4s,v8.4s + umlal2 v22.2d,v15.4s,v3.4s + umlal2 v21.2d,v15.4s,v1.4s + umlal2 v23.2d,v15.4s,v5.4s + umlal2 v20.2d,v15.4s,v0.4s + + dup v18.2d,v18.d[0] + umlal2 v22.2d,v17.4s,v0.4s + umlal2 v23.2d,v17.4s,v1.4s + umlal2 v19.2d,v17.4s,v4.4s + umlal2 v20.2d,v17.4s,v6.4s + umlal2 v21.2d,v17.4s,v8.4s + + umlal2 v22.2d,v18.4s,v8.4s + umlal2 v19.2d,v18.4s,v2.4s + umlal2 v23.2d,v18.4s,v0.4s + umlal2 v20.2d,v18.4s,v4.4s + umlal2 v21.2d,v18.4s,v6.4s + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add v9.2s,v9.2s,v24.2s + umlal v22.2d,v11.2s,v1.2s + umlal v19.2d,v11.2s,v6.2s + umlal v23.2d,v11.2s,v3.2s + umlal v20.2d,v11.2s,v8.2s + umlal v21.2d,v11.2s,v0.2s + + add v10.2s,v10.2s,v25.2s + umlal v22.2d,v9.2s,v5.2s + umlal v19.2d,v9.2s,v0.2s + umlal v23.2d,v9.2s,v7.2s + umlal v20.2d,v9.2s,v1.2s + umlal v21.2d,v9.2s,v3.2s + + add v12.2s,v12.2s,v27.2s + umlal v22.2d,v10.2s,v3.2s + umlal v19.2d,v10.2s,v8.2s + umlal v23.2d,v10.2s,v5.2s + umlal v20.2d,v10.2s,v0.2s + umlal v21.2d,v10.2s,v1.2s + + add v13.2s,v13.2s,v28.2s + umlal v22.2d,v12.2s,v0.2s + umlal v19.2d,v12.2s,v4.2s + umlal v23.2d,v12.2s,v1.2s + umlal v20.2d,v12.2s,v6.2s + umlal v21.2d,v12.2s,v8.2s + + umlal v22.2d,v13.2s,v8.2s + umlal v19.2d,v13.2s,v2.2s + umlal v23.2d,v13.2s,v0.2s + umlal v20.2d,v13.2s,v4.2s + umlal v21.2d,v13.2s,v6.2s + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp v22.2d,v22.2d,v22.2d + ldp d8,d9,[sp,#16] // meet ABI requirements + addp v19.2d,v19.2d,v19.2d + ldp d10,d11,[sp,#32] + addp v23.2d,v23.2d,v23.2d + ldp d12,d13,[sp,#48] + addp v20.2d,v20.2d,v20.2d + ldp d14,d15,[sp,#64] + addp v21.2d,v21.2d,v21.2d + ldr x30,[sp,#8] + .inst 0xd50323bf // autiasp + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr v29.2d,v22.2d,#26 + and v22.16b,v22.16b,v31.16b + ushr v30.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + + add v23.2d,v23.2d,v29.2d // h3 -> h4 + add v20.2d,v20.2d,v30.2d // h0 -> h1 + + ushr v29.2d,v23.2d,#26 + and v23.16b,v23.16b,v31.16b + ushr v30.2d,v20.2d,#26 + and v20.16b,v20.16b,v31.16b + add v21.2d,v21.2d,v30.2d // h1 -> h2 + + add v19.2d,v19.2d,v29.2d + shl v29.2d,v29.2d,#2 + ushr v30.2d,v21.2d,#26 + and v21.16b,v21.16b,v31.16b + add v19.2d,v19.2d,v29.2d // h4 -> h0 + add v22.2d,v22.2d,v30.2d // h2 -> h3 + + ushr v29.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + ushr v30.2d,v22.2d,#26 + and v22.16b,v22.16b,v31.16b + add v20.2d,v20.2d,v29.2d // h0 -> h1 + add v23.2d,v23.2d,v30.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16 + mov x4,#1 + st1 {v23.s}[0],[x0] + str x4,[x0,#8] // set is_base2_26 + + ldr x29,[sp],#80 + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm" +.align 2 +#if !defined(__KERNEL__) && !defined(_WIN64) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c new file mode 100644 index 000000000000..dd843d0ee83a --- /dev/null +++ b/arch/arm64/crypto/poly1305-glue.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/jump_label.h> +#include <linux/module.h> + +asmlinkage void poly1305_init_arm64(void *state, const u8 *key); +asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +{ + poly1305_init_arm64(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(key + 16); + dctx->s[1] = get_unaligned_le32(key + 20); + dctx->s[2] = get_unaligned_le32(key + 24); + dctx->s[3] = get_unaligned_le32(key + 28); + dctx->buflen = 0; +} +EXPORT_SYMBOL(poly1305_init_arch); + +static int neon_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, + u32 len, u32 hibit, bool do_neon) +{ + if (unlikely(!dctx->sset)) { + if (!dctx->rset) { + poly1305_init_arch(dctx, src); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + if (len < POLY1305_BLOCK_SIZE) + return; + } + + len &= ~(POLY1305_BLOCK_SIZE - 1); + + if (static_branch_likely(&have_neon) && likely(do_neon)) + poly1305_blocks_neon(&dctx->h, src, len, hibit); + else + poly1305_blocks(&dctx->h, src, len, hibit); +} + +static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx, + const u8 *src, u32 len, bool do_neon) +{ + if (unlikely(dctx->buflen)) { + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + len -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + neon_poly1305_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE, 1, false); + dctx->buflen = 0; + } + } + + if (likely(len >= POLY1305_BLOCK_SIZE)) { + neon_poly1305_blocks(dctx, src, len, 1, do_neon); + src += round_down(len, POLY1305_BLOCK_SIZE); + len %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(len)) { + dctx->buflen = len; + memcpy(dctx->buf, src, len); + } +} + +static int neon_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + bool do_neon = crypto_simd_usable() && srclen > 128; + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (static_branch_likely(&have_neon) && do_neon) + kernel_neon_begin(); + neon_poly1305_do_update(dctx, src, srclen, do_neon); + if (static_branch_likely(&have_neon) && do_neon) + kernel_neon_end(); + return 0; +} + +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int nbytes) +{ + if (unlikely(dctx->buflen)) { + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + nbytes -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); + dctx->buflen = 0; + } + } + + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); + + if (static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + poly1305_blocks_neon(&dctx->h, src, len, 1); + kernel_neon_end(); + } else { + poly1305_blocks(&dctx->h, src, len, 1); + } + src += len; + nbytes %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(nbytes)) { + dctx->buflen = nbytes; + memcpy(dctx->buf, src, nbytes); + } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) +{ + __le32 digest[4]; + u64 f = 0; + + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); + } + + poly1305_emit(&dctx->h, digest, dctx->s); + + /* mac = (h + s) % (2^128) */ + f = (f >> 32) + le32_to_cpu(digest[0]); + put_unaligned_le32(f, dst); + f = (f >> 32) + le32_to_cpu(digest[1]); + put_unaligned_le32(f, dst + 4); + f = (f >> 32) + le32_to_cpu(digest[2]); + put_unaligned_le32(f, dst + 8); + f = (f >> 32) + le32_to_cpu(digest[3]); + put_unaligned_le32(f, dst + 12); + + *dctx = (struct poly1305_desc_ctx){}; +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int neon_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_arch(dctx, dst); + return 0; +} + +static struct shash_alg neon_poly1305_alg = { + .init = neon_poly1305_init, + .update = neon_poly1305_update, + .final = neon_poly1305_final, + .digestsize = POLY1305_DIGEST_SIZE, + .descsize = sizeof(struct poly1305_desc_ctx), + + .base.cra_name = "poly1305", + .base.cra_driver_name = "poly1305-neon", + .base.cra_priority = 200, + .base.cra_blocksize = POLY1305_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}; + +static int __init neon_poly1305_mod_init(void) +{ + if (!cpu_have_named_feature(ASIMD)) + return 0; + + static_branch_enable(&have_neon); + + return crypto_register_shash(&neon_poly1305_alg); +} + +static void __exit neon_poly1305_mod_exit(void) +{ + if (cpu_have_named_feature(ASIMD)) + crypto_unregister_shash(&neon_poly1305_alg); +} + +module_init(neon_poly1305_mod_init); +module_exit(neon_poly1305_mod_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 38ee1514cd9c..0b727edf4104 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -51,7 +51,7 @@ int hw_breakpoint_slots(int type) case TYPE_DATA: return get_num_wrps(); default: - pr_warning("unknown slot type: %d\n", type); + pr_warn("unknown slot type: %d\n", type); return 0; } } @@ -112,7 +112,7 @@ static u64 read_wb_reg(int reg, int n) GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val); GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val); default: - pr_warning("attempt to read from unknown breakpoint register %d\n", n); + pr_warn("attempt to read from unknown breakpoint register %d\n", n); } return val; @@ -127,7 +127,7 @@ static void write_wb_reg(int reg, int n, u64 val) GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val); GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val); default: - pr_warning("attempt to write to unknown breakpoint register %d\n", n); + pr_warn("attempt to write to unknown breakpoint register %d\n", n); } isb(); } @@ -145,7 +145,7 @@ static enum dbg_active_el debug_exception_level(int privilege) case AARCH64_BREAKPOINT_EL1: return DBG_ACTIVE_EL1; default: - pr_warning("invalid breakpoint privilege level %d\n", privilege); + pr_warn("invalid breakpoint privilege level %d\n", privilege); return -EINVAL; } } diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dc9fe879c279..ab149bcc3dc7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -345,8 +345,7 @@ void __cpu_die(unsigned int cpu) */ err = op_cpu_kill(cpu); if (err) - pr_warn("CPU%d may not have shut down cleanly: %d\n", - cpu, err); + pr_warn("CPU%d may not have shut down cleanly: %d\n", cpu, err); } /* @@ -976,8 +975,8 @@ void smp_send_stop(void) udelay(1); if (num_online_cpus() > 1) - pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(cpu_online_mask)); sdei_mask_local_cpu(); } @@ -1017,8 +1016,8 @@ void crash_smp_send_stop(void) udelay(1); if (atomic_read(&waiting_for_crash_ipi) > 0) - pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(&mask)); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(&mask)); sdei_mask_local_cpu(); } diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 009057517bdd..841a8b4ce114 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -5,6 +5,8 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> */ +#define RO_EXCEPTION_TABLE_ALIGN 8 + #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> #include <asm/kernel-pgtable.h> @@ -132,11 +134,9 @@ SECTIONS . = ALIGN(SEGMENT_ALIGN); _etext = .; /* End of text section */ - RO_DATA(PAGE_SIZE) /* everything from this point to */ - EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ - NOTES + /* everything from this point to __init_begin will be marked RO NX */ + RO_DATA(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); idmap_pg_dir = .; . += IDMAP_DIR_SIZE; @@ -212,7 +212,7 @@ SECTIONS _data = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) /* * Data written with the MMU off but read with the MMU on requires diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a9f541912289..5a3b15a14a7f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1060,6 +1060,8 @@ int arch_add_memory(int nid, u64 start, u64 size, __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); + memblock_clear_nomap(start, size); + return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S index 584bab2bace6..ac99ba0864bf 100644 --- a/arch/c6x/kernel/vmlinux.lds.S +++ b/arch/c6x/kernel/vmlinux.lds.S @@ -5,6 +5,9 @@ * Copyright (C) 2010, 2011 Texas Instruments Incorporated * Mark Salter <msalter@redhat.com> */ + +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> #include <asm/page.h> @@ -80,10 +83,7 @@ SECTIONS *(.gnu.warning) } - EXCEPTION_TABLE(16) - NOTES - - RO_DATA_SECTION(PAGE_SIZE) + RO_DATA(PAGE_SIZE) .const : { *(.const .const.* .gnu.linkonce.r.*) diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S index ae7961b973f2..2ff37beaf2bf 100644 --- a/arch/csky/kernel/vmlinux.lds.S +++ b/arch/csky/kernel/vmlinux.lds.S @@ -49,11 +49,10 @@ SECTIONS _sdata = .; - RO_DATA_SECTION(PAGE_SIZE) - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RO_DATA(PAGE_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; - NOTES EXCEPTION_TABLE(L1_CACHE_BYTES) BSS_SECTION(L1_CACHE_BYTES, PAGE_SIZE, L1_CACHE_BYTES) VBR_BASE diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 49f716c0a1df..6b1afc2f9b68 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> #include <asm/page.h> #include <asm/thread_info.h> @@ -37,9 +40,7 @@ SECTIONS #endif _etext = . ; } - EXCEPTION_TABLE(16) - NOTES - RO_DATA_SECTION(4) + RO_DATA(4) ROMEND = .; #if defined(CONFIG_ROMKERNEL) . = RAMTOP; @@ -48,7 +49,7 @@ SECTIONS #endif _sdata = . ; __data_start = . ; - RW_DATA_SECTION(0, PAGE_SIZE, THREAD_SIZE) + RW_DATA(0, PAGE_SIZE, THREAD_SIZE) #if defined(CONFIG_ROMKERNEL) #undef ADDR #endif diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S index 78f2418e97c8..0ca2471ddb9f 100644 --- a/arch/hexagon/kernel/vmlinux.lds.S +++ b/arch/hexagon/kernel/vmlinux.lds.S @@ -49,12 +49,11 @@ SECTIONS INIT_DATA_SECTION(PAGE_SIZE) _sdata = .; - RW_DATA_SECTION(32,PAGE_SIZE,_THREAD_SIZE) - RO_DATA_SECTION(PAGE_SIZE) + RW_DATA(32,PAGE_SIZE,_THREAD_SIZE) + RO_DATA(PAGE_SIZE) _edata = .; EXCEPTION_TABLE(16) - NOTES BSS_SECTION(_PAGE_SIZE, _PAGE_SIZE, _PAGE_SIZE) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index bb320c6d0cc9..c49fcef754de 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -289,7 +289,7 @@ static void __init setup_crashkernel(unsigned long total, int *n) } if (!check_crashkernel_memory(base, size)) { - pr_warning("crashkernel: There would be kdump memory " + pr_warn("crashkernel: There would be kdump memory " "at %ld GB but this is unusable because it " "must\nbe below 4 GB. Change the memory " "configuration of the machine.\n", diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 1e95d32c8877..91b4024c9351 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -132,7 +132,7 @@ static __u64 vtime_delta(struct task_struct *tsk) return delta_stime; } -void vtime_account_system(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); __u64 stime = vtime_delta(tsk); @@ -146,7 +146,7 @@ void vtime_account_system(struct task_struct *tsk) else ti->stime += stime; } -EXPORT_SYMBOL_GPL(vtime_account_system); +EXPORT_SYMBOL_GPL(vtime_account_kernel); void vtime_account_idle(struct task_struct *tsk) { diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index d9d4e21107cd..1ec6b703c5b4 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -5,6 +5,9 @@ #include <asm/pgtable.h> #include <asm/thread_info.h> +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> OUTPUT_FORMAT("elf64-ia64-little") @@ -13,7 +16,7 @@ ENTRY(phys_start) jiffies = jiffies_64; PHDRS { - code PT_LOAD; + text PT_LOAD; percpu PT_LOAD; data PT_LOAD; note PT_NOTE; @@ -36,7 +39,7 @@ SECTIONS { phys_start = _start - LOAD_OFFSET; code : { - } :code + } :text . = KERNEL_START; _text = .; @@ -68,11 +71,6 @@ SECTIONS { /* * Read-only data */ - NOTES :code :note /* put .notes in text and mark in PT_NOTE */ - code_continues : { - } : code /* switch back to regular program... */ - - EXCEPTION_TABLE(16) /* MCA table */ . = ALIGN(16); @@ -102,11 +100,11 @@ SECTIONS { __start_unwind = .; *(.IA_64.unwind*) __end_unwind = .; - } :code :unwind + } :text :unwind code_continues2 : { - } : code + } :text - RODATA + RO_DATA(4096) .opd : AT(ADDR(.opd) - LOAD_OFFSET) { __start_opd = .; @@ -214,7 +212,7 @@ SECTIONS { _end = .; code : { - } :code + } :text STABS_DEBUG DWARF_DEBUG diff --git a/arch/m68k/kernel/vmlinux-nommu.lds b/arch/m68k/kernel/vmlinux-nommu.lds index cf6edda38971..7b975420c3d9 100644 --- a/arch/m68k/kernel/vmlinux-nommu.lds +++ b/arch/m68k/kernel/vmlinux-nommu.lds @@ -60,8 +60,8 @@ SECTIONS { #endif _sdata = .; - RO_DATA_SECTION(PAGE_SIZE) - RW_DATA_SECTION(16, PAGE_SIZE, THREAD_SIZE) + RO_DATA(PAGE_SIZE) + RW_DATA(16, PAGE_SIZE, THREAD_SIZE) _edata = .; EXCEPTION_TABLE(16) diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 625a5785804f..4d33da4e7106 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -31,9 +31,9 @@ SECTIONS _sdata = .; /* Start of data section */ - RODATA + RO_DATA(4096) - RW_DATA_SECTION(16, PAGE_SIZE, THREAD_SIZE) + RW_DATA(16, PAGE_SIZE, THREAD_SIZE) BSS_SECTION(0, 0, 0) diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 9868270b0984..87d9f4d08f65 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -24,13 +24,13 @@ SECTIONS *(.fixup) *(.gnu.warning) } :text = 0x4e75 - RODATA + RO_DATA(4096) _etext = .; /* End of text section */ EXCEPTION_TABLE(16) :data _sdata = .; /* Start of rw data section */ - RW_DATA_SECTION(16, PAGE_SIZE, THREAD_SIZE) :data + RW_DATA(16, PAGE_SIZE, THREAD_SIZE) :data /* End of data goes *here* so that freeing init code works properly. */ _edata = .; NOTES diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index c9c4be822456..a75896f18e58 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -46,6 +46,7 @@ config MICROBLAZE select VIRT_TO_BUS select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE if MMU + select SPARSE_IRQ # Endianness selection choice diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 654edfdc7867..b3b433db89d8 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -33,6 +33,8 @@ CONFIG_INET=y # CONFIG_IPV6 is not set CONFIG_BRIDGE=m CONFIG_PCI=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y CONFIG_MTD=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y @@ -73,6 +75,7 @@ CONFIG_UIO_PDRV_GENIRQ=y CONFIG_UIO_DMEM_GENIRQ=y CONFIG_EXT2_FS=y # CONFIG_DNOTIFY is not set +CONFIG_TMPFS=y CONFIG_CRAMFS=y CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y diff --git a/arch/microblaze/include/asm/irq.h b/arch/microblaze/include/asm/irq.h index d785defeeed5..eac2fb4b3fb9 100644 --- a/arch/microblaze/include/asm/irq.h +++ b/arch/microblaze/include/asm/irq.h @@ -9,7 +9,6 @@ #ifndef _ASM_MICROBLAZE_IRQ_H #define _ASM_MICROBLAZE_IRQ_H -#define NR_IRQS (32 + 1) #include <asm-generic/irq.h> struct pt_regs; diff --git a/arch/microblaze/kernel/entry.S b/arch/microblaze/kernel/entry.S index 4e1b567becd6..de7083bd1d24 100644 --- a/arch/microblaze/kernel/entry.S +++ b/arch/microblaze/kernel/entry.S @@ -738,14 +738,9 @@ no_intr_resched: andi r5, r5, _TIF_NEED_RESCHED; beqi r5, restore /* if zero jump over */ -preempt: /* interrupts are off that's why I am calling preempt_chedule_irq */ bralid r15, preempt_schedule_irq nop - lwi r11, CURRENT_TASK, TS_THREAD_INFO; /* get thread info */ - lwi r5, r11, TI_FLAGS; /* get flags in thread info */ - andi r5, r5, _TIF_NEED_RESCHED; - bnei r5, preempt /* if non zero jump to resched */ restore: #endif VM_OFF /* MS: turn off MMU */ diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S index f264fdcf152a..7d2894418691 100644 --- a/arch/microblaze/kernel/head.S +++ b/arch/microblaze/kernel/head.S @@ -99,7 +99,7 @@ big_endian: _prepare_copy_fdt: or r11, r0, r0 /* incremment */ ori r4, r0, TOPHYS(_fdt_start) - ori r3, r0, (0x8000 - 4) + ori r3, r0, (0x10000 - 4) _copy_fdt: lw r12, r7, r11 /* r12 = r7 + r11 */ sw r12, r4, r11 /* addr[r4 + r11] = r12 */ diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index e1f3e8741292..2c09fa3a8a01 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -11,6 +11,8 @@ OUTPUT_ARCH(microblaze) ENTRY(microblaze_start) +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> @@ -46,14 +48,12 @@ SECTIONS { __fdt_blob : AT(ADDR(__fdt_blob) - LOAD_OFFSET) { _fdt_start = . ; /* place for fdt blob */ *(__fdt_blob) ; /* Any link-placed DTB */ - . = _fdt_start + 0x8000; /* Pad up to 32kbyte */ + . = _fdt_start + 0x10000; /* Pad up to 64kbyte */ _fdt_end = . ; } . = ALIGN(16); - RODATA - EXCEPTION_TABLE(16) - NOTES + RO_DATA(4096) /* * sdata2 section can go anywhere, but must be word aligned @@ -70,7 +70,7 @@ SECTIONS { } _sdata = . ; - RW_DATA_SECTION(32, PAGE_SIZE, THREAD_SIZE) + RW_DATA(32, PAGE_SIZE, THREAD_SIZE) _edata = . ; /* Under the microblaze ABI, .sdata and .sbss must be contiguous */ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index c86be02b6d89..61b9269cdd3e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -74,6 +74,7 @@ config MIPS select HAVE_PERF_EVENTS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ + select HAVE_SPARSE_SYSCALL_NR select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 0a5eab626260..e1c44aed8156 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -326,7 +326,7 @@ libs-$(CONFIG_MIPS_FP_SUPPORT) += arch/mips/math-emu/ # See arch/mips/Kbuild for content of core part of the kernel core-y += arch/mips/ -drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/ +drivers-y += arch/mips/crypto/ drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/ # suspend and hibernation support diff --git a/arch/mips/crypto/Makefile b/arch/mips/crypto/Makefile index e07aca572c2e..8e1deaf00e0c 100644 --- a/arch/mips/crypto/Makefile +++ b/arch/mips/crypto/Makefile @@ -4,3 +4,21 @@ # obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o + +obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o +chacha-mips-y := chacha-core.o chacha-glue.o +AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots + +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o +poly1305-mips-y := poly1305-core.o poly1305-glue.o + +perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32 +perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) + +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE + $(call if_changed,perlasm) + +targets += poly1305-core.S diff --git a/arch/mips/crypto/chacha-core.S b/arch/mips/crypto/chacha-core.S new file mode 100644 index 000000000000..5755f69cfe00 --- /dev/null +++ b/arch/mips/crypto/chacha-core.S @@ -0,0 +1,497 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 32 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $t8 +#define X9 $t9 +#define X10 $v1 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define STATE $a0 +#define OUT $a1 +#define IN $a2 +#define BYTES $a3 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. + */ +#define SAVED_X X15 +#define SAVED_CA $s7 + +#define IS_UNALIGNED $s7 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define ROTx rotl +#define ROTR(n) rotr n, 24 +#define CPU_TO_LE32(n) \ + wsbh n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define ROTx rotr +#define CPU_TO_LE32(n) +#define ROTR(n) +#endif + +#define FOR_EACH_WORD(x) \ + x( 0); \ + x( 1); \ + x( 2); \ + x( 3); \ + x( 4); \ + x( 5); \ + x( 6); \ + x( 7); \ + x( 8); \ + x( 9); \ + x(10); \ + x(11); \ + x(12); \ + x(13); \ + x(14); \ + x(15); + +#define FOR_EACH_WORD_REV(x) \ + x(15); \ + x(14); \ + x(13); \ + x(12); \ + x(11); \ + x(10); \ + x( 9); \ + x( 8); \ + x( 7); \ + x( 6); \ + x( 5); \ + x( 4); \ + x( 3); \ + x( 2); \ + x( 1); \ + x( 0); + +#define PLUS_ONE_0 1 +#define PLUS_ONE_1 2 +#define PLUS_ONE_2 3 +#define PLUS_ONE_3 4 +#define PLUS_ONE_4 5 +#define PLUS_ONE_5 6 +#define PLUS_ONE_6 7 +#define PLUS_ONE_7 8 +#define PLUS_ONE_8 9 +#define PLUS_ONE_9 10 +#define PLUS_ONE_10 11 +#define PLUS_ONE_11 12 +#define PLUS_ONE_12 13 +#define PLUS_ONE_13 14 +#define PLUS_ONE_14 15 +#define PLUS_ONE_15 16 +#define PLUS_ONE(x) PLUS_ONE_ ## x +#define _CONCAT3(a,b,c) a ## b ## c +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) + +#define STORE_UNALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lwl T1, (x*4)+MSB ## (IN); \ + lwr T1, (x*4)+LSB ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + swl X ## x, (x*4)+MSB ## (OUT); \ + swr X ## x, (x*4)+LSB ## (OUT); + +#define STORE_ALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lw T1, (x*4) ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + sw X ## x, (x*4) ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. + */ +#define JMPTBL_ALIGNED(x) \ +.Lchacha_mips_jmptbl_aligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_aligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define JMPTBL_UNALIGNED(x) \ +.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotl X(V), S; \ + rotl X(W), S; \ + rotl X(Y), S; \ + rotl X(Z), S; + +.text +.set reorder +.set noat +.globl chacha_crypt_arch +.ent chacha_crypt_arch +chacha_crypt_arch: + .frame $sp, STACK_SIZE, $ra + + /* Load number of rounds */ + lw $at, 16($sp) + + addiu $sp, -STACK_SIZE + + /* Return bytes = 0. */ + beqz BYTES, .Lchacha_mips_end + + lw NONCE_0, 48(STATE) + + /* Save s0-s7 */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + sw $s6, 24($sp) + sw $s7, 28($sp) + + /* Test IN or OUT is unaligned. + * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 + */ + or IS_UNALIGNED, IN, OUT + andi IS_UNALIGNED, 0x3 + + b .Lchacha_rounds_start + +.align 4 +.Loop_chacha_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + +.Lchacha_rounds_start: + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + + move X12, NONCE_0 + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_chacha_xor_rounds: + addiu $at, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $at, .Loop_chacha_xor_rounds + + addiu BYTES, -(CHACHA20_BLOCK_SIZE) + + /* Is data src/dst unaligned? Jump */ + bnez IS_UNALIGNED, .Loop_chacha_unaligned + + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES < 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_aligned + + FOR_EACH_WORD_REV(STORE_ALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + + /* BYTES < 0? Handle last bytes */ + bltz BYTES, .Lchacha_mips_xor_bytes + +.Lchacha_mips_xor_done: + /* Restore used registers */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + lw $s6, 24($sp) + lw $s7, 28($sp) + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + +.Lchacha_mips_end: + addiu $sp, STACK_SIZE + jr $ra + +.Lchacha_mips_no_full_block_aligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_ALIGNED) + + +.Loop_chacha_unaligned: + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES > 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_unaligned + + FOR_EACH_WORD_REV(STORE_UNALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + + .set noreorder + /* Fall through to byte handling */ + bgez BYTES, .Lchacha_mips_xor_done +.Lchacha_mips_xor_unaligned_0_b: +.Lchacha_mips_xor_aligned_0_b: + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + .set reorder + +.Lchacha_mips_xor_bytes: + addu IN, $at + addu OUT, $at + /* First byte */ + lbu T1, 0(IN) + addiu $at, BYTES, 1 + CPU_TO_LE32(SAVED_X) + ROTR(SAVED_X) + xor T1, SAVED_X + sb T1, 0(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Second byte */ + lbu T1, 1(IN) + addiu $at, BYTES, 2 + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 1(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Third byte */ + lbu T1, 2(IN) + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 2(OUT) + b .Lchacha_mips_xor_done + +.Lchacha_mips_no_full_block_unaligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_UNALIGNED) +.end chacha_crypt_arch +.set at + +/* Input arguments + * STATE $a0 + * OUT $a1 + * NROUND $a2 + */ + +#undef X12 +#undef X13 +#undef X14 +#undef X15 + +#define X12 $a3 +#define X13 $at +#define X14 $v0 +#define X15 STATE + +.set noat +.globl hchacha_block_arch +.ent hchacha_block_arch +hchacha_block_arch: + .frame $sp, STACK_SIZE, $ra + + addiu $sp, -STACK_SIZE + + /* Save X11(s6) */ + sw X11, 0($sp) + + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + lw X12, 48(STATE) + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_hchacha_xor_rounds: + addiu $a2, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $a2, .Loop_hchacha_xor_rounds + + /* Restore used register */ + lw X11, 0($sp) + + sw X0, 0(OUT) + sw X1, 4(OUT) + sw X2, 8(OUT) + sw X3, 12(OUT) + sw X12, 16(OUT) + sw X13, 20(OUT) + sw X14, 24(OUT) + sw X15, 28(OUT) + + addiu $sp, STACK_SIZE + jr $ra +.end hchacha_block_arch +.set at diff --git a/arch/mips/crypto/chacha-glue.c b/arch/mips/crypto/chacha-glue.c new file mode 100644 index 000000000000..779e399c9bef --- /dev/null +++ b/arch/mips/crypto/chacha-glue.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * MIPS accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/byteorder.h> +#include <crypto/algapi.h> +#include <crypto/internal/chacha.h> +#include <crypto/internal/skcipher.h> +#include <linux/kernel.h> +#include <linux/module.h> + +asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +EXPORT_SYMBOL(chacha_crypt_arch); + +asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds); +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +static int chacha_mips_stream_xor(struct skcipher_request *req, + const struct chacha_ctx *ctx, const u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + chacha_init_generic(state, ctx->key, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_mips(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + return chacha_mips_stream_xor(req, ctx, req->iv); +} + +static int xchacha_mips(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + chacha_init_generic(state, ctx->key, req->iv); + + hchacha_block(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_mips_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-mips", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha_mips, + .decrypt = chacha_mips, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-mips", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = xchacha_mips, + .decrypt = xchacha_mips, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-mips", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha12_setkey, + .encrypt = xchacha_mips, + .decrypt = xchacha_mips, + } +}; + +static int __init chacha_simd_mod_init(void) +{ + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-mips"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-mips"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-mips"); diff --git a/arch/mips/crypto/poly1305-glue.c b/arch/mips/crypto/poly1305-glue.c new file mode 100644 index 000000000000..b759b6ccc361 --- /dev/null +++ b/arch/mips/crypto/poly1305-glue.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/unaligned.h> +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +asmlinkage void poly1305_init_mips(void *state, const u8 *key); +asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce); + +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +{ + poly1305_init_mips(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(key + 16); + dctx->s[1] = get_unaligned_le32(key + 20); + dctx->s[2] = get_unaligned_le32(key + 24); + dctx->s[3] = get_unaligned_le32(key + 28); + dctx->buflen = 0; +} +EXPORT_SYMBOL(poly1305_init_arch); + +static int mips_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, + u32 len, u32 hibit) +{ + if (unlikely(!dctx->sset)) { + if (!dctx->rset) { + poly1305_init_mips(&dctx->h, src); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + if (len < POLY1305_BLOCK_SIZE) + return; + } + + len &= ~(POLY1305_BLOCK_SIZE - 1); + + poly1305_blocks_mips(&dctx->h, src, len, hibit); +} + +static int mips_poly1305_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(dctx->buflen)) { + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + len -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1); + dctx->buflen = 0; + } + } + + if (likely(len >= POLY1305_BLOCK_SIZE)) { + mips_poly1305_blocks(dctx, src, len, 1); + src += round_down(len, POLY1305_BLOCK_SIZE); + len %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(len)) { + dctx->buflen = len; + memcpy(dctx->buf, src, len); + } + return 0; +} + +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int nbytes) +{ + if (unlikely(dctx->buflen)) { + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + nbytes -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_blocks_mips(&dctx->h, dctx->buf, + POLY1305_BLOCK_SIZE, 1); + dctx->buflen = 0; + } + } + + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); + + poly1305_blocks_mips(&dctx->h, src, len, 1); + src += len; + nbytes %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(nbytes)) { + dctx->buflen = nbytes; + memcpy(dctx->buf, src, nbytes); + } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) +{ + __le32 digest[4]; + u64 f = 0; + + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); + } + + poly1305_emit_mips(&dctx->h, digest, dctx->s); + + /* mac = (h + s) % (2^128) */ + f = (f >> 32) + le32_to_cpu(digest[0]); + put_unaligned_le32(f, dst); + f = (f >> 32) + le32_to_cpu(digest[1]); + put_unaligned_le32(f, dst + 4); + f = (f >> 32) + le32_to_cpu(digest[2]); + put_unaligned_le32(f, dst + 8); + f = (f >> 32) + le32_to_cpu(digest[3]); + put_unaligned_le32(f, dst + 12); + + *dctx = (struct poly1305_desc_ctx){}; +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int mips_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_arch(dctx, dst); + return 0; +} + +static struct shash_alg mips_poly1305_alg = { + .init = mips_poly1305_init, + .update = mips_poly1305_update, + .final = mips_poly1305_final, + .digestsize = POLY1305_DIGEST_SIZE, + .descsize = sizeof(struct poly1305_desc_ctx), + + .base.cra_name = "poly1305", + .base.cra_driver_name = "poly1305-mips", + .base.cra_priority = 200, + .base.cra_blocksize = POLY1305_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}; + +static int __init mips_poly1305_mod_init(void) +{ + return crypto_register_shash(&mips_poly1305_alg); +} + +static void __exit mips_poly1305_mod_exit(void) +{ + crypto_unregister_shash(&mips_poly1305_alg); +} + +module_init(mips_poly1305_mod_init); +module_exit(mips_poly1305_mod_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-mips"); diff --git a/arch/mips/crypto/poly1305-mips.pl b/arch/mips/crypto/poly1305-mips.pl new file mode 100644 index 000000000000..b05bab884ed2 --- /dev/null +++ b/arch/mips/crypto/poly1305-mips.pl @@ -0,0 +1,1273 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL +# project. +# ==================================================================== + +# Poly1305 hash for MIPS. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 ~5.5/+130% (big-endian) +# Octeon II 2.50/+70% (little-endian) +# +# March 2019 +# +# Add 32-bit code path. +# +# October 2019 +# +# Modulo-scheduling reduction allows to omit dependency chain at the +# end of inner loop and improve performance. Also optimize MIPS32R2 +# code path for MIPS 1004K core. Per René von Dorst's suggestions. +# +# IALU/gcc +# R1x000 ~9.8/? (big-endian) +# Octeon II 3.65/+140% (little-endian) +# MT7621/1004K 4.75/? (little-endian) +# +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# <appro@openssl.org> +# +###################################################################### + +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; + +if ($flavour =~ /64|n32/i) {{{ +###################################################################### +# 64-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ + defined(_MIPS_ARCH_MIPS64R6)) \\ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +#endif + +#if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +# define dmultu(rs,rt) dmultu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_init_mips +# define poly1305_blocks poly1305_blocks_mips +# define poly1305_emit poly1305_emit_mips +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $tmp0,$inp,7 # $inp % 8 + dsubu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + ld $in0,0($inp) + ld $in1,8($inp) + beqz $tmp0,.Laligned_key + ld $tmp2,16($inp) + + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + dsllv $in0,$in0,$tmp0 + dsrlv $tmp3,$in1,$tmp1 + dsllv $in1,$in1,$tmp0 + dsrlv $tmp2,$tmp2,$tmp1 +# else + dsrlv $in0,$in0,$tmp0 + dsllv $tmp3,$in1,$tmp1 + dsrlv $in1,$in1,$tmp0 + dsllv $tmp2,$tmp2,$tmp1 +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_key: +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 # 0x0000000100000000 + daddiu $tmp0,-63 # 0x00000000ffffffc1 + dsll $tmp0,28 # 0x0ffffffc10000000 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); +my ($shr,$shl) = ($s6,$s7); # used on R6 + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + .frame $sp,8*8,$ra + .mask $SAVED_REGS_MASK|0x000c0000,-8 + dsubu $sp,8*8 + sd $s7,56($sp) + sd $s6,48($sp) +#else + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + dsubu $sp,6*8 +#endif + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $shr,$inp,7 + dsubu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset + subu $shl,$zero,$shr +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + dsll $len,4 + daddu $len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) +# ifdef MIPSEB + dsllv $in0,$in0,$shr + dsrlv $tmp3,$in1,$shl + dsllv $in1,$in1,$shr + dsrlv $tmp2,$tmp2,$shl +# else + dsrlv $in0,$in0,$shr + dsllv $tmp3,$in1,$shl + dsrlv $in1,$in1,$shr + dsllv $tmp2,$tmp2,$shl +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_inp: +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + dsrl $tmp1,$h2,2 # modulo-scheduled reduction + andi $h2,$h2,3 + dsll $tmp0,$tmp1,2 + + daddu $d0,$h0,$in0 # accumulate input + daddu $tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + daddu $d0,$d0,$tmp1 # ... and residue + sltu $tmp1,$d0,$tmp1 + daddu $d1,$h1,$in1 + daddu $tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + daddu $d1,$tmp0 + + dmultu ($r0,$d0) # h0*r0 + daddu $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + dmultu ($rs1,$d1) # h1*5*r1 + daddu $d2,$tmp1 + daddu $d2,$tmp0 + mflo ($tmp0,$rs1,$d1) + mfhi ($tmp1,$rs1,$d1) + + dmultu ($r1,$d0) # h0*r1 + mflo ($tmp2,$r1,$d0) + mfhi ($h2,$r1,$d0) + daddu $h0,$tmp0 + daddu $h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + dmultu ($r0,$d1) # h1*r0 + daddu $h1,$tmp0 + daddu $h1,$tmp2 + mflo ($tmp0,$r0,$d1) + mfhi ($tmp1,$r0,$d1) + + dmultu ($rs1,$d2) # h2*5*r1 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + mflo ($tmp2,$rs1,$d2) + + dmultu ($r0,$d2) # h2*r0 + daddu $h1,$tmp0 + daddu $h2,$tmp1 + mflo ($tmp3,$r0,$d2) + sltu $tmp0,$h1,$tmp0 + daddu $h2,$tmp0 + + daddu $h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + daddu $h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + ld $s7,56($sp) + ld $s6,48($sp) +#endif + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra +#if defined(_MIPS_ARCH_MIPS64R6) + daddu $sp,8*8 +#else + daddu $sp,6*8 +#endif +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + li $in0,-4 # final reduction + dsrl $in1,$tmp2,2 + and $in0,$tmp2 + andi $tmp2,$tmp2,3 + daddu $in0,$in1 + + daddu $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + daddiu $in0,$tmp0,5 # compare to modulus + daddu $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + daddu $in1,$tmp1,$tmp3 + daddu $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + + xor $in0,$tmp0 + xor $in1,$tmp1 + and $in0,$tmp2 + and $in1,$tmp2 + xor $in0,$tmp0 + xor $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ + defined(_MIPS_ARCH_MIPS32R6)) \\ + && !defined(_MIPS_ARCH_MIPS32R2) +# define _MIPS_ARCH_MIPS32R2 +#endif + +#if defined(_MIPS_ARCH_MIPS32R6) +# define multu(rs,rt) +# define mflo(rd,rs,rt) mulu rd,rs,rt +# define mfhi(rd,rs,rt) muhu rd,rs,rt +#else +# define multu(rs,rt) multu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_init_mips +# define poly1305_blocks poly1305_blocks_mips +# define poly1305_emit poly1305_emit_mips +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 3 +#else +# define MSB 3 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $tmp0,$inp,3 # $inp % 4 + subu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + sllv $in0,$in0,$tmp0 + srlv $tmp3,$in1,$tmp1 + sllv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + srlv $tmp3,$in2,$tmp1 + sllv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + srlv $tmp3,$in3,$tmp1 + sllv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + srlv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# else + srlv $in0,$in0,$tmp0 + sllv $tmp3,$in1,$tmp1 + srlv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllv $tmp3,$in2,$tmp1 + srlv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllv $tmp3,$in3,$tmp1 + srlv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# endif +.Laligned_key: +#else + lwl $in0,0+MSB($inp) + lwl $in1,4+MSB($inp) + lwl $in2,8+MSB($inp) + lwl $in3,12+MSB($inp) + lwr $in0,0+LSB($inp) + lwr $in1,4+LSB($inp) + lwr $in2,8+LSB($inp) + lwr $in3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $in0,$in0 # byte swap + wsbh $in1,$in1 + wsbh $in2,$in2 + wsbh $in3,$in3 + rotr $in0,$in0,16 + rotr $in1,$in1,16 + rotr $in2,$in2,16 + rotr $in3,$in3,16 +# else + srl $tmp0,$in0,24 # byte swap + srl $tmp1,$in0,8 + andi $tmp2,$in0,0xFF00 + sll $in0,$in0,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in0,$tmp0 + srl $tmp0,$in1,24 + or $tmp1,$tmp2 + srl $tmp2,$in1,8 + or $in0,$tmp1 + andi $tmp1,$in1,0xFF00 + sll $in1,$in1,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in1,$tmp0 + srl $tmp0,$in2,24 + or $tmp2,$tmp1 + srl $tmp1,$in2,8 + or $in1,$tmp2 + andi $tmp2,$in2,0xFF00 + sll $in2,$in2,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in2,$tmp0 + srl $tmp0,$in3,24 + or $tmp1,$tmp2 + srl $tmp2,$in3,8 + or $in2,$tmp1 + andi $tmp1,$in3,0xFF00 + sll $in3,$in3,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in3,$tmp0 + or $tmp2,$tmp1 + or $in3,$tmp2 +# endif +#endif + lui $tmp0,0x0fff + ori $tmp0,0xffff # 0x0fffffff + and $in0,$in0,$tmp0 + subu $tmp0,3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srl $tmp1,$in1,2 + srl $tmp2,$in2,2 + srl $tmp3,$in3,2 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addu $in2,$in2,$tmp2 + addu $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw $in3,44($ctx) +.Lno_key: + li $v0,0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000"; + +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $t2; # used on R6 +my $one = $t2; # used on R2 + +$code.=<<___; +.globl poly1305_blocks +.align 5 +.ent poly1305_blocks +poly1305_blocks: + .frame $sp,16*4,$ra + .mask $SAVED_REGS_MASK,-4 + .set noreorder + subu $sp, $sp,4*12 + sw $s11,4*11($sp) + sw $s10,4*10($sp) + sw $s9, 4*9($sp) + sw $s8, 4*8($sp) + sw $s7, 4*7($sp) + sw $s6, 4*6($sp) + sw $s5, 4*5($sp) + sw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sw $s3, 4*3($sp) + sw $s2, 4*2($sp) + sw $s1, 4*1($sp) + sw $s0, 4*0($sp) +___ +$code.=<<___; + .set reorder + + srl $len,4 # number of complete blocks + li $one,1 + beqz $len,.Labort + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $shr,$inp,3 + subu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + sll $len,4 + addu $len,$len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS32R6) + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) + beqz $shr,.Laligned_inp + + lw $t0,16($inp) + subu $t1,$zero,$shr +# ifdef MIPSEB + sllv $d0,$d0,$shr + srlv $at,$d1,$t1 + sllv $d1,$d1,$shr + or $d0,$d0,$at + srlv $at,$d2,$t1 + sllv $d2,$d2,$shr + or $d1,$d1,$at + srlv $at,$d3,$t1 + sllv $d3,$d3,$shr + or $d2,$d2,$at + srlv $t0,$t0,$t1 + or $d3,$d3,$t0 +# else + srlv $d0,$d0,$shr + sllv $at,$d1,$t1 + srlv $d1,$d1,$shr + or $d0,$d0,$at + sllv $at,$d2,$t1 + srlv $d2,$d2,$shr + or $d1,$d1,$at + sllv $at,$d3,$t1 + srlv $d3,$d3,$shr + or $d2,$d2,$at + sllv $t0,$t0,$t1 + or $d3,$d3,$t0 +# endif +.Laligned_inp: +#else + lwl $d0,0+MSB($inp) # load input + lwl $d1,4+MSB($inp) + lwl $d2,8+MSB($inp) + lwl $d3,12+MSB($inp) + lwr $d0,0+LSB($inp) + lwr $d1,4+LSB($inp) + lwr $d2,8+LSB($inp) + lwr $d3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $d0,$d0 # byte swap + wsbh $d1,$d1 + wsbh $d2,$d2 + wsbh $d3,$d3 + rotr $d0,$d0,16 + rotr $d1,$d1,16 + rotr $d2,$d2,16 + rotr $d3,$d3,16 +# else + srl $at,$d0,24 # byte swap + srl $t0,$d0,8 + andi $t1,$d0,0xFF00 + sll $d0,$d0,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d0,$at + srl $at,$d1,24 + or $t0,$t1 + srl $t1,$d1,8 + or $d0,$t0 + andi $t0,$d1,0xFF00 + sll $d1,$d1,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d1,$at + srl $at,$d2,24 + or $t1,$t0 + srl $t0,$d2,8 + or $d1,$t1 + andi $t1,$d2,0xFF00 + sll $d2,$d2,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d2,$at + srl $at,$d3,24 + or $t0,$t1 + srl $t1,$d3,8 + or $d2,$t0 + andi $t0,$d3,0xFF00 + sll $d3,$d3,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d3,$at + or $t1,$t0 + or $d3,$t1 +# endif +#endif + srl $t0,$h4,2 # modulo-scheduled reduction + andi $h4,$h4,3 + sll $at,$t0,2 + + addu $d0,$d0,$h0 # accumulate input + addu $t0,$t0,$at + sltu $h0,$d0,$h0 + addu $d0,$d0,$t0 # ... and residue + sltu $at,$d0,$t0 + + addu $d1,$d1,$h1 + addu $h0,$h0,$at # carry + sltu $h1,$d1,$h1 + addu $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addu $d2,$d2,$h2 + addu $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addu $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addu $d3,$d3,$h3 + addu $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addu $d3,$d3,$h2 + +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) + multu $r0,$d0 # d0*r0 + sltu $h2,$d3,$h2 + maddu $rs3,$d1 # d1*s3 + addu $h3,$h3,$h2 # carry + maddu $rs2,$d2 # d2*s2 + addu $h4,$h4,$padbit + maddu $rs1,$d3 # d3*s1 + addu $h4,$h4,$h3 + mfhi $at + mflo $h0 + + multu $r1,$d0 # d0*r1 + maddu $r0,$d1 # d1*r0 + maddu $rs3,$d2 # d2*s3 + maddu $rs2,$d3 # d3*s2 + maddu $rs1,$h4 # h4*s1 + maddu $at,$one # hi*1 + mfhi $at + mflo $h1 + + multu $r2,$d0 # d0*r2 + maddu $r1,$d1 # d1*r1 + maddu $r0,$d2 # d2*r0 + maddu $rs3,$d3 # d3*s3 + maddu $rs2,$h4 # h4*s2 + maddu $at,$one # hi*1 + mfhi $at + mflo $h2 + + mul $t0,$r0,$h4 # h4*r0 + + multu $r3,$d0 # d0*r3 + maddu $r2,$d1 # d1*r2 + maddu $r1,$d2 # d2*r1 + maddu $r0,$d3 # d3*r0 + maddu $rs3,$h4 # h4*s3 + maddu $at,$one # hi*1 + mfhi $at + mflo $h3 + + addiu $inp,$inp,16 + + addu $h4,$t0,$at +#else + multu ($r0,$d0) # d0*r0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + sltu $h2,$d3,$h2 + addu $h3,$h3,$h2 # carry + + multu ($rs3,$d1) # d1*s3 + mflo ($at,$rs3,$d1) + mfhi ($t0,$rs3,$d1) + + addu $h4,$h4,$padbit + addiu $inp,$inp,16 + addu $h4,$h4,$h3 + + multu ($rs2,$d2) # d2*s2 + mflo ($a3,$rs2,$d2) + mfhi ($t1,$rs2,$d2) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($rs1,$d3) # d3*s1 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$rs1,$d3) + mfhi ($t0,$rs1,$d3) + addu $h0,$h0,$a3 + addu $h1,$h1,$t1 + multu ($r1,$d0) # d0*r1 + sltu $a3,$h0,$a3 + addu $h1,$h1,$a3 + + + mflo ($a3,$r1,$d0) + mfhi ($h2,$r1,$d0) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($r0,$d1) # d1*r0 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$r0,$d1) + mfhi ($t0,$r0,$d1) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($rs3,$d2) # d2*s3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs3,$d2) + mfhi ($t1,$rs3,$d2) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($rs2,$d3) # d3*s2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + mflo ($at,$rs2,$d3) + mfhi ($t0,$rs2,$d3) + addu $h1,$h1,$a3 + addu $h2,$h2,$t1 + multu ($rs1,$h4) # h4*s1 + sltu $a3,$h1,$a3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs1,$h4) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($r2,$d0) # d0*r2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + + mflo ($at,$r2,$d0) + mfhi ($h3,$r2,$d0) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($r1,$d1) # d1*r1 + addu $h2,$h2,$a3 + + mflo ($a3,$r1,$d1) + mfhi ($t1,$r1,$d1) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r0,$d2) # d2*r0 + addu $h3,$h3,$at + + mflo ($at,$r0,$d2) + mfhi ($t0,$r0,$d2) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($rs3,$d3) # d3*s3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + mflo ($a3,$rs3,$d3) + mfhi ($t1,$rs3,$d3) + addu $h2,$h2,$at + addu $h3,$h3,$t0 + multu ($rs2,$h4) # h4*s2 + sltu $at,$h2,$at + addu $h3,$h3,$at + + mflo ($at,$rs2,$h4) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($r3,$d0) # d0*r3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + + mflo ($a3,$r3,$d0) + mfhi ($t1,$r3,$d0) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r2,$d1) # d1*r2 + addu $h3,$h3,$at + + mflo ($at,$r2,$d1) + mfhi ($t0,$r2,$d1) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + multu ($r0,$d3) # d3*r0 + addu $t1,$t1,$a3 + + mflo ($a3,$r0,$d3) + mfhi ($d3,$r0,$d3) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r1,$d2) # d2*r1 + sltu $at,$h3,$at + addu $t1,$t1,$at + + mflo ($at,$r1,$d2) + mfhi ($t0,$r1,$d2) + addu $h3,$h3,$a3 + addu $t1,$t1,$d3 + multu ($rs3,$h4) # h4*s3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + + mflo ($a3,$rs3,$h4) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r0,$h4) # h4*r0 + sltu $at,$h3,$at + addu $t1,$t1,$at + + + mflo ($h4,$r0,$h4) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + addu $h4,$h4,$t1 + + li $padbit,1 # if we loop, padbit is 1 +#endif + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + + .set noreorder +.Labort: + lw $s11,4*11($sp) + lw $s10,4*10($sp) + lw $s9, 4*9($sp) + lw $s8, 4*8($sp) + lw $s7, 4*7($sp) + lw $s6, 4*6($sp) + lw $s5, 4*5($sp) + lw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + lw $s3, 4*3($sp) + lw $s2, 4*2($sp) + lw $s1, 4*1($sp) + lw $s0, 4*0($sp) +___ +$code.=<<___; + jr $ra + addu $sp,$sp,4*12 +.end poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw $tmp2,8($ctx) + lw $tmp3,12($ctx) + + li $in0,-4 # final reduction + srl $ctx,$tmp4,2 + and $in0,$in0,$tmp4 + andi $tmp4,$tmp4,3 + addu $ctx,$ctx,$in0 + + addu $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiu $in0,$tmp0,5 # compare to modulus + addu $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addu $in1,$in1,$tmp1 + addu $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addu $in2,$in2,$tmp2 + addu $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addu $in3,$in3,$tmp3 + addu $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addu $ctx,$tmp4 + + srl $ctx,2 # see if it carried/borrowed + subu $ctx,$zero,$ctx + + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + and $in0,$ctx + and $in1,$ctx + and $in2,$ctx + and $in3,$ctx + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addu $in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addu $in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addu $in1,$ctx + sltu $ctx,$in1,$ctx + addu $ctx,$tmp1 + + addu $in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addu $in2,$ctx + sltu $ctx,$in2,$ctx + addu $ctx,$tmp2 + + addu $in3,$tmp3 + addu $in3,$ctx + + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 33ee0d18fb0a..a5f00ec73ea6 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -10,6 +10,11 @@ */ #define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) +/* Cavium Octeon should not have a separate PT_NOTE Program Header. */ +#ifndef CONFIG_CAVIUM_OCTEON_SOC +#define EMITS_PT_NOTE +#endif + #include <asm-generic/vmlinux.lds.h> #undef mips @@ -76,16 +81,8 @@ SECTIONS __stop___dbe_table = .; } -#ifdef CONFIG_CAVIUM_OCTEON_SOC -#define NOTES_HEADER -#else /* CONFIG_CAVIUM_OCTEON_SOC */ -#define NOTES_HEADER :note -#endif /* CONFIG_CAVIUM_OCTEON_SOC */ - NOTES :text NOTES_HEADER - .dummy : { *(.dummy) } :text - _sdata = .; /* Start of data section */ - RODATA + RO_DATA(4096) /* writeable */ .data : { /* Data */ diff --git a/arch/mips/ralink/Kconfig b/arch/mips/ralink/Kconfig index 1434fa60f3db..94e9ce994494 100644 --- a/arch/mips/ralink/Kconfig +++ b/arch/mips/ralink/Kconfig @@ -51,6 +51,7 @@ choice select MIPS_GIC select COMMON_CLK select CLKSRC_MIPS_GIC + select HAVE_PCI if PCI_MT7621 endchoice choice diff --git a/arch/nds32/kernel/vmlinux.lds.S b/arch/nds32/kernel/vmlinux.lds.S index 9e90f30a181d..f679d3397436 100644 --- a/arch/nds32/kernel/vmlinux.lds.S +++ b/arch/nds32/kernel/vmlinux.lds.S @@ -53,12 +53,11 @@ SECTIONS _etext = .; /* End of text and rodata section */ _sdata = .; - RO_DATA_SECTION(PAGE_SIZE) - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RO_DATA(PAGE_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; EXCEPTION_TABLE(16) - NOTES BSS_SECTION(4, 4, 4) _end = .; diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S index 6ad64f14617d..c55a7cfa1075 100644 --- a/arch/nios2/kernel/vmlinux.lds.S +++ b/arch/nios2/kernel/vmlinux.lds.S @@ -49,8 +49,8 @@ SECTIONS __init_end = .; _sdata = .; - RO_DATA_SECTION(PAGE_SIZE) - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RO_DATA(PAGE_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; BSS_SECTION(0, 0, 0) @@ -58,7 +58,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG - NOTES DISCARDS } diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index 2e2c72c157f3..60449fd7f16f 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -67,19 +67,18 @@ SECTIONS _sdata = .; - /* Page alignment required for RO_DATA_SECTION */ - RO_DATA_SECTION(PAGE_SIZE) + /* Page alignment required for RO_DATA */ + RO_DATA(PAGE_SIZE) _e_kernel_ro = .; /* Whatever comes after _e_kernel_ro had better be page-aligend, too */ /* 32 here is cacheline size... recheck this */ - RW_DATA_SECTION(32, PAGE_SIZE, PAGE_SIZE) + RW_DATA(32, PAGE_SIZE, PAGE_SIZE) _edata = .; EXCEPTION_TABLE(4) - NOTES /* Init code and data */ . = ALIGN(PAGE_SIZE); diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 99cd24f2ea01..53e29d88f99c 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -19,6 +19,7 @@ *(.data..vm0.pte) #define CC_USING_PATCHABLE_FUNCTION_ENTRY +#define RO_EXCEPTION_TABLE_ALIGN 8 #include <asm-generic/vmlinux.lds.h> @@ -109,7 +110,7 @@ SECTIONS _sdata = .; /* Architecturally we need to keep __gp below 0x1000000 and thus - * in front of RO_DATA_SECTION() which stores lots of tracepoint + * in front of RO_DATA() which stores lots of tracepoint * and ftrace symbols. */ #ifdef CONFIG_64BIT . = ALIGN(16); @@ -127,11 +128,7 @@ SECTIONS } #endif - RO_DATA_SECTION(8) - - /* RO because of BUILDTIME_EXTABLE_SORT */ - EXCEPTION_TABLE(8) - NOTES + RO_DATA(8) /* unwind info */ .PARISC.unwind : { @@ -149,7 +146,7 @@ SECTIONS data_start = .; /* Data */ - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, PAGE_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, PAGE_SIZE) /* PA-RISC locks requires 16-byte alignment */ . = ALIGN(16); diff --git a/arch/powerpc/crypto/aes-spe-glue.c b/arch/powerpc/crypto/aes-spe-glue.c index 3a4ca7d32477..1fad5d4c658d 100644 --- a/arch/powerpc/crypto/aes-spe-glue.c +++ b/arch/powerpc/crypto/aes-spe-glue.c @@ -17,7 +17,10 @@ #include <asm/byteorder.h> #include <asm/switch_to.h> #include <crypto/algapi.h> +#include <crypto/internal/skcipher.h> #include <crypto/xts.h> +#include <crypto/gf128mul.h> +#include <crypto/scatterwalk.h> /* * MAX_BYTES defines the number of bytes that are allowed to be processed @@ -118,13 +121,19 @@ static int ppc_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, return 0; } -static int ppc_xts_setkey(struct crypto_tfm *tfm, const u8 *in_key, +static int ppc_aes_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *in_key, unsigned int key_len) +{ + return ppc_aes_setkey(crypto_skcipher_tfm(tfm), in_key, key_len); +} + +static int ppc_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { - struct ppc_xts_ctx *ctx = crypto_tfm_ctx(tfm); + struct ppc_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int err; - err = xts_check_key(tfm, in_key, key_len); + err = xts_verify_key(tfm, in_key, key_len); if (err) return err; @@ -133,7 +142,7 @@ static int ppc_xts_setkey(struct crypto_tfm *tfm, const u8 *in_key, if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && key_len != AES_KEYSIZE_256) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -178,208 +187,229 @@ static void ppc_aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) spe_end(); } -static int ppc_ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ppc_ecb_crypt(struct skcipher_request *req, bool enc) { - struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - unsigned int ubytes; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct ppc_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { - ubytes = nbytes > MAX_BYTES ? - nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); - nbytes -= ubytes; + while ((nbytes = walk.nbytes) != 0) { + nbytes = min_t(unsigned int, nbytes, MAX_BYTES); + nbytes = round_down(nbytes, AES_BLOCK_SIZE); spe_begin(); - ppc_encrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, ctx->rounds, nbytes); + if (enc) + ppc_encrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, ctx->rounds, nbytes); + else + ppc_decrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, ctx->rounds, nbytes); spe_end(); - err = blkcipher_walk_done(desc, &walk, ubytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } -static int ppc_ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ppc_ecb_encrypt(struct skcipher_request *req) { - struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - unsigned int ubytes; - int err; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - ubytes = nbytes > MAX_BYTES ? - nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); - nbytes -= ubytes; - - spe_begin(); - ppc_decrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_dec, ctx->rounds, nbytes); - spe_end(); - - err = blkcipher_walk_done(desc, &walk, ubytes); - } + return ppc_ecb_crypt(req, true); +} - return err; +static int ppc_ecb_decrypt(struct skcipher_request *req) +{ + return ppc_ecb_crypt(req, false); } -static int ppc_cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ppc_cbc_crypt(struct skcipher_request *req, bool enc) { - struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - unsigned int ubytes; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct ppc_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { - ubytes = nbytes > MAX_BYTES ? - nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); - nbytes -= ubytes; + while ((nbytes = walk.nbytes) != 0) { + nbytes = min_t(unsigned int, nbytes, MAX_BYTES); + nbytes = round_down(nbytes, AES_BLOCK_SIZE); spe_begin(); - ppc_encrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, ctx->rounds, nbytes, walk.iv); + if (enc) + ppc_encrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, ctx->rounds, nbytes, + walk.iv); + else + ppc_decrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, ctx->rounds, nbytes, + walk.iv); spe_end(); - err = blkcipher_walk_done(desc, &walk, ubytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } -static int ppc_cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ppc_cbc_encrypt(struct skcipher_request *req) { - struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - unsigned int ubytes; - int err; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - ubytes = nbytes > MAX_BYTES ? - nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); - nbytes -= ubytes; - - spe_begin(); - ppc_decrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_dec, ctx->rounds, nbytes, walk.iv); - spe_end(); - - err = blkcipher_walk_done(desc, &walk, ubytes); - } + return ppc_cbc_crypt(req, true); +} - return err; +static int ppc_cbc_decrypt(struct skcipher_request *req) +{ + return ppc_cbc_crypt(req, false); } -static int ppc_ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ppc_ctr_crypt(struct skcipher_request *req) { - struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - unsigned int pbytes, ubytes; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct ppc_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + err = skcipher_walk_virt(&walk, req, false); - while ((pbytes = walk.nbytes)) { - pbytes = pbytes > MAX_BYTES ? MAX_BYTES : pbytes; - pbytes = pbytes == nbytes ? - nbytes : pbytes & ~(AES_BLOCK_SIZE - 1); - ubytes = walk.nbytes - pbytes; + while ((nbytes = walk.nbytes) != 0) { + nbytes = min_t(unsigned int, nbytes, MAX_BYTES); + if (nbytes < walk.total) + nbytes = round_down(nbytes, AES_BLOCK_SIZE); spe_begin(); ppc_crypt_ctr(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, ctx->rounds, pbytes , walk.iv); + ctx->key_enc, ctx->rounds, nbytes, walk.iv); spe_end(); - nbytes -= pbytes; - err = blkcipher_walk_done(desc, &walk, ubytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } -static int ppc_xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ppc_xts_crypt(struct skcipher_request *req, bool enc) { - struct ppc_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - unsigned int ubytes; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct ppc_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; u32 *twk; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); twk = ctx->key_twk; - while ((nbytes = walk.nbytes)) { - ubytes = nbytes > MAX_BYTES ? - nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); - nbytes -= ubytes; + while ((nbytes = walk.nbytes) != 0) { + nbytes = min_t(unsigned int, nbytes, MAX_BYTES); + nbytes = round_down(nbytes, AES_BLOCK_SIZE); spe_begin(); - ppc_encrypt_xts(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, ctx->rounds, nbytes, walk.iv, twk); + if (enc) + ppc_encrypt_xts(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, ctx->rounds, nbytes, + walk.iv, twk); + else + ppc_decrypt_xts(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, ctx->rounds, nbytes, + walk.iv, twk); spe_end(); twk = NULL; - err = blkcipher_walk_done(desc, &walk, ubytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } -static int ppc_xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ppc_xts_encrypt(struct skcipher_request *req) { - struct ppc_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - unsigned int ubytes; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct ppc_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % AES_BLOCK_SIZE; + int offset = req->cryptlen - tail - AES_BLOCK_SIZE; + struct skcipher_request subreq; + u8 b[2][AES_BLOCK_SIZE]; int err; - u32 *twk; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - twk = ctx->key_twk; + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + if (tail) { + subreq = *req; + skcipher_request_set_crypt(&subreq, req->src, req->dst, + req->cryptlen - tail, req->iv); + req = &subreq; + } - while ((nbytes = walk.nbytes)) { - ubytes = nbytes > MAX_BYTES ? - nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); - nbytes -= ubytes; + err = ppc_xts_crypt(req, true); + if (err || !tail) + return err; - spe_begin(); - ppc_decrypt_xts(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_dec, ctx->rounds, nbytes, walk.iv, twk); - spe_end(); + scatterwalk_map_and_copy(b[0], req->dst, offset, AES_BLOCK_SIZE, 0); + memcpy(b[1], b[0], tail); + scatterwalk_map_and_copy(b[0], req->src, offset + AES_BLOCK_SIZE, tail, 0); - twk = NULL; - err = blkcipher_walk_done(desc, &walk, ubytes); + spe_begin(); + ppc_encrypt_xts(b[0], b[0], ctx->key_enc, ctx->rounds, AES_BLOCK_SIZE, + req->iv, NULL); + spe_end(); + + scatterwalk_map_and_copy(b[0], req->dst, offset, AES_BLOCK_SIZE + tail, 1); + + return 0; +} + +static int ppc_xts_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct ppc_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % AES_BLOCK_SIZE; + int offset = req->cryptlen - tail - AES_BLOCK_SIZE; + struct skcipher_request subreq; + u8 b[3][AES_BLOCK_SIZE]; + le128 twk; + int err; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + if (tail) { + subreq = *req; + skcipher_request_set_crypt(&subreq, req->src, req->dst, + offset, req->iv); + req = &subreq; } - return err; + err = ppc_xts_crypt(req, false); + if (err || !tail) + return err; + + scatterwalk_map_and_copy(b[1], req->src, offset, AES_BLOCK_SIZE + tail, 0); + + spe_begin(); + if (!offset) + ppc_encrypt_ecb(req->iv, req->iv, ctx->key_twk, ctx->rounds, + AES_BLOCK_SIZE); + + gf128mul_x_ble(&twk, (le128 *)req->iv); + + ppc_decrypt_xts(b[1], b[1], ctx->key_dec, ctx->rounds, AES_BLOCK_SIZE, + (u8 *)&twk, NULL); + memcpy(b[0], b[2], tail); + memcpy(b[0] + tail, b[1] + tail, AES_BLOCK_SIZE - tail); + ppc_decrypt_xts(b[0], b[0], ctx->key_dec, ctx->rounds, AES_BLOCK_SIZE, + req->iv, NULL); + spe_end(); + + scatterwalk_map_and_copy(b[0], req->dst, offset, AES_BLOCK_SIZE + tail, 1); + + return 0; } /* @@ -388,9 +418,9 @@ static int ppc_xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, * This improves IPsec thoughput by another few percent. Additionally we assume * that AES context is always aligned to at least 8 bytes because it is created * with kmalloc() in the crypto infrastructure - * */ -static struct crypto_alg aes_algs[] = { { + +static struct crypto_alg aes_cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-ppc-spe", .cra_priority = 300, @@ -408,96 +438,84 @@ static struct crypto_alg aes_algs[] = { { .cia_decrypt = ppc_aes_decrypt } } -}, { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-ppc-spe", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ppc_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ppc_aes_setkey, - .encrypt = ppc_ecb_encrypt, - .decrypt = ppc_ecb_decrypt, - } - } -}, { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-ppc-spe", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ppc_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ppc_aes_setkey, - .encrypt = ppc_cbc_encrypt, - .decrypt = ppc_cbc_decrypt, - } - } -}, { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-ppc-spe", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct ppc_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ppc_aes_setkey, - .encrypt = ppc_ctr_crypt, - .decrypt = ppc_ctr_crypt, - } - } -}, { - .cra_name = "xts(aes)", - .cra_driver_name = "xts-ppc-spe", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ppc_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE * 2, - .max_keysize = AES_MAX_KEY_SIZE * 2, - .ivsize = AES_BLOCK_SIZE, - .setkey = ppc_xts_setkey, - .encrypt = ppc_xts_encrypt, - .decrypt = ppc_xts_decrypt, - } +}; + +static struct skcipher_alg aes_skcipher_algs[] = { + { + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-ppc-spe", + .base.cra_priority = 300, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct ppc_aes_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = ppc_aes_setkey_skcipher, + .encrypt = ppc_ecb_encrypt, + .decrypt = ppc_ecb_decrypt, + }, { + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-ppc-spe", + .base.cra_priority = 300, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct ppc_aes_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ppc_aes_setkey_skcipher, + .encrypt = ppc_cbc_encrypt, + .decrypt = ppc_cbc_decrypt, + }, { + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-ppc-spe", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct ppc_aes_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ppc_aes_setkey_skcipher, + .encrypt = ppc_ctr_crypt, + .decrypt = ppc_ctr_crypt, + .chunksize = AES_BLOCK_SIZE, + }, { + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "xts-ppc-spe", + .base.cra_priority = 300, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct ppc_xts_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = AES_MIN_KEY_SIZE * 2, + .max_keysize = AES_MAX_KEY_SIZE * 2, + .ivsize = AES_BLOCK_SIZE, + .setkey = ppc_xts_setkey, + .encrypt = ppc_xts_encrypt, + .decrypt = ppc_xts_decrypt, } -} }; +}; static int __init ppc_aes_mod_init(void) { - return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); + int err; + + err = crypto_register_alg(&aes_cipher_alg); + if (err) + return err; + + err = crypto_register_skciphers(aes_skcipher_algs, + ARRAY_SIZE(aes_skcipher_algs)); + if (err) + crypto_unregister_alg(&aes_cipher_alg); + return err; } static void __exit ppc_aes_mod_fini(void) { - crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); + crypto_unregister_alg(&aes_cipher_alg); + crypto_unregister_skciphers(aes_skcipher_algs, + ARRAY_SIZE(aes_skcipher_algs)); } module_init(ppc_aes_mod_init); diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 8561498e653c..d84d1417ddb6 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -152,9 +152,12 @@ void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr); /* Patch sites */ extern s32 patch__call_flush_count_cache; extern s32 patch__flush_count_cache_return; +extern s32 patch__flush_link_stack_return; +extern s32 patch__call_kvm_flush_link_stack; extern s32 patch__memset_nocache, patch__memcpy_nocache; extern long flush_count_cache; +extern long kvm_flush_link_stack; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h index fdd00939270b..bc4bd19b7fc2 100644 --- a/arch/powerpc/include/asm/local.h +++ b/arch/powerpc/include/asm/local.h @@ -17,7 +17,7 @@ typedef struct #define LOCAL_INIT(i) { (i) } -static __inline__ long local_read(local_t *l) +static __inline__ long local_read(const local_t *l) { return READ_ONCE(l->v); } diff --git a/arch/powerpc/include/asm/security_features.h b/arch/powerpc/include/asm/security_features.h index 759597bf0fd8..ccf44c135389 100644 --- a/arch/powerpc/include/asm/security_features.h +++ b/arch/powerpc/include/asm/security_features.h @@ -81,6 +81,9 @@ static inline bool security_ftr_enabled(unsigned long feature) // Software required to flush count cache on context switch #define SEC_FTR_FLUSH_COUNT_CACHE 0x0000000000000400ull +// Software required to flush link stack on context switch +#define SEC_FTR_FLUSH_LINK_STACK 0x0000000000001000ull + // Features enabled by default #define SEC_FTR_DEFAULT \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 6467bdab8d40..3fd3ef352e3f 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -537,6 +537,7 @@ flush_count_cache: /* Save LR into r9 */ mflr r9 + // Flush the link stack .rept 64 bl .+4 .endr @@ -546,6 +547,11 @@ flush_count_cache: .balign 32 /* Restore LR */ 1: mtlr r9 + + // If we're just flushing the link stack, return here +3: nop + patch_site 3b patch__flush_link_stack_return + li r9,0x7fff mtctr r9 diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 7cfcb294b11c..bd91dceb7010 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -24,6 +24,7 @@ enum count_cache_flush_type { COUNT_CACHE_FLUSH_HW = 0x4, }; static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; +static bool link_stack_flush_enabled; bool barrier_nospec_enabled; static bool no_nospec; @@ -212,11 +213,19 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c if (ccd) seq_buf_printf(&s, "Indirect branch cache disabled"); + + if (link_stack_flush_enabled) + seq_buf_printf(&s, ", Software link stack flush"); + } else if (count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) { seq_buf_printf(&s, "Mitigation: Software count cache flush"); if (count_cache_flush_type == COUNT_CACHE_FLUSH_HW) seq_buf_printf(&s, " (hardware accelerated)"); + + if (link_stack_flush_enabled) + seq_buf_printf(&s, ", Software link stack flush"); + } else if (btb_flush_enabled) { seq_buf_printf(&s, "Mitigation: Branch predictor state flush"); } else { @@ -377,18 +386,49 @@ static __init int stf_barrier_debugfs_init(void) device_initcall(stf_barrier_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +static void no_count_cache_flush(void) +{ + count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; + pr_info("count-cache-flush: software flush disabled.\n"); +} + static void toggle_count_cache_flush(bool enable) { - if (!enable || !security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { + if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE) && + !security_ftr_enabled(SEC_FTR_FLUSH_LINK_STACK)) + enable = false; + + if (!enable) { patch_instruction_site(&patch__call_flush_count_cache, PPC_INST_NOP); - count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; - pr_info("count-cache-flush: software flush disabled.\n"); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + patch_instruction_site(&patch__call_kvm_flush_link_stack, PPC_INST_NOP); +#endif + pr_info("link-stack-flush: software flush disabled.\n"); + link_stack_flush_enabled = false; + no_count_cache_flush(); return; } + // This enables the branch from _switch to flush_count_cache patch_branch_site(&patch__call_flush_count_cache, (u64)&flush_count_cache, BRANCH_SET_LINK); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + // This enables the branch from guest_exit_cont to kvm_flush_link_stack + patch_branch_site(&patch__call_kvm_flush_link_stack, + (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); +#endif + + pr_info("link-stack-flush: software flush enabled.\n"); + link_stack_flush_enabled = true; + + // If we just need to flush the link stack, patch an early return + if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { + patch_instruction_site(&patch__flush_link_stack_return, PPC_INST_BLR); + no_count_cache_flush(); + return; + } + if (!security_ftr_enabled(SEC_FTR_BCCTR_FLUSH_ASSIST)) { count_cache_flush_type = COUNT_CACHE_FLUSH_SW; pr_info("count-cache-flush: full software flush sequence enabled.\n"); @@ -407,11 +447,20 @@ void setup_count_cache_flush(void) if (no_spectrev2 || cpu_mitigations_off()) { if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) || security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED)) - pr_warn("Spectre v2 mitigations not under software control, can't disable\n"); + pr_warn("Spectre v2 mitigations not fully under software control, can't disable\n"); enable = false; } + /* + * There's no firmware feature flag/hypervisor bit to tell us we need to + * flush the link stack on context switch. So we set it here if we see + * either of the Spectre v2 mitigations that aim to protect userspace. + */ + if (security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED) || + security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) + security_ftr_set(SEC_FTR_FLUSH_LINK_STACK); + toggle_count_cache_flush(enable); } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308cd5..84827da01d45 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -338,7 +338,7 @@ static unsigned long vtime_delta(struct task_struct *tsk, return stime; } -void vtime_account_system(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); @@ -366,7 +366,7 @@ void vtime_account_system(struct task_struct *tsk) #endif } } -EXPORT_SYMBOL_GPL(vtime_account_system); +EXPORT_SYMBOL_GPL(vtime_account_kernel); void vtime_account_idle(struct task_struct *tsk) { @@ -395,7 +395,7 @@ static void vtime_flush_scaled(struct task_struct *tsk, /* * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. - * Assumes that vtime_account_system/idle() has been called + * Assumes that vtime_account_kernel/idle() has been called * recently (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 060a1acd7c6d..8834220036a5 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -6,6 +6,8 @@ #endif #define BSS_FIRST_SECTIONS *(.bss.prominit) +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 0 #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> @@ -18,22 +20,8 @@ ENTRY(_stext) PHDRS { - kernel PT_LOAD FLAGS(7); /* RWX */ - notes PT_NOTE FLAGS(0); - dummy PT_NOTE FLAGS(0); - - /* binutils < 2.18 has a bug that makes it misbehave when taking an - ELF file with all segments at load address 0 as input. This - happens when running "strip" on vmlinux, because of the AT() magic - in this linker script. People using GCC >= 4.2 won't run into - this problem, because the "build-id" support will put some data - into the "notes" segment (at a non-zero load address). - - To work around this, we force some data into both the "dummy" - segment and the kernel segment, so the dummy segment will get a - non-zero load address. It's not enough to always create the - "notes" segment, since if nothing gets assigned to it, its load - address will be zero. */ + text PT_LOAD FLAGS(7); /* RWX */ + note PT_NOTE FLAGS(0); } #ifdef CONFIG_PPC64 @@ -77,7 +65,7 @@ SECTIONS #else /* !CONFIG_PPC64 */ HEAD_TEXT #endif - } :kernel + } :text __head_end = .; @@ -126,7 +114,7 @@ SECTIONS __got2_end = .; #endif /* CONFIG_PPC32 */ - } :kernel + } :text . = ALIGN(ETEXT_ALIGN_SIZE); _etext = .; @@ -175,17 +163,6 @@ SECTIONS __stop__btb_flush_fixup = .; } #endif - EXCEPTION_TABLE(0) - - NOTES :kernel :notes - - /* The dummy segment contents for the bug workaround mentioned above - near PHDRS. */ - .dummy : AT(ADDR(.dummy) - LOAD_OFFSET) { - LONG(0) - LONG(0) - LONG(0) - } :kernel :dummy /* * Init sections discarded at runtime @@ -200,7 +177,7 @@ SECTIONS #ifdef CONFIG_PPC64 *(.tramp.ftrace.init); #endif - } :kernel + } :text /* .exit.text is discarded at runtime, not link time, * to deal with references from __bug_table diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index faebcbb8c4db..0496e66aaa56 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -11,6 +11,7 @@ */ #include <asm/ppc_asm.h> +#include <asm/code-patching-asm.h> #include <asm/kvm_asm.h> #include <asm/reg.h> #include <asm/mmu.h> @@ -1487,6 +1488,13 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 1: #endif /* CONFIG_KVM_XICS */ + /* + * Possibly flush the link stack here, before we do a blr in + * guest_exit_short_path. + */ +1: nop + patch_site 1b patch__call_kvm_flush_link_stack + /* If we came in through the P9 short path, go back out to C now */ lwz r0, STACK_SLOT_SHORT_PATH(r1) cmpwi r0, 0 @@ -1963,6 +1971,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtlr r0 blr +.balign 32 +.global kvm_flush_link_stack +kvm_flush_link_stack: + /* Save LR into r0 */ + mflr r0 + + /* Flush the link stack. On Power8 it's up to 32 entries in size. */ + .rept 32 + bl .+4 + .endr + + /* And on Power9 it's up to 64. */ +BEGIN_FTR_SECTION + .rept 32 + bl .+4 + .endr +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + + /* Restore LR */ + mtlr r0 + blr + kvmppc_guest_external: /* External interrupt, first check for host_ipi. If this is * set, we know the host wants us out so let's do it now diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index ca92e01d0bd1..48604625ab31 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -96,7 +96,7 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs) { return 0; } -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } +static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp) { } static inline u32 perf_get_misc_flags(struct pt_regs *regs) { return 0; @@ -127,7 +127,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw) static inline void power_pmu_bhrb_enable(struct perf_event *event) {} static inline void power_pmu_bhrb_disable(struct perf_event *event) {} static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {} -static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {} +static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {} static void pmao_restore_workaround(bool ebb) { } #endif /* CONFIG_PPC32 */ @@ -179,7 +179,7 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs) * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC, the * [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA, or the SDAR_VALID bit in SIER. */ -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) +static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp) { unsigned long mmcra = regs->dsisr; bool sdar_valid; @@ -204,8 +204,7 @@ static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid) *addrp = mfspr(SPRN_SDAR); - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) && - is_kernel_addr(mfspr(SPRN_SDAR))) + if (is_kernel_addr(mfspr(SPRN_SDAR)) && perf_allow_kernel(&event->attr) != 0) *addrp = 0; } @@ -444,7 +443,7 @@ static __u64 power_pmu_bhrb_to(u64 addr) } /* Processing BHRB entries */ -static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) +static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) { u64 val; u64 addr; @@ -472,8 +471,7 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) * exporting it to userspace (avoid exposure of regions * where we could have speculative execution) */ - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) && - is_kernel_addr(addr)) + if (is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0) continue; /* Branches are read most recent first (ie. mfbhrb 0 is @@ -2087,12 +2085,12 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) - perf_get_data_addr(regs, &data.addr); + perf_get_data_addr(event, regs, &data.addr); if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) { struct cpu_hw_events *cpuhw; cpuhw = this_cpu_ptr(&cpu_hw_events); - power_pmu_bhrb_read(cpuhw); + power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; } diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 2b87480f2837..eab8aa293743 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -19,7 +19,6 @@ struct dtl { struct dtl_entry *buf; - struct dentry *file; int cpu; int buf_entries; u64 last_idx; @@ -320,46 +319,28 @@ static const struct file_operations dtl_fops = { static struct dentry *dtl_dir; -static int dtl_setup_file(struct dtl *dtl) +static void dtl_setup_file(struct dtl *dtl) { char name[10]; sprintf(name, "cpu-%d", dtl->cpu); - dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops); - if (!dtl->file) - return -ENOMEM; - - return 0; + debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops); } static int dtl_init(void) { - struct dentry *event_mask_file, *buf_entries_file; - int rc, i; + int i; if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return -ENODEV; /* set up common debugfs structure */ - rc = -ENOMEM; dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); - if (!dtl_dir) { - printk(KERN_WARNING "%s: can't create dtl root dir\n", - __func__); - goto err; - } - event_mask_file = debugfs_create_x8("dtl_event_mask", 0600, - dtl_dir, &dtl_event_mask); - buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400, - dtl_dir, &dtl_buf_entries); - - if (!event_mask_file || !buf_entries_file) { - printk(KERN_WARNING "%s: can't create dtl files\n", __func__); - goto err_remove_dir; - } + debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); + debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); /* set up the per-cpu log structures */ for_each_possible_cpu(i) { @@ -367,16 +348,9 @@ static int dtl_init(void) spin_lock_init(&dtl->lock); dtl->cpu = i; - rc = dtl_setup_file(dtl); - if (rc) - goto err_remove_dir; + dtl_setup_file(dtl); } return 0; - -err_remove_dir: - debugfs_remove_recursive(dtl_dir); -err: - return rc; } machine_arch_initcall(pseries, dtl_init); diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index bcc1b67417a8..c40c62ec432e 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -129,7 +129,6 @@ static void probe_hcall_exit(void *ignored, unsigned long opcode, long retval, static int __init hcall_inst_init(void) { struct dentry *hcall_root; - struct dentry *hcall_file; char cpu_name_buf[CPU_NAME_BUF_SIZE]; int cpu; @@ -145,17 +144,12 @@ static int __init hcall_inst_init(void) } hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL); - if (!hcall_root) - return -ENOMEM; for_each_possible_cpu(cpu) { snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu); - hcall_file = debugfs_create_file(cpu_name_buf, 0444, - hcall_root, - per_cpu(hcall_stats, cpu), - &hcall_inst_seq_fops); - if (!hcall_file) - return -ENOMEM; + debugfs_create_file(cpu_name_buf, 0444, hcall_root, + per_cpu(hcall_stats, cpu), + &hcall_inst_seq_fops); } return 0; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f87a5c64e24d..f9f57c55655e 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1998,24 +1998,11 @@ static int __init vpa_debugfs_init(void) return 0; vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); - if (!vpa_dir) { - pr_warn("%s: can't create vpa root dir\n", __func__); - return -ENOMEM; - } /* set up the per-cpu vpa file*/ for_each_possible_cpu(i) { - struct dentry *d; - sprintf(name, "cpu-%ld", i); - - d = debugfs_create_file(name, 0400, vpa_dir, (void *)i, - &vpa_fops); - if (!d) { - pr_warn("%s: can't create per-cpu vpa file\n", - __func__); - return -ENOMEM; - } + debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops); } return 0; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 8eebbc8860bb..9f7f5dce2dc4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -26,14 +26,15 @@ config RISCV select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK - select GENERIC_STRNCPY_FROM_USER - select GENERIC_STRNLEN_USER + select GENERIC_STRNCPY_FROM_USER if MMU + select GENERIC_STRNLEN_USER if MMU select GENERIC_SMP_IDLE_THREAD select GENERIC_ATOMIC64 if !64BIT select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_SECCOMP_FILTER select HAVE_ASM_MODVERSIONS select HAVE_MEMBLOCK_NODE_MAP - select HAVE_DMA_CONTIGUOUS + select HAVE_DMA_CONTIGUOUS if MMU select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -50,6 +51,7 @@ config RISCV select PCI_DOMAINS_GENERIC if PCI select PCI_MSI if PCI select RISCV_TIMER + select UACCESS_MEMCPY if !MMU select GENERIC_IRQ_MULTI_HANDLER select GENERIC_ARCH_TOPOLOGY if SMP select ARCH_HAS_PTE_SPECIAL @@ -60,7 +62,7 @@ config RISCV select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select SPARSEMEM_STATIC if 32BIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU - select HAVE_ARCH_MMAP_RND_BITS + select HAVE_ARCH_MMAP_RND_BITS if MMU config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT @@ -72,8 +74,23 @@ config ARCH_MMAP_RND_BITS_MAX default 24 if 64BIT # SV39 based default 17 +# set if we run in machine mode, cleared if we run in supervisor mode +config RISCV_M_MODE + bool + default !MMU + +# set if we are running in S-mode and can use SBI calls +config RISCV_SBI + bool + depends on !RISCV_M_MODE + default y + config MMU - def_bool y + bool "MMU-based Paged Memory Management Support" + default y + help + Select if you want MMU-based virtualised addressing space + support by paged memory management. If unsure, say 'Y'. config ZONE_DMA32 bool @@ -92,6 +109,7 @@ config PA_BITS config PAGE_OFFSET hex default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB + default 0x80000000 if 64BIT && !MMU default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB @@ -135,7 +153,7 @@ config GENERIC_HWEIGHT def_bool y config FIX_EARLYCON_MEM - def_bool y + def_bool CONFIG_MMU config PGTABLE_LEVELS int @@ -160,17 +178,18 @@ config ARCH_RV32I select GENERIC_LIB_ASHRDI3 select GENERIC_LIB_LSHRDI3 select GENERIC_LIB_UCMPDI2 + select MMU config ARCH_RV64I bool "RV64I" select 64BIT - select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && GCC_VERSION >= 50000 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_DYNAMIC_FTRACE - select HAVE_DYNAMIC_FTRACE_WITH_REGS - select SWIOTLB + select HAVE_DYNAMIC_FTRACE if MMU + select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE + select SWIOTLB if MMU endchoice @@ -272,6 +291,19 @@ menu "Kernel features" source "kernel/Kconfig.hz" +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + endmenu menu "Boot options" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index f5e914210245..b9009a2fbaf5 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -83,13 +83,18 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ -all: Image.gz +ifeq ($(CONFIG_RISCV_M_MODE),y) +KBUILD_IMAGE := $(boot)/loader +else +KBUILD_IMAGE := $(boot)/Image.gz +endif +BOOT_TARGETS := Image Image.gz loader -Image: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ +all: $(notdir $(KBUILD_IMAGE)) -Image.%: Image +$(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ + @$(kecho) ' Kernel: $(boot)/$@ is ready' zinstall install: $(Q)$(MAKE) $(build)=$(boot) $@ diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index 0990a9fdbe5d..a474f98ce4fa 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -16,7 +16,7 @@ OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S -targets := Image +targets := Image loader $(obj)/Image: vmlinux FORCE $(call if_changed,objcopy) @@ -24,6 +24,23 @@ $(obj)/Image: vmlinux FORCE $(obj)/Image.gz: $(obj)/Image FORCE $(call if_changed,gzip) +loader.o: $(src)/loader.S $(obj)/Image + +$(obj)/loader: $(obj)/loader.o $(obj)/Image $(obj)/loader.lds FORCE + $(Q)$(LD) -T $(obj)/loader.lds -o $@ $(obj)/loader.o + +$(obj)/Image.bz2: $(obj)/Image FORCE + $(call if_changed,bzip2) + +$(obj)/Image.lz4: $(obj)/Image FORCE + $(call if_changed,lz4) + +$(obj)/Image.lzma: $(obj)/Image FORCE + $(call if_changed,lzma) + +$(obj)/Image.lzo: $(obj)/Image FORCE + $(call if_changed,lzo) + install: $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \ $(obj)/Image System.map "$(INSTALL_PATH)" diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index afa43c7ea369..70a1891e7cd0 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -162,6 +162,13 @@ clocks = <&prci PRCI_CLK_TLCLK>; status = "disabled"; }; + dma: dma@3000000 { + compatible = "sifive,fu540-c000-pdma"; + reg = <0x0 0x3000000 0x0 0x8000>; + interrupt-parent = <&plic0>; + interrupts = <23 24 25 26 27 28 29 30>; + #dma-cells = <1>; + }; uart1: serial@10011000 { compatible = "sifive,fu540-c000-uart", "sifive,uart0"; reg = <0x0 0x10011000 0x0 0x1000>; diff --git a/arch/riscv/boot/loader.S b/arch/riscv/boot/loader.S new file mode 100644 index 000000000000..dcf88cf44dc1 --- /dev/null +++ b/arch/riscv/boot/loader.S @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + + .align 4 + .section .payload, "ax", %progbits + .globl _start +_start: + .incbin "arch/riscv/boot/Image" + diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S new file mode 100644 index 000000000000..47a5003c2e28 --- /dev/null +++ b/arch/riscv/boot/loader.lds.S @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/page.h> + +OUTPUT_ARCH(riscv) +ENTRY(_start) + +SECTIONS +{ + . = PAGE_OFFSET; + + .payload : { + *(.payload) + . = ALIGN(8); + } +} diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig new file mode 100644 index 000000000000..cf74e179bf90 --- /dev/null +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -0,0 +1,78 @@ +# CONFIG_CPU_ISOLATION is not set +CONFIG_LOG_BUF_SHIFT=16 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 +CONFIG_BLK_DEV_INITRD=y +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_EXPERT=y +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_FHANDLE is not set +# CONFIG_BASE_FULL is not set +# CONFIG_EPOLL is not set +# CONFIG_SIGNALFD is not set +# CONFIG_TIMERFD is not set +# CONFIG_EVENTFD is not set +# CONFIG_AIO is not set +# CONFIG_IO_URING is not set +# CONFIG_ADVISE_SYSCALLS is not set +# CONFIG_MEMBARRIER is not set +# CONFIG_KALLSYMS is not set +# CONFIG_VM_EVENT_COUNTERS is not set +# CONFIG_COMPAT_BRK is not set +CONFIG_SLOB=y +# CONFIG_SLAB_MERGE_DEFAULT is not set +# CONFIG_MMU is not set +CONFIG_MAXPHYSMEM_2GB=y +CONFIG_SMP=y +CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x10000000,115200n8 console=ttyS0" +CONFIG_CMDLINE_FORCE=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_MSDOS_PARTITION is not set +# CONFIG_EFI_PARTITION is not set +# CONFIG_MQ_IOSCHED_DEADLINE is not set +# CONFIG_MQ_IOSCHED_KYBER is not set +CONFIG_BINFMT_FLAT=y +# CONFIG_COREDUMP is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +# CONFIG_FW_LOADER is not set +# CONFIG_ALLOW_DEV_COREDUMP is not set +CONFIG_VIRTIO_BLK=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_SERIO is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_LDISC_AUTOLOAD is not set +# CONFIG_DEVMEM is not set +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_NR_UARTS=1 +CONFIG_SERIAL_8250_RUNTIME_UARTS=1 +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_HW_RANDOM is not set +# CONFIG_HWMON is not set +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set +# CONFIG_VGA_CONSOLE is not set +# CONFIG_HID is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_VIRTIO_MMIO=y +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y +CONFIG_SIFIVE_PLIC=y +# CONFIG_VALIDATE_FS_PARSER is not set +CONFIG_EXT2_FS=y +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY_USER is not set +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_LSM="[]" +CONFIG_PRINTK_TIME=y +# CONFIG_SCHED_DEBUG is not set +# CONFIG_RCU_TRACE is not set +# CONFIG_FTRACE is not set +# CONFIG_RUNTIME_TESTING_MENU is not set diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index c9fecd120d18..dd62b691c443 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_RISCV_PROTOTYPES_H +#define _ASM_RISCV_PROTOTYPES_H #include <linux/ftrace.h> #include <asm-generic/asm-prototypes.h> diff --git a/arch/riscv/include/asm/cache.h b/arch/riscv/include/asm/cache.h index bfd523e8f0b2..9b58b104559e 100644 --- a/arch/riscv/include/asm/cache.h +++ b/arch/riscv/include/asm/cache.h @@ -11,4 +11,12 @@ #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) +/* + * RISC-V requires the stack pointer to be 16-byte aligned, so ensure that + * the flat loader aligns it accordingly. + */ +#ifndef CONFIG_MMU +#define ARCH_SLAB_MINALIGN 16 +#endif + #endif /* _ASM_RISCV_CACHE_H */ diff --git a/arch/riscv/include/asm/clint.h b/arch/riscv/include/asm/clint.h new file mode 100644 index 000000000000..6eaa2eedd694 --- /dev/null +++ b/arch/riscv/include/asm/clint.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_CLINT_H +#define _ASM_RISCV_CLINT_H 1 + +#include <linux/io.h> +#include <linux/smp.h> + +#ifdef CONFIG_RISCV_M_MODE +extern u32 __iomem *clint_ipi_base; + +void clint_init_boot_cpu(void); + +static inline void clint_send_ipi_single(unsigned long hartid) +{ + writel(1, clint_ipi_base + hartid); +} + +static inline void clint_send_ipi_mask(const struct cpumask *hartid_mask) +{ + int hartid; + + for_each_cpu(hartid, hartid_mask) + clint_send_ipi_single(hartid); +} + +static inline void clint_clear_ipi(unsigned long hartid) +{ + writel(0, clint_ipi_base + hartid); +} +#else /* CONFIG_RISCV_M_MODE */ +#define clint_init_boot_cpu() do { } while (0) + +/* stubs to for code is only reachable under IS_ENABLED(CONFIG_RISCV_M_MODE): */ +void clint_send_ipi_single(unsigned long hartid); +void clint_send_ipi_mask(const struct cpumask *hartid_mask); +void clint_clear_ipi(unsigned long hartid); +#endif /* CONFIG_RISCV_M_MODE */ + +#endif /* _ASM_RISCV_CLINT_H */ diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index a18923fa23c8..0a62d2d68455 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -11,8 +11,11 @@ /* Status register flags */ #define SR_SIE _AC(0x00000002, UL) /* Supervisor Interrupt Enable */ +#define SR_MIE _AC(0x00000008, UL) /* Machine Interrupt Enable */ #define SR_SPIE _AC(0x00000020, UL) /* Previous Supervisor IE */ +#define SR_MPIE _AC(0x00000080, UL) /* Previous Machine IE */ #define SR_SPP _AC(0x00000100, UL) /* Previously Supervisor */ +#define SR_MPP _AC(0x00001800, UL) /* Previously Machine */ #define SR_SUM _AC(0x00040000, UL) /* Supervisor User Memory Access */ #define SR_FS _AC(0x00006000, UL) /* Floating-point Status */ @@ -44,9 +47,10 @@ #define SATP_MODE SATP_MODE_39 #endif -/* SCAUSE */ -#define SCAUSE_IRQ_FLAG (_AC(1, UL) << (__riscv_xlen - 1)) +/* Exception cause high bit - is an interrupt if set */ +#define CAUSE_IRQ_FLAG (_AC(1, UL) << (__riscv_xlen - 1)) +/* Interrupt causes (minus the high bit) */ #define IRQ_U_SOFT 0 #define IRQ_S_SOFT 1 #define IRQ_M_SOFT 3 @@ -57,6 +61,7 @@ #define IRQ_S_EXT 9 #define IRQ_M_EXT 11 +/* Exception causes */ #define EXC_INST_MISALIGNED 0 #define EXC_INST_ACCESS 1 #define EXC_BREAKPOINT 3 @@ -67,14 +72,14 @@ #define EXC_LOAD_PAGE_FAULT 13 #define EXC_STORE_PAGE_FAULT 15 -/* SIE (Interrupt Enable) and SIP (Interrupt Pending) flags */ -#define SIE_SSIE (_AC(0x1, UL) << IRQ_S_SOFT) -#define SIE_STIE (_AC(0x1, UL) << IRQ_S_TIMER) -#define SIE_SEIE (_AC(0x1, UL) << IRQ_S_EXT) - +/* symbolic CSR names: */ #define CSR_CYCLE 0xc00 #define CSR_TIME 0xc01 #define CSR_INSTRET 0xc02 +#define CSR_CYCLEH 0xc80 +#define CSR_TIMEH 0xc81 +#define CSR_INSTRETH 0xc82 + #define CSR_SSTATUS 0x100 #define CSR_SIE 0x104 #define CSR_STVEC 0x105 @@ -85,9 +90,58 @@ #define CSR_STVAL 0x143 #define CSR_SIP 0x144 #define CSR_SATP 0x180 -#define CSR_CYCLEH 0xc80 -#define CSR_TIMEH 0xc81 -#define CSR_INSTRETH 0xc82 + +#define CSR_MSTATUS 0x300 +#define CSR_MISA 0x301 +#define CSR_MIE 0x304 +#define CSR_MTVEC 0x305 +#define CSR_MSCRATCH 0x340 +#define CSR_MEPC 0x341 +#define CSR_MCAUSE 0x342 +#define CSR_MTVAL 0x343 +#define CSR_MIP 0x344 +#define CSR_MHARTID 0xf14 + +#ifdef CONFIG_RISCV_M_MODE +# define CSR_STATUS CSR_MSTATUS +# define CSR_IE CSR_MIE +# define CSR_TVEC CSR_MTVEC +# define CSR_SCRATCH CSR_MSCRATCH +# define CSR_EPC CSR_MEPC +# define CSR_CAUSE CSR_MCAUSE +# define CSR_TVAL CSR_MTVAL +# define CSR_IP CSR_MIP + +# define SR_IE SR_MIE +# define SR_PIE SR_MPIE +# define SR_PP SR_MPP + +# define IRQ_SOFT IRQ_M_SOFT +# define IRQ_TIMER IRQ_M_TIMER +# define IRQ_EXT IRQ_M_EXT +#else /* CONFIG_RISCV_M_MODE */ +# define CSR_STATUS CSR_SSTATUS +# define CSR_IE CSR_SIE +# define CSR_TVEC CSR_STVEC +# define CSR_SCRATCH CSR_SSCRATCH +# define CSR_EPC CSR_SEPC +# define CSR_CAUSE CSR_SCAUSE +# define CSR_TVAL CSR_STVAL +# define CSR_IP CSR_SIP + +# define SR_IE SR_SIE +# define SR_PIE SR_SPIE +# define SR_PP SR_SPP + +# define IRQ_SOFT IRQ_S_SOFT +# define IRQ_TIMER IRQ_S_TIMER +# define IRQ_EXT IRQ_S_EXT +#endif /* CONFIG_RISCV_M_MODE */ + +/* IE/IP (Supervisor/Machine Interrupt Enable/Pending) flags */ +#define IE_SIE (_AC(0x1, UL) << IRQ_SOFT) +#define IE_TIE (_AC(0x1, UL) << IRQ_TIMER) +#define IE_EIE (_AC(0x1, UL) << IRQ_EXT) #ifndef __ASSEMBLY__ diff --git a/arch/riscv/include/asm/current.h b/arch/riscv/include/asm/current.h index 44dcf7fc15ee..dd973efe5d7c 100644 --- a/arch/riscv/include/asm/current.h +++ b/arch/riscv/include/asm/current.h @@ -7,8 +7,8 @@ */ -#ifndef __ASM_CURRENT_H -#define __ASM_CURRENT_H +#ifndef _ASM_RISCV_CURRENT_H +#define _ASM_RISCV_CURRENT_H #include <linux/bug.h> #include <linux/compiler.h> @@ -34,4 +34,4 @@ static __always_inline struct task_struct *get_current(void) #endif /* __ASSEMBLY__ */ -#endif /* __ASM_CURRENT_H */ +#endif /* _ASM_RISCV_CURRENT_H */ diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index ef04084bf0de..d83a4efd052b 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -56,16 +56,16 @@ extern unsigned long elf_hwcap; */ #define ELF_PLATFORM (NULL) +#ifdef CONFIG_MMU #define ARCH_DLINFO \ do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (elf_addr_t)current->mm->context.vdso); \ } while (0) - - #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); +#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_ELF_H */ diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 161f28d04a07..42d2c42f3cc9 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -11,6 +11,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#ifdef CONFIG_MMU /* * Here we define all the compile-time 'special' virtual addresses. * The point is to have a constant address at compile time, but to @@ -42,4 +43,5 @@ extern void __set_fixmap(enum fixed_addresses idx, #include <asm-generic/fixmap.h> +#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_FIXMAP_H */ diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index c6dcc5291f97..ace8a6e2d11d 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2017 Andes Technology Corporation */ +#ifndef _ASM_RISCV_FTRACE_H +#define _ASM_RISCV_FTRACE_H + /* * The graph frame test is not possible if CONFIG_FRAME_POINTER is not enabled. * Check arch/riscv/kernel/mcount.S for detail. @@ -64,3 +67,5 @@ do { \ */ #define MCOUNT_INSN_SIZE 8 #endif + +#endif /* _ASM_RISCV_FTRACE_H */ diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h index 4ad6409c4647..fdfaf7f3df7c 100644 --- a/arch/riscv/include/asm/futex.h +++ b/arch/riscv/include/asm/futex.h @@ -4,14 +4,20 @@ * Copyright (c) 2018 Jim Wilson (jimw@sifive.com) */ -#ifndef _ASM_FUTEX_H -#define _ASM_FUTEX_H +#ifndef _ASM_RISCV_FUTEX_H +#define _ASM_RISCV_FUTEX_H #include <linux/futex.h> #include <linux/uaccess.h> #include <linux/errno.h> #include <asm/asm.h> +/* We don't even really need the extable code, but for now keep it simple */ +#ifndef CONFIG_MMU +#define __enable_user_access() do { } while (0) +#define __disable_user_access() do { } while (0) +#endif + #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ { \ uintptr_t tmp; \ @@ -112,4 +118,4 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return ret; } -#endif /* _ASM_FUTEX_H */ +#endif /* _ASM_RISCV_FUTEX_H */ diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 7ecb7c6a57b1..1bb0cd04aec3 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -5,8 +5,8 @@ * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2017 SiFive */ -#ifndef __ASM_HWCAP_H -#define __ASM_HWCAP_H +#ifndef _ASM_RISCV_HWCAP_H +#define _ASM_RISCV_HWCAP_H #include <uapi/asm/hwcap.h> @@ -23,4 +23,5 @@ enum { extern unsigned long elf_hwcap; #endif -#endif + +#endif /* _ASM_RISCV_HWCAP_H */ diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h index 344db5244547..7b0f92ba0acc 100644 --- a/arch/riscv/include/asm/image.h +++ b/arch/riscv/include/asm/image.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_IMAGE_H -#define __ASM_IMAGE_H +#ifndef _ASM_RISCV_IMAGE_H +#define _ASM_RISCV_IMAGE_H #define RISCV_IMAGE_MAGIC "RISCV\0\0\0" #define RISCV_IMAGE_MAGIC2 "RSC\x05" @@ -62,4 +62,4 @@ struct riscv_image_header { u32 res4; }; #endif /* __ASSEMBLY__ */ -#endif /* __ASM_IMAGE_H */ +#endif /* _ASM_RISCV_IMAGE_H */ diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 3ba4d93721d3..0f477206a4ed 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -15,158 +15,19 @@ #include <asm/mmiowb.h> #include <asm/pgtable.h> -extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); - -/* - * The RISC-V ISA doesn't yet specify how to query or modify PMAs, so we can't - * change the properties of memory regions. This should be fixed by the - * upcoming platform spec. - */ -#define ioremap_nocache(addr, size) ioremap((addr), (size)) -#define ioremap_wc(addr, size) ioremap((addr), (size)) -#define ioremap_wt(addr, size) ioremap((addr), (size)) - -extern void iounmap(volatile void __iomem *addr); - -/* Generic IO read/write. These perform native-endian accesses. */ -#define __raw_writeb __raw_writeb -static inline void __raw_writeb(u8 val, volatile void __iomem *addr) -{ - asm volatile("sb %0, 0(%1)" : : "r" (val), "r" (addr)); -} - -#define __raw_writew __raw_writew -static inline void __raw_writew(u16 val, volatile void __iomem *addr) -{ - asm volatile("sh %0, 0(%1)" : : "r" (val), "r" (addr)); -} - -#define __raw_writel __raw_writel -static inline void __raw_writel(u32 val, volatile void __iomem *addr) -{ - asm volatile("sw %0, 0(%1)" : : "r" (val), "r" (addr)); -} - -#ifdef CONFIG_64BIT -#define __raw_writeq __raw_writeq -static inline void __raw_writeq(u64 val, volatile void __iomem *addr) -{ - asm volatile("sd %0, 0(%1)" : : "r" (val), "r" (addr)); -} -#endif - -#define __raw_readb __raw_readb -static inline u8 __raw_readb(const volatile void __iomem *addr) -{ - u8 val; - - asm volatile("lb %0, 0(%1)" : "=r" (val) : "r" (addr)); - return val; -} - -#define __raw_readw __raw_readw -static inline u16 __raw_readw(const volatile void __iomem *addr) -{ - u16 val; - - asm volatile("lh %0, 0(%1)" : "=r" (val) : "r" (addr)); - return val; -} - -#define __raw_readl __raw_readl -static inline u32 __raw_readl(const volatile void __iomem *addr) -{ - u32 val; - - asm volatile("lw %0, 0(%1)" : "=r" (val) : "r" (addr)); - return val; -} - -#ifdef CONFIG_64BIT -#define __raw_readq __raw_readq -static inline u64 __raw_readq(const volatile void __iomem *addr) -{ - u64 val; - - asm volatile("ld %0, 0(%1)" : "=r" (val) : "r" (addr)); - return val; -} -#endif - /* - * Unordered I/O memory access primitives. These are even more relaxed than - * the relaxed versions, as they don't even order accesses between successive - * operations to the I/O regions. + * MMIO access functions are separated out to break dependency cycles + * when using {read,write}* fns in low-level headers */ -#define readb_cpu(c) ({ u8 __r = __raw_readb(c); __r; }) -#define readw_cpu(c) ({ u16 __r = le16_to_cpu((__force __le16)__raw_readw(c)); __r; }) -#define readl_cpu(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) - -#define writeb_cpu(v,c) ((void)__raw_writeb((v),(c))) -#define writew_cpu(v,c) ((void)__raw_writew((__force u16)cpu_to_le16(v),(c))) -#define writel_cpu(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) - -#ifdef CONFIG_64BIT -#define readq_cpu(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) -#define writeq_cpu(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) -#endif - -/* - * Relaxed I/O memory access primitives. These follow the Device memory - * ordering rules but do not guarantee any ordering relative to Normal memory - * accesses. These are defined to order the indicated access (either a read or - * write) with all other I/O memory accesses. Since the platform specification - * defines that all I/O regions are strongly ordered on channel 2, no explicit - * fences are required to enforce this ordering. - */ -/* FIXME: These are now the same as asm-generic */ -#define __io_rbr() do {} while (0) -#define __io_rar() do {} while (0) -#define __io_rbw() do {} while (0) -#define __io_raw() do {} while (0) - -#define readb_relaxed(c) ({ u8 __v; __io_rbr(); __v = readb_cpu(c); __io_rar(); __v; }) -#define readw_relaxed(c) ({ u16 __v; __io_rbr(); __v = readw_cpu(c); __io_rar(); __v; }) -#define readl_relaxed(c) ({ u32 __v; __io_rbr(); __v = readl_cpu(c); __io_rar(); __v; }) - -#define writeb_relaxed(v,c) ({ __io_rbw(); writeb_cpu((v),(c)); __io_raw(); }) -#define writew_relaxed(v,c) ({ __io_rbw(); writew_cpu((v),(c)); __io_raw(); }) -#define writel_relaxed(v,c) ({ __io_rbw(); writel_cpu((v),(c)); __io_raw(); }) - -#ifdef CONFIG_64BIT -#define readq_relaxed(c) ({ u64 __v; __io_rbr(); __v = readq_cpu(c); __io_rar(); __v; }) -#define writeq_relaxed(v,c) ({ __io_rbw(); writeq_cpu((v),(c)); __io_raw(); }) -#endif - -/* - * I/O memory access primitives. Reads are ordered relative to any - * following Normal memory access. Writes are ordered relative to any prior - * Normal memory access. The memory barriers here are necessary as RISC-V - * doesn't define any ordering between the memory space and the I/O space. - */ -#define __io_br() do {} while (0) -#define __io_ar(v) __asm__ __volatile__ ("fence i,r" : : : "memory"); -#define __io_bw() __asm__ __volatile__ ("fence w,o" : : : "memory"); -#define __io_aw() mmiowb_set_pending() - -#define readb(c) ({ u8 __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; }) -#define readw(c) ({ u16 __v; __io_br(); __v = readw_cpu(c); __io_ar(__v); __v; }) -#define readl(c) ({ u32 __v; __io_br(); __v = readl_cpu(c); __io_ar(__v); __v; }) - -#define writeb(v,c) ({ __io_bw(); writeb_cpu((v),(c)); __io_aw(); }) -#define writew(v,c) ({ __io_bw(); writew_cpu((v),(c)); __io_aw(); }) -#define writel(v,c) ({ __io_bw(); writel_cpu((v),(c)); __io_aw(); }) - -#ifdef CONFIG_64BIT -#define readq(c) ({ u64 __v; __io_br(); __v = readq_cpu(c); __io_ar(__v); __v; }) -#define writeq(v,c) ({ __io_bw(); writeq_cpu((v),(c)); __io_aw(); }) -#endif +#include <asm/mmio.h> /* * I/O port access constants. */ +#ifdef CONFIG_MMU #define IO_SPACE_LIMIT (PCI_IO_SIZE - 1) #define PCI_IOBASE ((void __iomem *)PCI_IO_START) +#endif /* CONFIG_MMU */ /* * Emulation routines for the port-mapped IO space used by some PCI drivers. diff --git a/arch/riscv/include/asm/irqflags.h b/arch/riscv/include/asm/irqflags.h index e70f647ce3b7..08d4d6a5b7e9 100644 --- a/arch/riscv/include/asm/irqflags.h +++ b/arch/riscv/include/asm/irqflags.h @@ -13,31 +13,31 @@ /* read interrupt enabled status */ static inline unsigned long arch_local_save_flags(void) { - return csr_read(CSR_SSTATUS); + return csr_read(CSR_STATUS); } /* unconditionally enable interrupts */ static inline void arch_local_irq_enable(void) { - csr_set(CSR_SSTATUS, SR_SIE); + csr_set(CSR_STATUS, SR_IE); } /* unconditionally disable interrupts */ static inline void arch_local_irq_disable(void) { - csr_clear(CSR_SSTATUS, SR_SIE); + csr_clear(CSR_STATUS, SR_IE); } /* get status and disable interrupts */ static inline unsigned long arch_local_irq_save(void) { - return csr_read_clear(CSR_SSTATUS, SR_SIE); + return csr_read_clear(CSR_STATUS, SR_IE); } /* test flags */ static inline int arch_irqs_disabled_flags(unsigned long flags) { - return !(flags & SR_SIE); + return !(flags & SR_IE); } /* test hardware interrupt enable bit */ @@ -49,7 +49,7 @@ static inline int arch_irqs_disabled(void) /* set interrupt enabled status */ static inline void arch_local_irq_restore(unsigned long flags) { - csr_set(CSR_SSTATUS, flags & SR_SIE); + csr_set(CSR_STATUS, flags & SR_IE); } #endif /* _ASM_RISCV_IRQFLAGS_H */ diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h index 96e30ef637e8..56a98ea30731 100644 --- a/arch/riscv/include/asm/kprobes.h +++ b/arch/riscv/include/asm/kprobes.h @@ -6,9 +6,9 @@ * Copyright (C) 2017 SiFive */ -#ifndef _RISCV_KPROBES_H -#define _RISCV_KPROBES_H +#ifndef _ASM_RISCV_KPROBES_H +#define _ASM_RISCV_KPROBES_H #include <asm-generic/kprobes.h> -#endif /* _RISCV_KPROBES_H */ +#endif /* _ASM_RISCV_KPROBES_H */ diff --git a/arch/riscv/include/asm/mmio.h b/arch/riscv/include/asm/mmio.h new file mode 100644 index 000000000000..a297a835e402 --- /dev/null +++ b/arch/riscv/include/asm/mmio.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * {read,write}{b,w,l,q} based on arch/arm64/include/asm/io.h + * which was based on arch/arm/include/io.h + * + * Copyright (C) 1996-2000 Russell King + * Copyright (C) 2012 ARM Ltd. + * Copyright (C) 2014 Regents of the University of California + */ + +#ifndef _ASM_RISCV_MMIO_H +#define _ASM_RISCV_MMIO_H + +#include <linux/types.h> +#include <asm/mmiowb.h> + +#ifdef CONFIG_MMU +void __iomem *ioremap(phys_addr_t offset, unsigned long size); + +/* + * The RISC-V ISA doesn't yet specify how to query or modify PMAs, so we can't + * change the properties of memory regions. This should be fixed by the + * upcoming platform spec. + */ +#define ioremap_nocache(addr, size) ioremap((addr), (size)) +#define ioremap_wc(addr, size) ioremap((addr), (size)) +#define ioremap_wt(addr, size) ioremap((addr), (size)) + +void iounmap(volatile void __iomem *addr); +#else +#define pgprot_noncached(x) (x) +#endif /* CONFIG_MMU */ + +/* Generic IO read/write. These perform native-endian accesses. */ +#define __raw_writeb __raw_writeb +static inline void __raw_writeb(u8 val, volatile void __iomem *addr) +{ + asm volatile("sb %0, 0(%1)" : : "r" (val), "r" (addr)); +} + +#define __raw_writew __raw_writew +static inline void __raw_writew(u16 val, volatile void __iomem *addr) +{ + asm volatile("sh %0, 0(%1)" : : "r" (val), "r" (addr)); +} + +#define __raw_writel __raw_writel +static inline void __raw_writel(u32 val, volatile void __iomem *addr) +{ + asm volatile("sw %0, 0(%1)" : : "r" (val), "r" (addr)); +} + +#ifdef CONFIG_64BIT +#define __raw_writeq __raw_writeq +static inline void __raw_writeq(u64 val, volatile void __iomem *addr) +{ + asm volatile("sd %0, 0(%1)" : : "r" (val), "r" (addr)); +} +#endif + +#define __raw_readb __raw_readb +static inline u8 __raw_readb(const volatile void __iomem *addr) +{ + u8 val; + + asm volatile("lb %0, 0(%1)" : "=r" (val) : "r" (addr)); + return val; +} + +#define __raw_readw __raw_readw +static inline u16 __raw_readw(const volatile void __iomem *addr) +{ + u16 val; + + asm volatile("lh %0, 0(%1)" : "=r" (val) : "r" (addr)); + return val; +} + +#define __raw_readl __raw_readl +static inline u32 __raw_readl(const volatile void __iomem *addr) +{ + u32 val; + + asm volatile("lw %0, 0(%1)" : "=r" (val) : "r" (addr)); + return val; +} + +#ifdef CONFIG_64BIT +#define __raw_readq __raw_readq +static inline u64 __raw_readq(const volatile void __iomem *addr) +{ + u64 val; + + asm volatile("ld %0, 0(%1)" : "=r" (val) : "r" (addr)); + return val; +} +#endif + +/* + * Unordered I/O memory access primitives. These are even more relaxed than + * the relaxed versions, as they don't even order accesses between successive + * operations to the I/O regions. + */ +#define readb_cpu(c) ({ u8 __r = __raw_readb(c); __r; }) +#define readw_cpu(c) ({ u16 __r = le16_to_cpu((__force __le16)__raw_readw(c)); __r; }) +#define readl_cpu(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) + +#define writeb_cpu(v, c) ((void)__raw_writeb((v), (c))) +#define writew_cpu(v, c) ((void)__raw_writew((__force u16)cpu_to_le16(v), (c))) +#define writel_cpu(v, c) ((void)__raw_writel((__force u32)cpu_to_le32(v), (c))) + +#ifdef CONFIG_64BIT +#define readq_cpu(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) +#define writeq_cpu(v, c) ((void)__raw_writeq((__force u64)cpu_to_le64(v), (c))) +#endif + +/* + * Relaxed I/O memory access primitives. These follow the Device memory + * ordering rules but do not guarantee any ordering relative to Normal memory + * accesses. These are defined to order the indicated access (either a read or + * write) with all other I/O memory accesses. Since the platform specification + * defines that all I/O regions are strongly ordered on channel 2, no explicit + * fences are required to enforce this ordering. + */ +/* FIXME: These are now the same as asm-generic */ +#define __io_rbr() do {} while (0) +#define __io_rar() do {} while (0) +#define __io_rbw() do {} while (0) +#define __io_raw() do {} while (0) + +#define readb_relaxed(c) ({ u8 __v; __io_rbr(); __v = readb_cpu(c); __io_rar(); __v; }) +#define readw_relaxed(c) ({ u16 __v; __io_rbr(); __v = readw_cpu(c); __io_rar(); __v; }) +#define readl_relaxed(c) ({ u32 __v; __io_rbr(); __v = readl_cpu(c); __io_rar(); __v; }) + +#define writeb_relaxed(v, c) ({ __io_rbw(); writeb_cpu((v), (c)); __io_raw(); }) +#define writew_relaxed(v, c) ({ __io_rbw(); writew_cpu((v), (c)); __io_raw(); }) +#define writel_relaxed(v, c) ({ __io_rbw(); writel_cpu((v), (c)); __io_raw(); }) + +#ifdef CONFIG_64BIT +#define readq_relaxed(c) ({ u64 __v; __io_rbr(); __v = readq_cpu(c); __io_rar(); __v; }) +#define writeq_relaxed(v, c) ({ __io_rbw(); writeq_cpu((v), (c)); __io_raw(); }) +#endif + +/* + * I/O memory access primitives. Reads are ordered relative to any + * following Normal memory access. Writes are ordered relative to any prior + * Normal memory access. The memory barriers here are necessary as RISC-V + * doesn't define any ordering between the memory space and the I/O space. + */ +#define __io_br() do {} while (0) +#define __io_ar(v) __asm__ __volatile__ ("fence i,r" : : : "memory") +#define __io_bw() __asm__ __volatile__ ("fence w,o" : : : "memory") +#define __io_aw() mmiowb_set_pending() + +#define readb(c) ({ u8 __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; }) +#define readw(c) ({ u16 __v; __io_br(); __v = readw_cpu(c); __io_ar(__v); __v; }) +#define readl(c) ({ u32 __v; __io_br(); __v = readl_cpu(c); __io_ar(__v); __v; }) + +#define writeb(v, c) ({ __io_bw(); writeb_cpu((v), (c)); __io_aw(); }) +#define writew(v, c) ({ __io_bw(); writew_cpu((v), (c)); __io_aw(); }) +#define writel(v, c) ({ __io_bw(); writel_cpu((v), (c)); __io_aw(); }) + +#ifdef CONFIG_64BIT +#define readq(c) ({ u64 __v; __io_br(); __v = readq_cpu(c); __io_ar(__v); __v; }) +#define writeq(v, c) ({ __io_bw(); writeq_cpu((v), (c)); __io_aw(); }) +#endif + +#endif /* _ASM_RISCV_MMIO_H */ diff --git a/arch/riscv/include/asm/mmiowb.h b/arch/riscv/include/asm/mmiowb.h index 5d7e3a2b4e3b..bb4091ff4a21 100644 --- a/arch/riscv/include/asm/mmiowb.h +++ b/arch/riscv/include/asm/mmiowb.h @@ -11,4 +11,4 @@ #include <asm-generic/mmiowb.h> -#endif /* ASM_RISCV_MMIOWB_H */ +#endif /* _ASM_RISCV_MMIOWB_H */ diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 151476fb58cb..967eacb01ab5 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -10,6 +10,9 @@ #ifndef __ASSEMBLY__ typedef struct { +#ifndef CONFIG_MMU + unsigned long end_brk; +#endif void *vdso; #ifdef CONFIG_SMP /* A local icache flush is needed before user execution can resume. */ diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 3db261c4810f..ac699246ae7e 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -88,8 +88,14 @@ typedef struct page *pgtable_t; #define PTE_FMT "%08lx" #endif +#ifdef CONFIG_MMU extern unsigned long va_pa_offset; extern unsigned long pfn_base; +#define ARCH_PFN_OFFSET (pfn_base) +#else +#define va_pa_offset 0 +#define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) +#endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; @@ -112,11 +118,9 @@ extern unsigned long min_low_pfn; #ifdef CONFIG_FLATMEM #define pfn_valid(pfn) \ - (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr)) + (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr)) #endif -#define ARCH_PFN_OFFSET (pfn_base) - #endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h index 5ac8daa1cc36..1c473a1bd986 100644 --- a/arch/riscv/include/asm/pci.h +++ b/arch/riscv/include/asm/pci.h @@ -3,8 +3,8 @@ * Copyright (C) 2016 SiFive */ -#ifndef __ASM_RISCV_PCI_H -#define __ASM_RISCV_PCI_H +#ifndef _ASM_RISCV_PCI_H +#define _ASM_RISCV_PCI_H #include <linux/types.h> #include <linux/slab.h> @@ -34,4 +34,4 @@ static inline int pci_proc_domain(struct pci_bus *bus) } #endif /* CONFIG_PCI */ -#endif /* __ASM_PCI_H */ +#endif /* _ASM_RISCV_PCI_H */ diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index d59ea92285ec..3f601ee8233f 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <asm/tlb.h> +#ifdef CONFIG_MMU #include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ static inline void pmd_populate_kernel(struct mm_struct *mm, @@ -81,5 +82,6 @@ do { \ pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) +#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_PGALLOC_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index d3221017194d..beb5f0865e39 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -25,6 +25,7 @@ #include <asm/pgtable-32.h> #endif /* CONFIG_64BIT */ +#ifdef CONFIG_MMU /* Number of entries in the page global directory */ #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) /* Number of entries in the page table */ @@ -32,7 +33,6 @@ /* Number of PGD entries that a user-mode program can use */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 /* Page protection bits */ #define _PAGE_BASE (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER) @@ -84,42 +84,6 @@ extern pgd_t swapper_pg_dir[]; #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (PAGE_OFFSET - 1) -#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) -#define PCI_IO_SIZE SZ_16M - -/* - * Roughly size the vmemmap space to be large enough to fit enough - * struct pages to map half the virtual address space. Then - * position vmemmap directly below the VMALLOC region. - */ -#define VMEMMAP_SHIFT \ - (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) -#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) -#define VMEMMAP_END (VMALLOC_START - 1) -#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) - -#define vmemmap ((struct page *)VMEMMAP_START) - -#define PCI_IO_END VMEMMAP_START -#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) -#define FIXADDR_TOP PCI_IO_START - -#ifdef CONFIG_64BIT -#define FIXADDR_SIZE PMD_SIZE -#else -#define FIXADDR_SIZE PGDIR_SIZE -#endif -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - -/* - * ZERO_PAGE is a global shared page that is always zero, - * used for zero-mapped memory areas, etc. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - static inline int pmd_present(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PRESENT | _PAGE_PROT_NONE)); @@ -430,11 +394,34 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) /* FIXME */ +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) +#define VMALLOC_END (PAGE_OFFSET - 1) +#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) -extern void *dtb_early_va; -extern void setup_bootmem(void); -extern void paging_init(void); +/* + * Roughly size the vmemmap space to be large enough to fit enough + * struct pages to map half the virtual address space. Then + * position vmemmap directly below the VMALLOC region. + */ +#define VMEMMAP_SHIFT \ + (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) +#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) +#define VMEMMAP_END (VMALLOC_START - 1) +#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) + +#define vmemmap ((struct page *)VMEMMAP_START) + +#define PCI_IO_SIZE SZ_16M +#define PCI_IO_END VMEMMAP_START +#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) + +#define FIXADDR_TOP PCI_IO_START +#ifdef CONFIG_64BIT +#define FIXADDR_SIZE PMD_SIZE +#else +#define FIXADDR_SIZE PGDIR_SIZE +#endif +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) /* * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. @@ -446,6 +433,31 @@ extern void paging_init(void); #define TASK_SIZE FIXADDR_START #endif +#else /* CONFIG_MMU */ + +#define PAGE_KERNEL __pgprot(0) +#define swapper_pg_dir NULL +#define VMALLOC_START 0 + +#define TASK_SIZE 0xffffffffUL + +#endif /* !CONFIG_MMU */ + +#define kern_addr_valid(addr) (1) /* FIXME */ + +extern void *dtb_early_va; +void setup_bootmem(void); +void paging_init(void); + +#define FIRST_USER_ADDRESS 0 + +/* + * ZERO_PAGE is a global shared page that is always zero, + * used for zero-mapped memory areas, etc. + */ +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + #include <asm-generic/pgtable.h> #endif /* !__ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index f539149d04c2..3ddb798264f1 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -42,7 +42,7 @@ struct thread_struct { ((struct pt_regs *)(task_stack_page(tsk) + THREAD_SIZE \ - ALIGN(sizeof(struct pt_regs), STACK_ALIGN))) -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->sepc) +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp) diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index d48d1e13973c..ee49f80c9533 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -12,7 +12,7 @@ #ifndef __ASSEMBLY__ struct pt_regs { - unsigned long sepc; + unsigned long epc; unsigned long ra; unsigned long sp; unsigned long gp; @@ -44,10 +44,10 @@ struct pt_regs { unsigned long t4; unsigned long t5; unsigned long t6; - /* Supervisor CSRs */ - unsigned long sstatus; - unsigned long sbadaddr; - unsigned long scause; + /* Supervisor/Machine CSRs */ + unsigned long status; + unsigned long badaddr; + unsigned long cause; /* a0 value before the syscall */ unsigned long orig_a0; }; @@ -58,18 +58,18 @@ struct pt_regs { #define REG_FMT "%08lx" #endif -#define user_mode(regs) (((regs)->sstatus & SR_SPP) == 0) +#define user_mode(regs) (((regs)->status & SR_PP) == 0) /* Helpers for working with the instruction pointer */ static inline unsigned long instruction_pointer(struct pt_regs *regs) { - return regs->sepc; + return regs->epc; } static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { - regs->sepc = val; + regs->epc = val; } #define profile_pc(regs) instruction_pointer(regs) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 21134b3ef404..2570c1e683d3 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -8,6 +8,7 @@ #include <linux/types.h> +#ifdef CONFIG_RISCV_SBI #define SBI_SET_TIMER 0 #define SBI_CONSOLE_PUTCHAR 1 #define SBI_CONSOLE_GETCHAR 2 @@ -93,5 +94,11 @@ static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, { SBI_CALL_4(SBI_REMOTE_SFENCE_VMA_ASID, hart_mask, start, size, asid); } - -#endif +#else /* CONFIG_RISCV_SBI */ +/* stubs for code that is only reachable under IS_ENABLED(CONFIG_RISCV_SBI): */ +void sbi_set_timer(uint64_t stime_value); +void sbi_clear_ipi(void); +void sbi_send_ipi(const unsigned long *hart_mask); +void sbi_remote_fence_i(const unsigned long *hart_mask); +#endif /* CONFIG_RISCV_SBI */ +#endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h new file mode 100644 index 000000000000..bf7744ee3b3d --- /dev/null +++ b/arch/riscv/include/asm/seccomp.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include <asm/unistd.h> + +#include <asm-generic/seccomp.h> + +#endif /* _ASM_SECCOMP_H */ diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h index b58ba2d9ed6e..45a7018a8118 100644 --- a/arch/riscv/include/asm/sparsemem.h +++ b/arch/riscv/include/asm/sparsemem.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_SPARSEMEM_H -#define __ASM_SPARSEMEM_H +#ifndef _ASM_RISCV_SPARSEMEM_H +#define _ASM_RISCV_SPARSEMEM_H #ifdef CONFIG_SPARSEMEM #define MAX_PHYSMEM_BITS CONFIG_PA_BITS #define SECTION_SIZE_BITS 27 #endif /* CONFIG_SPARSEMEM */ -#endif /* __ASM_SPARSEMEM_H */ +#endif /* _ASM_RISCV_SPARSEMEM_H */ diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h index 888cbf8e7111..f398e7638dd6 100644 --- a/arch/riscv/include/asm/spinlock_types.h +++ b/arch/riscv/include/asm/spinlock_types.h @@ -22,4 +22,4 @@ typedef struct { #define __ARCH_RW_LOCK_UNLOCKED { 0 } -#endif +#endif /* _ASM_RISCV_SPINLOCK_TYPES_H */ diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index ee4f0ac62c9d..407bcc96a710 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -17,19 +17,19 @@ extern void __fstate_restore(struct task_struct *restore_from); static inline void __fstate_clean(struct pt_regs *regs) { - regs->sstatus = (regs->sstatus & ~SR_FS) | SR_FS_CLEAN; + regs->status = (regs->status & ~SR_FS) | SR_FS_CLEAN; } static inline void fstate_off(struct task_struct *task, struct pt_regs *regs) { - regs->sstatus = (regs->sstatus & ~SR_FS) | SR_FS_OFF; + regs->status = (regs->status & ~SR_FS) | SR_FS_OFF; } static inline void fstate_save(struct task_struct *task, struct pt_regs *regs) { - if ((regs->sstatus & SR_FS) == SR_FS_DIRTY) { + if ((regs->status & SR_FS) == SR_FS_DIRTY) { __fstate_save(task); __fstate_clean(regs); } @@ -38,7 +38,7 @@ static inline void fstate_save(struct task_struct *task, static inline void fstate_restore(struct task_struct *task, struct pt_regs *regs) { - if ((regs->sstatus & SR_FS) != SR_FS_OFF) { + if ((regs->status & SR_FS) != SR_FS_OFF) { __fstate_restore(task); __fstate_clean(regs); } @@ -50,7 +50,7 @@ static inline void __switch_to_aux(struct task_struct *prev, struct pt_regs *regs; regs = task_pt_regs(prev); - if (unlikely(regs->sstatus & SR_SD)) + if (unlikely(regs->status & SR_SD)) fstate_save(prev, regs); fstate_restore(next, task_pt_regs(next)); } diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 905372d7eeb8..1dd12a0cbb2b 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -75,6 +75,7 @@ struct thread_info { #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing */ +#define TIF_SECCOMP 8 /* syscall secure computing */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -82,11 +83,13 @@ struct thread_info { #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_WORK_MASK \ (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED) #define _TIF_SYSCALL_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \ + _TIF_SECCOMP) #endif /* _ASM_RISCV_THREAD_INFO_H */ diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h index c7ef131b9e4c..bad2a7c2cda5 100644 --- a/arch/riscv/include/asm/timex.h +++ b/arch/riscv/include/asm/timex.h @@ -7,12 +7,25 @@ #define _ASM_RISCV_TIMEX_H #include <asm/csr.h> +#include <asm/mmio.h> typedef unsigned long cycles_t; +extern u64 __iomem *riscv_time_val; +extern u64 __iomem *riscv_time_cmp; + +#ifdef CONFIG_64BIT +#define mmio_get_cycles() readq_relaxed(riscv_time_val) +#else +#define mmio_get_cycles() readl_relaxed(riscv_time_val) +#define mmio_get_cycles_hi() readl_relaxed(((u32 *)riscv_time_val) + 1) +#endif + static inline cycles_t get_cycles(void) { - return csr_read(CSR_TIME); + if (IS_ENABLED(CONFIG_RISCV_SBI)) + return csr_read(CSR_TIME); + return mmio_get_cycles(); } #define get_cycles get_cycles @@ -24,7 +37,9 @@ static inline u64 get_cycles64(void) #else /* CONFIG_64BIT */ static inline u32 get_cycles_hi(void) { - return csr_read(CSR_TIMEH); + if (IS_ENABLED(CONFIG_RISCV_SBI)) + return csr_read(CSR_TIMEH); + return mmio_get_cycles_hi(); } static inline u64 get_cycles64(void) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index f02188a5b0f4..394cfbccdcd9 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -10,6 +10,7 @@ #include <linux/mm_types.h> #include <asm/smp.h> +#ifdef CONFIG_MMU static inline void local_flush_tlb_all(void) { __asm__ __volatile__ ("sfence.vma" : : : "memory"); @@ -20,14 +21,19 @@ static inline void local_flush_tlb_page(unsigned long addr) { __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"); } +#else /* CONFIG_MMU */ +#define local_flush_tlb_all() do { } while (0) +#define local_flush_tlb_page(addr) do { } while (0) +#endif /* CONFIG_MMU */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_MMU) void flush_tlb_all(void); void flush_tlb_mm(struct mm_struct *mm); void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -#else /* CONFIG_SMP */ +#else /* CONFIG_SMP && CONFIG_MMU */ + #define flush_tlb_all() local_flush_tlb_all() #define flush_tlb_page(vma, addr) local_flush_tlb_page(addr) @@ -38,7 +44,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, } #define flush_tlb_mm(mm) flush_tlb_all() -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP || !CONFIG_MMU */ /* Flush a range of kernel pages */ static inline void flush_tlb_kernel_range(unsigned long start, diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index e076437cfafe..f462a183a9c2 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -11,6 +11,7 @@ /* * User space memory access functions */ +#ifdef CONFIG_MMU #include <linux/errno.h> #include <linux/compiler.h> #include <linux/thread_info.h> @@ -475,4 +476,7 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) __ret; \ }) +#else /* CONFIG_MMU */ +#include <asm-generic/uaccess.h> +#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_UACCESS_H */ diff --git a/arch/riscv/include/uapi/asm/elf.h b/arch/riscv/include/uapi/asm/elf.h index 644a00ce6e2e..d696d6610231 100644 --- a/arch/riscv/include/uapi/asm/elf.h +++ b/arch/riscv/include/uapi/asm/elf.h @@ -9,8 +9,8 @@ * (at your option) any later version. */ -#ifndef _UAPI_ASM_ELF_H -#define _UAPI_ASM_ELF_H +#ifndef _UAPI_ASM_RISCV_ELF_H +#define _UAPI_ASM_RISCV_ELF_H #include <asm/ptrace.h> @@ -95,4 +95,4 @@ typedef union __riscv_fp_state elf_fpregset_t; #define R_RISCV_32_PCREL 57 -#endif /* _UAPI_ASM_ELF_H */ +#endif /* _UAPI_ASM_RISCV_ELF_H */ diff --git a/arch/riscv/include/uapi/asm/hwcap.h b/arch/riscv/include/uapi/asm/hwcap.h index 4e7646077056..dee98ee28318 100644 --- a/arch/riscv/include/uapi/asm/hwcap.h +++ b/arch/riscv/include/uapi/asm/hwcap.h @@ -5,8 +5,8 @@ * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2017 SiFive */ -#ifndef __UAPI_ASM_HWCAP_H -#define __UAPI_ASM_HWCAP_H +#ifndef _UAPI_ASM_RISCV_HWCAP_H +#define _UAPI_ASM_RISCV_HWCAP_H /* * Linux saves the floating-point registers according to the ISA Linux is @@ -22,4 +22,4 @@ #define COMPAT_HWCAP_ISA_D (1 << ('D' - 'A')) #define COMPAT_HWCAP_ISA_C (1 << ('C' - 'A')) -#endif +#endif /* _UAPI_ASM_RISCV_HWCAP_H */ diff --git a/arch/riscv/include/uapi/asm/ucontext.h b/arch/riscv/include/uapi/asm/ucontext.h index 411dd7b52ed6..44eb993950e5 100644 --- a/arch/riscv/include/uapi/asm/ucontext.h +++ b/arch/riscv/include/uapi/asm/ucontext.h @@ -5,8 +5,8 @@ * * This file was copied from arch/arm64/include/uapi/asm/ucontext.h */ -#ifndef _UAPI__ASM_UCONTEXT_H -#define _UAPI__ASM_UCONTEXT_H +#ifndef _UAPI_ASM_RISCV_UCONTEXT_H +#define _UAPI_ASM_RISCV_UCONTEXT_H #include <linux/types.h> @@ -31,4 +31,4 @@ struct ucontext { struct sigcontext uc_mcontext; }; -#endif /* _UAPI__ASM_UCONTEXT_H */ +#endif /* _UAPI_ASM_RISCV_UCONTEXT_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 696020ff72db..f40205cb9a22 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -25,10 +25,10 @@ obj-y += time.o obj-y += traps.o obj-y += riscv_ksyms.o obj-y += stacktrace.o -obj-y += vdso.o obj-y += cacheinfo.o -obj-y += vdso/ +obj-$(CONFIG_MMU) += vdso.o vdso/ +obj-$(CONFIG_RISCV_M_MODE) += clint.o obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o @@ -41,5 +41,6 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += mcount-dyn.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o +obj-$(CONFIG_RISCV_SBI) += sbi.o clean: diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 9f5628c38ac9..07cb9c10de4e 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -71,7 +71,7 @@ void asm_offsets(void) OFFSET(TASK_THREAD_FCSR, task_struct, thread.fstate.fcsr); DEFINE(PT_SIZE, sizeof(struct pt_regs)); - OFFSET(PT_SEPC, pt_regs, sepc); + OFFSET(PT_EPC, pt_regs, epc); OFFSET(PT_RA, pt_regs, ra); OFFSET(PT_FP, pt_regs, s0); OFFSET(PT_S0, pt_regs, s0); @@ -105,9 +105,9 @@ void asm_offsets(void) OFFSET(PT_T6, pt_regs, t6); OFFSET(PT_GP, pt_regs, gp); OFFSET(PT_ORIG_A0, pt_regs, orig_a0); - OFFSET(PT_SSTATUS, pt_regs, sstatus); - OFFSET(PT_SBADADDR, pt_regs, sbadaddr); - OFFSET(PT_SCAUSE, pt_regs, scause); + OFFSET(PT_STATUS, pt_regs, status); + OFFSET(PT_BADADDR, pt_regs, badaddr); + OFFSET(PT_CAUSE, pt_regs, cause); /* * THREAD_{F,X}* might be larger than a S-type offset can handle, but diff --git a/arch/riscv/kernel/clint.c b/arch/riscv/kernel/clint.c new file mode 100644 index 000000000000..3647980d14c3 --- /dev/null +++ b/arch/riscv/kernel/clint.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Christoph Hellwig. + */ + +#include <linux/io.h> +#include <linux/of_address.h> +#include <linux/types.h> +#include <asm/clint.h> +#include <asm/csr.h> +#include <asm/timex.h> +#include <asm/smp.h> + +/* + * This is the layout used by the SiFive clint, which is also shared by the qemu + * virt platform, and the Kendryte KD210 at least. + */ +#define CLINT_IPI_OFF 0 +#define CLINT_TIME_CMP_OFF 0x4000 +#define CLINT_TIME_VAL_OFF 0xbff8 + +u32 __iomem *clint_ipi_base; + +void clint_init_boot_cpu(void) +{ + struct device_node *np; + void __iomem *base; + + np = of_find_compatible_node(NULL, NULL, "riscv,clint0"); + if (!np) { + panic("clint not found"); + return; + } + + base = of_iomap(np, 0); + if (!base) + panic("could not map CLINT"); + + clint_ipi_base = base + CLINT_IPI_OFF; + riscv_time_cmp = base + CLINT_TIME_CMP_OFF; + riscv_time_val = base + CLINT_TIME_VAL_OFF; + + clint_clear_ipi(boot_cpu_hartid); +} diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 7da3c6a93abd..40a3c442ac5f 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -46,51 +46,12 @@ int riscv_of_processor_hartid(struct device_node *node) #ifdef CONFIG_PROC_FS -static void print_isa(struct seq_file *f, const char *orig_isa) +static void print_isa(struct seq_file *f, const char *isa) { - static const char *ext = "mafdcsu"; - const char *isa = orig_isa; - const char *e; - - /* - * Linux doesn't support rv32e or rv128i, and we only support booting - * kernels on harts with the same ISA that the kernel is compiled for. - */ -#if defined(CONFIG_32BIT) - if (strncmp(isa, "rv32i", 5) != 0) - return; -#elif defined(CONFIG_64BIT) - if (strncmp(isa, "rv64i", 5) != 0) - return; -#endif - - /* Print the base ISA, as we already know it's legal. */ + /* Print the entire ISA as it is */ seq_puts(f, "isa\t\t: "); - seq_write(f, isa, 5); - isa += 5; - - /* - * Check the rest of the ISA string for valid extensions, printing those - * we find. RISC-V ISA strings define an order, so we only print the - * extension bits when they're in order. Hide the supervisor (S) - * extension from userspace as it's not accessible from there. - */ - for (e = ext; *e != '\0'; ++e) { - if (isa[0] == e[0]) { - if (isa[0] != 's') - seq_write(f, isa, 1); - - isa++; - } - } + seq_write(f, isa, strlen(isa)); seq_puts(f, "\n"); - - /* - * If we were given an unsupported ISA in the device tree then print - * a bit of info describing what went wrong. - */ - if (isa[0] != '\0') - pr_info("unsupported ISA \"%s\" in device tree\n", orig_isa); } static void print_mmu(struct seq_file *f, const char *mmu_type) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 8ca479831142..a1349ca64669 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -26,14 +26,14 @@ /* * If coming from userspace, preserve the user thread pointer and load - * the kernel thread pointer. If we came from the kernel, sscratch - * will contain 0, and we should continue on the current TP. + * the kernel thread pointer. If we came from the kernel, the scratch + * register will contain 0, and we should continue on the current TP. */ - csrrw tp, CSR_SSCRATCH, tp + csrrw tp, CSR_SCRATCH, tp bnez tp, _save_context _restore_kernel_tpsp: - csrr tp, CSR_SSCRATCH + csrr tp, CSR_SCRATCH REG_S sp, TASK_TI_KERNEL_SP(tp) _save_context: REG_S sp, TASK_TI_USER_SP(tp) @@ -79,16 +79,16 @@ _save_context: li t0, SR_SUM | SR_FS REG_L s0, TASK_TI_USER_SP(tp) - csrrc s1, CSR_SSTATUS, t0 - csrr s2, CSR_SEPC - csrr s3, CSR_STVAL - csrr s4, CSR_SCAUSE - csrr s5, CSR_SSCRATCH + csrrc s1, CSR_STATUS, t0 + csrr s2, CSR_EPC + csrr s3, CSR_TVAL + csrr s4, CSR_CAUSE + csrr s5, CSR_SCRATCH REG_S s0, PT_SP(sp) - REG_S s1, PT_SSTATUS(sp) - REG_S s2, PT_SEPC(sp) - REG_S s3, PT_SBADADDR(sp) - REG_S s4, PT_SCAUSE(sp) + REG_S s1, PT_STATUS(sp) + REG_S s2, PT_EPC(sp) + REG_S s3, PT_BADADDR(sp) + REG_S s4, PT_CAUSE(sp) REG_S s5, PT_TP(sp) .endm @@ -97,7 +97,7 @@ _save_context: * registers from the stack. */ .macro RESTORE_ALL - REG_L a0, PT_SSTATUS(sp) + REG_L a0, PT_STATUS(sp) /* * The current load reservation is effectively part of the processor's * state, in the sense that load reservations cannot be shared between @@ -115,11 +115,11 @@ _save_context: * completes, implementations are allowed to expand reservations to be * arbitrarily large. */ - REG_L a2, PT_SEPC(sp) - REG_SC x0, a2, PT_SEPC(sp) + REG_L a2, PT_EPC(sp) + REG_SC x0, a2, PT_EPC(sp) - csrw CSR_SSTATUS, a0 - csrw CSR_SEPC, a2 + csrw CSR_STATUS, a0 + csrw CSR_EPC, a2 REG_L x1, PT_RA(sp) REG_L x3, PT_GP(sp) @@ -163,10 +163,10 @@ ENTRY(handle_exception) SAVE_ALL /* - * Set sscratch register to 0, so that if a recursive exception + * Set the scratch register to 0, so that if a recursive exception * occurs, the exception vector knows it came from the kernel */ - csrw CSR_SSCRATCH, x0 + csrw CSR_SCRATCH, x0 /* Load the global pointer */ .option push @@ -185,11 +185,13 @@ ENTRY(handle_exception) move a0, sp /* pt_regs */ tail do_IRQ 1: - /* Exceptions run with interrupts enabled or disabled - depending on the state of sstatus.SR_SPIE */ - andi t0, s1, SR_SPIE + /* + * Exceptions run with interrupts enabled or disabled depending on the + * state of SR_PIE in m/sstatus. + */ + andi t0, s1, SR_PIE beqz t0, 1f - csrs CSR_SSTATUS, SR_SIE + csrs CSR_STATUS, SR_IE 1: /* Handle syscalls */ @@ -217,7 +219,7 @@ handle_syscall: * scall instruction on sret */ addi s2, s2, 0x4 - REG_S s2, PT_SEPC(sp) + REG_S s2, PT_EPC(sp) /* Trace syscalls, but only if requested by the user. */ REG_L t0, TASK_TI_FLAGS(tp) andi t0, t0, _TIF_SYSCALL_WORK @@ -226,8 +228,25 @@ check_syscall_nr: /* Check to make sure we don't jump to a bogus syscall number. */ li t0, __NR_syscalls la s0, sys_ni_syscall - /* Syscall number held in a7 */ - bgeu a7, t0, 1f + /* + * The tracer can change syscall number to valid/invalid value. + * We use syscall_set_nr helper in syscall_trace_enter thus we + * cannot trust the current value in a7 and have to reload from + * the current task pt_regs. + */ + REG_L a7, PT_A7(sp) + /* + * Syscall number held in a7. + * If syscall number is above allowed value, redirect to ni_syscall. + */ + bge a7, t0, 1f + /* + * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1. + * If yes, we pretend it was executed. + */ + li t1, -1 + beq a7, t1, ret_from_syscall_rejected + /* Call syscall */ la s0, sys_call_table slli t0, a7, RISCV_LGPTR add s0, s0, t0 @@ -238,15 +257,27 @@ check_syscall_nr: ret_from_syscall: /* Set user a0 to kernel a0 */ REG_S a0, PT_A0(sp) + /* + * We didn't execute the actual syscall. + * Seccomp already set return value for the current task pt_regs. + * (If it was configured with SECCOMP_RET_ERRNO/TRACE) + */ +ret_from_syscall_rejected: /* Trace syscalls, but only if requested by the user. */ REG_L t0, TASK_TI_FLAGS(tp) andi t0, t0, _TIF_SYSCALL_WORK bnez t0, handle_syscall_trace_exit ret_from_exception: - REG_L s0, PT_SSTATUS(sp) - csrc CSR_SSTATUS, SR_SIE + REG_L s0, PT_STATUS(sp) + csrc CSR_STATUS, SR_IE +#ifdef CONFIG_RISCV_M_MODE + /* the MPP value is too large to be used as an immediate arg for addi */ + li t0, SR_MPP + and s0, s0, t0 +#else andi s0, s0, SR_SPP +#endif bnez s0, resume_kernel resume_userspace: @@ -260,14 +291,18 @@ resume_userspace: REG_S s0, TASK_TI_KERNEL_SP(tp) /* - * Save TP into sscratch, so we can find the kernel data structures - * again. + * Save TP into the scratch register , so we can find the kernel data + * structures again. */ - csrw CSR_SSCRATCH, tp + csrw CSR_SCRATCH, tp restore_all: RESTORE_ALL +#ifdef CONFIG_RISCV_M_MODE + mret +#else sret +#endif #if IS_ENABLED(CONFIG_PREEMPT) resume_kernel: @@ -287,7 +322,7 @@ work_pending: bnez s1, work_resched work_notifysig: /* Handle pending signals and notify-resume requests */ - csrs CSR_SSTATUS, SR_SIE /* Enable interrupts for do_notify_resume() */ + csrs CSR_STATUS, SR_IE /* Enable interrupts for do_notify_resume() */ move a0, sp /* pt_regs */ move a1, s0 /* current_thread_info->flags */ tail do_notify_resume @@ -386,6 +421,10 @@ ENTRY(__switch_to) ret ENDPROC(__switch_to) +#ifndef CONFIG_MMU +#define do_page_fault do_trap_unknown +#endif + .section ".rodata" /* Exception vector table */ ENTRY(excp_vect_table) @@ -407,3 +446,10 @@ ENTRY(excp_vect_table) RISCV_PTR do_page_fault /* store page fault */ excp_vect_table_end: END(excp_vect_table) + +#ifndef CONFIG_MMU +ENTRY(__user_rt_sigreturn) + li a7, __NR_rt_sigreturn + scall +END(__user_rt_sigreturn) +#endif diff --git a/arch/riscv/kernel/fpu.S b/arch/riscv/kernel/fpu.S index 631d31540660..dd2205473de7 100644 --- a/arch/riscv/kernel/fpu.S +++ b/arch/riscv/kernel/fpu.S @@ -23,7 +23,7 @@ ENTRY(__fstate_save) li a2, TASK_THREAD_F0 add a0, a0, a2 li t1, SR_FS - csrs CSR_SSTATUS, t1 + csrs CSR_STATUS, t1 frcsr t0 fsd f0, TASK_THREAD_F0_F0(a0) fsd f1, TASK_THREAD_F1_F0(a0) @@ -58,7 +58,7 @@ ENTRY(__fstate_save) fsd f30, TASK_THREAD_F30_F0(a0) fsd f31, TASK_THREAD_F31_F0(a0) sw t0, TASK_THREAD_FCSR_F0(a0) - csrc CSR_SSTATUS, t1 + csrc CSR_STATUS, t1 ret ENDPROC(__fstate_save) @@ -67,7 +67,7 @@ ENTRY(__fstate_restore) add a0, a0, a2 li t1, SR_FS lw t0, TASK_THREAD_FCSR_F0(a0) - csrs CSR_SSTATUS, t1 + csrs CSR_STATUS, t1 fld f0, TASK_THREAD_F0_F0(a0) fld f1, TASK_THREAD_F1_F0(a0) fld f2, TASK_THREAD_F2_F0(a0) @@ -101,6 +101,6 @@ ENTRY(__fstate_restore) fld f30, TASK_THREAD_F30_F0(a0) fld f31, TASK_THREAD_F31_F0(a0) fscsr t0 - csrc CSR_SSTATUS, t1 + csrc CSR_STATUS, t1 ret ENDPROC(__fstate_restore) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 72f89b7590dd..84a6f0a4b120 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -11,6 +11,7 @@ #include <asm/thread_info.h> #include <asm/page.h> #include <asm/csr.h> +#include <asm/hwcap.h> #include <asm/image.h> __INIT @@ -47,8 +48,22 @@ ENTRY(_start) .global _start_kernel _start_kernel: /* Mask all interrupts */ - csrw CSR_SIE, zero - csrw CSR_SIP, zero + csrw CSR_IE, zero + csrw CSR_IP, zero + +#ifdef CONFIG_RISCV_M_MODE + /* flush the instruction cache */ + fence.i + + /* Reset all registers except ra, a0, a1 */ + call reset_regs + + /* + * The hartid in a0 is expected later on, and we have no firmware + * to hand it to us. + */ + csrr a0, CSR_MHARTID +#endif /* CONFIG_RISCV_M_MODE */ /* Load the global pointer */ .option push @@ -61,7 +76,7 @@ _start_kernel: * floating point in kernel space */ li t0, SR_FS - csrc CSR_SSTATUS, t0 + csrc CSR_STATUS, t0 #ifdef CONFIG_SMP li t0, CONFIG_NR_CPUS @@ -94,8 +109,10 @@ clear_bss_done: la sp, init_thread_union + THREAD_SIZE mv a0, s1 call setup_vm +#ifdef CONFIG_MMU la a0, early_pg_dir call relocate +#endif /* CONFIG_MMU */ /* Restore C environment */ la tp, init_task @@ -106,6 +123,7 @@ clear_bss_done: call parse_dtb tail start_kernel +#ifdef CONFIG_MMU relocate: /* Relocate return address */ li a1, PAGE_OFFSET @@ -116,7 +134,7 @@ relocate: /* Point stvec to virtual address of intruction after satp write */ la a2, 1f add a2, a2, a1 - csrw CSR_STVEC, a2 + csrw CSR_TVEC, a2 /* Compute satp for kernel page tables, but don't load it yet */ srl a2, a0, PAGE_SHIFT @@ -138,7 +156,7 @@ relocate: 1: /* Set trap vector to spin forever to help debug */ la a0, .Lsecondary_park - csrw CSR_STVEC, a0 + csrw CSR_TVEC, a0 /* Reload the global pointer */ .option push @@ -156,12 +174,13 @@ relocate: sfence.vma ret +#endif /* CONFIG_MMU */ .Lsecondary_start: #ifdef CONFIG_SMP /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park - csrw CSR_STVEC, a3 + csrw CSR_TVEC, a3 slli a3, a0, LGREG la a1, __cpu_up_stack_pointer @@ -181,9 +200,11 @@ relocate: beqz tp, .Lwait_for_cpu_up fence +#ifdef CONFIG_MMU /* Enable virtual memory and relocate to virtual address */ la a0, swapper_pg_dir call relocate +#endif tail smp_callin #endif @@ -195,6 +216,85 @@ relocate: j .Lsecondary_park END(_start) +#ifdef CONFIG_RISCV_M_MODE +ENTRY(reset_regs) + li sp, 0 + li gp, 0 + li tp, 0 + li t0, 0 + li t1, 0 + li t2, 0 + li s0, 0 + li s1, 0 + li a2, 0 + li a3, 0 + li a4, 0 + li a5, 0 + li a6, 0 + li a7, 0 + li s2, 0 + li s3, 0 + li s4, 0 + li s5, 0 + li s6, 0 + li s7, 0 + li s8, 0 + li s9, 0 + li s10, 0 + li s11, 0 + li t3, 0 + li t4, 0 + li t5, 0 + li t6, 0 + csrw sscratch, 0 + +#ifdef CONFIG_FPU + csrr t0, CSR_MISA + andi t0, t0, (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D) + bnez t0, .Lreset_regs_done + + li t1, SR_FS + csrs CSR_STATUS, t1 + fmv.s.x f0, zero + fmv.s.x f1, zero + fmv.s.x f2, zero + fmv.s.x f3, zero + fmv.s.x f4, zero + fmv.s.x f5, zero + fmv.s.x f6, zero + fmv.s.x f7, zero + fmv.s.x f8, zero + fmv.s.x f9, zero + fmv.s.x f10, zero + fmv.s.x f11, zero + fmv.s.x f12, zero + fmv.s.x f13, zero + fmv.s.x f14, zero + fmv.s.x f15, zero + fmv.s.x f16, zero + fmv.s.x f17, zero + fmv.s.x f18, zero + fmv.s.x f19, zero + fmv.s.x f20, zero + fmv.s.x f21, zero + fmv.s.x f22, zero + fmv.s.x f23, zero + fmv.s.x f24, zero + fmv.s.x f25, zero + fmv.s.x f26, zero + fmv.s.x f27, zero + fmv.s.x f28, zero + fmv.s.x f29, zero + fmv.s.x f30, zero + fmv.s.x f31, zero + csrw fcsr, 0 + /* note that the caller must clear SR_FS */ +#endif /* CONFIG_FPU */ +.Lreset_regs_done: + ret +END(reset_regs) +#endif /* CONFIG_RISCV_M_MODE */ + __PAGE_ALIGNED_BSS /* Empty zero page */ .balign PAGE_SIZE diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c index fffac6ddb0e0..3f07a91d5afb 100644 --- a/arch/riscv/kernel/irq.c +++ b/arch/riscv/kernel/irq.c @@ -11,13 +11,6 @@ #include <linux/seq_file.h> #include <asm/smp.h> -/* - * Possible interrupt causes: - */ -#define INTERRUPT_CAUSE_SOFTWARE IRQ_S_SOFT -#define INTERRUPT_CAUSE_TIMER IRQ_S_TIMER -#define INTERRUPT_CAUSE_EXTERNAL IRQ_S_EXT - int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_stats(p, prec); @@ -29,12 +22,12 @@ asmlinkage __visible void __irq_entry do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); irq_enter(); - switch (regs->scause & ~SCAUSE_IRQ_FLAG) { - case INTERRUPT_CAUSE_TIMER: + switch (regs->cause & ~CAUSE_IRQ_FLAG) { + case IRQ_TIMER: riscv_timer_interrupt(); break; #ifdef CONFIG_SMP - case INTERRUPT_CAUSE_SOFTWARE: + case IRQ_SOFT: /* * We only use software interrupts to pass IPIs, so if a non-SMP * system gets one, then we don't know what to do. @@ -42,11 +35,11 @@ asmlinkage __visible void __irq_entry do_IRQ(struct pt_regs *regs) riscv_software_interrupt(); break; #endif - case INTERRUPT_CAUSE_EXTERNAL: + case IRQ_EXT: handle_arch_irq(regs); break; default: - pr_alert("unexpected interrupt cause 0x%lx", regs->scause); + pr_alert("unexpected interrupt cause 0x%lx", regs->cause); BUG(); } irq_exit(); diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 70bb94ae61c5..b7401858d872 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -315,8 +315,8 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, /* Ignore unresolved weak symbol */ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) continue; - pr_warning("%s: Unknown symbol %s\n", - me->name, strtab + sym->st_name); + pr_warn("%s: Unknown symbol %s\n", + me->name, strtab + sym->st_name); return -ENOENT; } diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index 8d2804f05cf9..cf190197a22f 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -67,7 +67,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, return; fp = regs->s0; - perf_callchain_store(entry, regs->sepc); + perf_callchain_store(entry, regs->epc); fp = user_backtrace(entry, fp, regs->ra); while (fp && !(fp & 0x3) && entry->nr < entry->max_stack) diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 85e3c39bb60b..95a3031e5c7c 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -35,8 +35,8 @@ void show_regs(struct pt_regs *regs) { show_regs_print_info(KERN_DEFAULT); - pr_cont("sepc: " REG_FMT " ra : " REG_FMT " sp : " REG_FMT "\n", - regs->sepc, regs->ra, regs->sp); + pr_cont("epc: " REG_FMT " ra : " REG_FMT " sp : " REG_FMT "\n", + regs->epc, regs->ra, regs->sp); pr_cont(" gp : " REG_FMT " tp : " REG_FMT " t0 : " REG_FMT "\n", regs->gp, regs->tp, regs->t0); pr_cont(" t1 : " REG_FMT " t2 : " REG_FMT " s0 : " REG_FMT "\n", @@ -58,23 +58,23 @@ void show_regs(struct pt_regs *regs) pr_cont(" t5 : " REG_FMT " t6 : " REG_FMT "\n", regs->t5, regs->t6); - pr_cont("sstatus: " REG_FMT " sbadaddr: " REG_FMT " scause: " REG_FMT "\n", - regs->sstatus, regs->sbadaddr, regs->scause); + pr_cont("status: " REG_FMT " badaddr: " REG_FMT " cause: " REG_FMT "\n", + regs->status, regs->badaddr, regs->cause); } void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - regs->sstatus = SR_SPIE; + regs->status = SR_PIE; if (has_fpu) { - regs->sstatus |= SR_FS_INITIAL; + regs->status |= SR_FS_INITIAL; /* * Restore the initial value to the FP register * before starting the user program. */ fstate_restore(current, regs); } - regs->sepc = pc; + regs->epc = pc; regs->sp = sp; set_fs(USER_DS); } @@ -110,7 +110,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, const register unsigned long gp __asm__ ("gp"); memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp; - childregs->sstatus = SR_SPP | SR_SPIE; /* Supervisor, irqs on */ + /* Supervisor/Machine, irqs on: */ + childregs->status = SR_PP | SR_PIE; p->thread.ra = (unsigned long)ret_from_kernel_thread; p->thread.s[0] = usp; /* fn */ diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 1252113ef8b2..0f84628b9385 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -154,6 +154,16 @@ __visible void do_syscall_trace_enter(struct pt_regs *regs) if (tracehook_report_syscall_entry(regs)) syscall_set_nr(current, regs, -1); + /* + * Do the secure computing after ptrace; failures should be fast. + * If this fails we might have return value in a0 from seccomp + * (via SECCOMP_RET_ERRNO/TRACE). + */ + if (secure_computing(NULL) == -1) { + syscall_set_nr(current, regs, -1); + return; + } + #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall_get_nr(current, regs)); diff --git a/arch/riscv/kernel/reset.c b/arch/riscv/kernel/reset.c index aa56bb135ec4..ee5878d968cc 100644 --- a/arch/riscv/kernel/reset.c +++ b/arch/riscv/kernel/reset.c @@ -5,12 +5,11 @@ #include <linux/reboot.h> #include <linux/pm.h> -#include <asm/sbi.h> static void default_power_off(void) { - sbi_shutdown(); - while (1); + while (1) + wait_for_interrupt(); } void (*pm_power_off)(void) = default_power_off; diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c new file mode 100644 index 000000000000..f6c7c3e82d28 --- /dev/null +++ b/arch/riscv/kernel/sbi.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/init.h> +#include <linux/pm.h> +#include <asm/sbi.h> + +static void sbi_power_off(void) +{ + sbi_shutdown(); +} + +static int __init sbi_init(void) +{ + pm_power_off = sbi_power_off; + return 0; +} +early_initcall(sbi_init); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 845ae0e12115..365ff8420bfe 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -17,6 +17,7 @@ #include <linux/sched/task.h> #include <linux/swiotlb.h> +#include <asm/clint.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -67,6 +68,7 @@ void __init setup_arch(char **cmdline_p) setup_bootmem(); paging_init(); unflatten_device_tree(); + clint_init_boot_cpu(); #ifdef CONFIG_SWIOTLB swiotlb_init(1); diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index d0f6f212f5df..17ba190e84a5 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -17,11 +17,16 @@ #include <asm/switch_to.h> #include <asm/csr.h> +extern u32 __user_rt_sigreturn[2]; + #define DEBUG_SIG 0 struct rt_sigframe { struct siginfo info; struct ucontext uc; +#ifndef CONFIG_MMU + u32 sigreturn_code[2]; +#endif }; #ifdef CONFIG_FPU @@ -124,7 +129,7 @@ badframe: pr_info_ratelimited( "%s[%d]: bad frame in %s: frame=%p pc=%p sp=%p\n", task->comm, task_pid_nr(task), __func__, - frame, (void *)regs->sepc, (void *)regs->sp); + frame, (void *)regs->epc, (void *)regs->sp); } force_sig(SIGSEGV); return 0; @@ -166,7 +171,6 @@ static inline void __user *get_sigframe(struct ksignal *ksig, return (void __user *)sp; } - static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { @@ -189,8 +193,19 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, return -EFAULT; /* Set up to return from userspace. */ +#ifdef CONFIG_MMU regs->ra = (unsigned long)VDSO_SYMBOL( current->mm->context.vdso, rt_sigreturn); +#else + /* + * For the nommu case we don't have a VDSO. Instead we push two + * instructions to call the rt_sigreturn syscall onto the user stack. + */ + if (copy_to_user(&frame->sigreturn_code, __user_rt_sigreturn, + sizeof(frame->sigreturn_code))) + return -EFAULT; + regs->ra = (unsigned long)&frame->sigreturn_code; +#endif /* CONFIG_MMU */ /* * Set up registers for signal handler. @@ -199,7 +214,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, * We always pass siginfo and mcontext, regardless of SA_SIGINFO, * since some things rely on this (e.g. glibc's debug/segfault.c). */ - regs->sepc = (unsigned long)ksig->ka.sa.sa_handler; + regs->epc = (unsigned long)ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; regs->a0 = ksig->sig; /* a0: signal number */ regs->a1 = (unsigned long)(&frame->info); /* a1: siginfo pointer */ @@ -208,7 +223,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, #if DEBUG_SIG pr_info("SIG deliver (%s:%d): sig=%d pc=%p ra=%p sp=%p\n", current->comm, task_pid_nr(current), ksig->sig, - (void *)regs->sepc, (void *)regs->ra, frame); + (void *)regs->epc, (void *)regs->ra, frame); #endif return 0; @@ -220,10 +235,9 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) int ret; /* Are we from a system call? */ - if (regs->scause == EXC_SYSCALL) { + if (regs->cause == EXC_SYSCALL) { /* Avoid additional syscall restarting via ret_from_exception */ - regs->scause = -1UL; - + regs->cause = -1UL; /* If so, check system call restarting.. */ switch (regs->a0) { case -ERESTART_RESTARTBLOCK: @@ -239,7 +253,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* fallthrough */ case -ERESTARTNOINTR: regs->a0 = regs->orig_a0; - regs->sepc -= 0x4; + regs->epc -= 0x4; break; } } @@ -261,9 +275,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (regs->scause == EXC_SYSCALL) { + if (regs->cause == EXC_SYSCALL) { /* Avoid additional syscall restarting via ret_from_exception */ - regs->scause = -1UL; + regs->cause = -1UL; /* Restart the system call - no handlers present */ switch (regs->a0) { @@ -271,12 +285,12 @@ static void do_signal(struct pt_regs *regs) case -ERESTARTSYS: case -ERESTARTNOINTR: regs->a0 = regs->orig_a0; - regs->sepc -= 0x4; + regs->epc -= 0x4; break; case -ERESTART_RESTARTBLOCK: regs->a0 = regs->orig_a0; regs->a7 = __NR_restart_syscall; - regs->sepc -= 0x4; + regs->epc -= 0x4; break; } } diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 5c9ec78422c2..eb878abcaaf8 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -16,6 +16,7 @@ #include <linux/seq_file.h> #include <linux/delay.h> +#include <asm/clint.h> #include <asm/sbi.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> @@ -92,7 +93,10 @@ static void send_ipi_mask(const struct cpumask *mask, enum ipi_message_type op) smp_mb__after_atomic(); riscv_cpuid_to_hartid_mask(mask, &hartid_mask); - sbi_send_ipi(cpumask_bits(&hartid_mask)); + if (IS_ENABLED(CONFIG_RISCV_SBI)) + sbi_send_ipi(cpumask_bits(&hartid_mask)); + else + clint_send_ipi_mask(&hartid_mask); } static void send_ipi_single(int cpu, enum ipi_message_type op) @@ -103,12 +107,18 @@ static void send_ipi_single(int cpu, enum ipi_message_type op) set_bit(op, &ipi_data[cpu].bits); smp_mb__after_atomic(); - sbi_send_ipi(cpumask_bits(cpumask_of(hartid))); + if (IS_ENABLED(CONFIG_RISCV_SBI)) + sbi_send_ipi(cpumask_bits(cpumask_of(hartid))); + else + clint_send_ipi_single(hartid); } static inline void clear_ipi(void) { - csr_clear(CSR_SIP, SIE_SSIE); + if (IS_ENABLED(CONFIG_RISCV_SBI)) + csr_clear(CSR_IP, IE_SIE); + else + clint_clear_ipi(cpuid_to_hartid_map(smp_processor_id())); } void riscv_software_interrupt(void) diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 261f4087cc39..8bc01f0ca73b 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -24,6 +24,7 @@ #include <linux/of.h> #include <linux/sched/task_stack.h> #include <linux/sched/mm.h> +#include <asm/clint.h> #include <asm/irq.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -137,6 +138,9 @@ asmlinkage __visible void __init smp_callin(void) { struct mm_struct *mm = &init_mm; + if (!IS_ENABLED(CONFIG_RISCV_SBI)) + clint_clear_ipi(cpuid_to_hartid_map(smp_processor_id())); + /* All kernel threads share the same mm context. */ mmgrab(mm); current->active_mm = mm; diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 473de3ae8bb7..f4cad5163bf2 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -41,7 +41,7 @@ void die(struct pt_regs *regs, const char *str) print_modules(); show_regs(regs); - ret = notify_die(DIE_OOPS, str, regs, 0, regs->scause, SIGSEGV); + ret = notify_die(DIE_OOPS, str, regs, 0, regs->cause, SIGSEGV); bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); @@ -86,7 +86,7 @@ static void do_trap_error(struct pt_regs *regs, int signo, int code, #define DO_ERROR_INFO(name, signo, code, str) \ asmlinkage __visible void name(struct pt_regs *regs) \ { \ - do_trap_error(regs, signo, code, regs->sepc, "Oops - " str); \ + do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ } DO_ERROR_INFO(do_trap_unknown, @@ -124,9 +124,9 @@ static inline unsigned long get_break_insn_length(unsigned long pc) asmlinkage __visible void do_trap_break(struct pt_regs *regs) { if (user_mode(regs)) - force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->sepc); - else if (report_bug(regs->sepc, regs) == BUG_TRAP_TYPE_WARN) - regs->sepc += get_break_insn_length(regs->sepc); + force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->epc); + else if (report_bug(regs->epc, regs) == BUG_TRAP_TYPE_WARN) + regs->epc += get_break_insn_length(regs->epc); else die(regs, "Kernel BUG"); } @@ -153,9 +153,9 @@ void __init trap_init(void) * Set sup0 scratch register to 0, indicating to exception vector * that we are presently executing in the kernel */ - csr_write(CSR_SSCRATCH, 0); + csr_write(CSR_SCRATCH, 0); /* Set the exception vector address */ - csr_write(CSR_STVEC, &handle_exception); + csr_write(CSR_TVEC, &handle_exception); /* Enable all interrupts */ - csr_write(CSR_SIE, -1); + csr_write(CSR_IE, -1); } diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 23cd1a9e52a1..12f42f96d46e 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -52,12 +52,12 @@ SECTIONS /* Start of data section */ _sdata = .; - RO_DATA_SECTION(L1_CACHE_BYTES) + RO_DATA(L1_CACHE_BYTES) .srodata : { *(.srodata*) } - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) .sdata : { __global_pointer$ = . + 0x800; *(.sdata*) @@ -69,7 +69,6 @@ SECTIONS BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) EXCEPTION_TABLE(0x10) - NOTES .rel.dyn : { *(.rel.dyn*) diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 267feaa10f6a..47e7a8204460 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -lib-y += delay.o -lib-y += memcpy.o -lib-y += memset.o -lib-y += uaccess.o - -lib-$(CONFIG_64BIT) += tishift.o +lib-y += delay.o +lib-y += memcpy.o +lib-y += memset.o +lib-$(CONFIG_MMU) += uaccess.o +lib-$(CONFIG_64BIT) += tishift.o diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index ed2696c0143d..fecd65657a6f 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -18,7 +18,7 @@ ENTRY(__asm_copy_from_user) /* Enable access to user memory */ li t6, SR_SUM - csrs CSR_SSTATUS, t6 + csrs CSR_STATUS, t6 add a3, a1, a2 /* Use word-oriented copy only if low-order bits match */ @@ -47,7 +47,7 @@ ENTRY(__asm_copy_from_user) 3: /* Disable access to user memory */ - csrc CSR_SSTATUS, t6 + csrc CSR_STATUS, t6 li a0, 0 ret 4: /* Edge case: unalignment */ @@ -72,7 +72,7 @@ ENTRY(__clear_user) /* Enable access to user memory */ li t6, SR_SUM - csrs CSR_SSTATUS, t6 + csrs CSR_STATUS, t6 add a3, a0, a1 addi t0, a0, SZREG-1 @@ -94,7 +94,7 @@ ENTRY(__clear_user) 3: /* Disable access to user memory */ - csrc CSR_SSTATUS, t6 + csrc CSR_STATUS, t6 li a0, 0 ret 4: /* Edge case: unalignment */ @@ -114,11 +114,11 @@ ENDPROC(__clear_user) /* Fixup code for __copy_user(10) and __clear_user(11) */ 10: /* Disable access to user memory */ - csrs CSR_SSTATUS, t6 + csrs CSR_STATUS, t6 mv a0, a2 ret 11: - csrs CSR_SSTATUS, t6 + csrs CSR_STATUS, t6 mv a0, a1 ret .previous diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 9d9a17335686..44ab8f28c3fa 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -6,9 +6,8 @@ CFLAGS_REMOVE_init.o = -pg endif obj-y += init.o -obj-y += fault.o obj-y += extable.o -obj-y += ioremap.o +obj-$(CONFIG_MMU) += fault.o ioremap.o obj-y += cacheflush.o obj-y += context.o obj-y += sifive_l2_cache.o diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 3f15938dec89..8f1900686640 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -10,9 +10,17 @@ #include <asm/sbi.h> +static void ipi_remote_fence_i(void *info) +{ + return local_flush_icache_all(); +} + void flush_icache_all(void) { - sbi_remote_fence_i(NULL); + if (IS_ENABLED(CONFIG_RISCV_SBI)) + sbi_remote_fence_i(NULL); + else + on_each_cpu(ipi_remote_fence_i, NULL, 1); } /* @@ -28,7 +36,7 @@ void flush_icache_all(void) void flush_icache_mm(struct mm_struct *mm, bool local) { unsigned int cpu; - cpumask_t others, hmask, *mask; + cpumask_t others, *mask; preempt_disable(); @@ -46,10 +54,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local) */ cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu)); local |= cpumask_empty(&others); - if (mm != current->active_mm || !local) { - riscv_cpuid_to_hartid_mask(&others, &hmask); - sbi_remote_fence_i(hmask.bits); - } else { + if (mm == current->active_mm && local) { /* * It's assumed that at least one strongly ordered operation is * performed on this hart between setting a hart's cpumask bit @@ -59,6 +64,13 @@ void flush_icache_mm(struct mm_struct *mm, bool local) * with flush_icache_deferred(). */ smp_mb(); + } else if (IS_ENABLED(CONFIG_RISCV_SBI)) { + cpumask_t hartid_mask; + + riscv_cpuid_to_hartid_mask(&others, &hartid_mask); + sbi_remote_fence_i(cpumask_bits(&hartid_mask)); + } else { + on_each_cpu_mask(&others, ipi_remote_fence_i, NULL, 1); } preempt_enable(); @@ -66,6 +78,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local) #endif /* CONFIG_SMP */ +#ifdef CONFIG_MMU void flush_icache_pte(pte_t pte) { struct page *page = pte_page(pte); @@ -73,3 +86,4 @@ void flush_icache_pte(pte_t pte) if (!test_and_set_bit(PG_dcache_clean, &page->flags)) flush_icache_all(); } +#endif /* CONFIG_MMU */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index ca66d44156b6..613ec81a8979 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -58,8 +58,10 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, cpumask_clear_cpu(cpu, mm_cpumask(prev)); cpumask_set_cpu(cpu, mm_cpumask(next)); +#ifdef CONFIG_MMU csr_write(CSR_SATP, virt_to_pfn(next->pgd) | SATP_MODE); local_flush_tlb_all(); +#endif flush_icache_deferred(next); } diff --git a/arch/riscv/mm/extable.c b/arch/riscv/mm/extable.c index 7aed9178d365..2fc729422151 100644 --- a/arch/riscv/mm/extable.c +++ b/arch/riscv/mm/extable.c @@ -15,9 +15,9 @@ int fixup_exception(struct pt_regs *regs) { const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->sepc); + fixup = search_exception_tables(regs->epc); if (fixup) { - regs->sepc = fixup->fixup; + regs->epc = fixup->fixup; return 1; } return 0; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 247b8c859c44..cf7248e07f43 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -34,8 +34,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs) int code = SEGV_MAPERR; vm_fault_t fault; - cause = regs->scause; - addr = regs->sbadaddr; + cause = regs->cause; + addr = regs->badaddr; tsk = current; mm = tsk->mm; @@ -53,7 +53,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto vmalloc_fault; /* Enable interrupts if they were enabled in the parent context. */ - if (likely(regs->sstatus & SR_SPIE)) + if (likely(regs->status & SR_PIE)) local_irq_enable(); /* diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 573463d1c799..b2fe9d1be833 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -26,6 +26,7 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] EXPORT_SYMBOL(empty_zero_page); extern char _start[]; +void *dtb_early_va; static void __init zone_sizes_init(void) { @@ -40,7 +41,7 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -void setup_zero_page(void) +static void setup_zero_page(void) { memset((void *)empty_zero_page, 0, PAGE_SIZE); } @@ -142,12 +143,12 @@ void __init setup_bootmem(void) } } +#ifdef CONFIG_MMU unsigned long va_pa_offset; EXPORT_SYMBOL(va_pa_offset); unsigned long pfn_base; EXPORT_SYMBOL(pfn_base); -void *dtb_early_va; pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; @@ -273,7 +274,6 @@ static void __init create_pmd_mapping(pmd_t *pmdp, #define get_pgd_next_virt(__pa) get_pmd_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) -#define PTE_PARENT_SIZE PMD_SIZE #define fixmap_pgd_next fixmap_pmd #else #define pgd_next_t pte_t @@ -281,7 +281,6 @@ static void __init create_pmd_mapping(pmd_t *pmdp, #define get_pgd_next_virt(__pa) get_pte_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) -#define PTE_PARENT_SIZE PGDIR_SIZE #define fixmap_pgd_next fixmap_pte #endif @@ -314,14 +313,11 @@ static void __init create_pgd_mapping(pgd_t *pgdp, static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) { - uintptr_t map_size = PAGE_SIZE; - - /* Upgrade to PMD/PGDIR mappings whenever possible */ - if (!(base & (PTE_PARENT_SIZE - 1)) && - !(size & (PTE_PARENT_SIZE - 1))) - map_size = PTE_PARENT_SIZE; + /* Upgrade to PMD_SIZE mappings whenever possible */ + if ((base & (PMD_SIZE - 1)) || (size & (PMD_SIZE - 1))) + return PAGE_SIZE; - return map_size; + return PMD_SIZE; } /* @@ -449,6 +445,16 @@ static void __init setup_vm_final(void) csr_write(CSR_SATP, PFN_DOWN(__pa(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); } +#else +asmlinkage void __init setup_vm(uintptr_t dtb_pa) +{ + dtb_early_va = (void *)dtb_pa; +} + +static inline void setup_vm_final(void) +{ +} +#endif /* CONFIG_MMU */ void __init paging_init(void) { diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 24cd33d2c48f..720b443c4528 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -2,6 +2,7 @@ #include <linux/mm.h> #include <linux/smp.h> +#include <linux/sched.h> #include <asm/sbi.h> void flush_tlb_all(void) @@ -9,13 +10,33 @@ void flush_tlb_all(void) sbi_remote_sfence_vma(NULL, 0, -1); } +/* + * This function must not be called with cmask being null. + * Kernel may panic if cmask is NULL. + */ static void __sbi_tlb_flush_range(struct cpumask *cmask, unsigned long start, unsigned long size) { struct cpumask hmask; + unsigned int cpuid; - riscv_cpuid_to_hartid_mask(cmask, &hmask); - sbi_remote_sfence_vma(hmask.bits, start, size); + if (cpumask_empty(cmask)) + return; + + cpuid = get_cpu(); + + if (cpumask_any_but(cmask, cpuid) >= nr_cpu_ids) { + /* local cpu is the only cpu present in cpumask */ + if (size <= PAGE_SIZE) + local_flush_tlb_page(start); + else + local_flush_tlb_all(); + } else { + riscv_cpuid_to_hartid_mask(cmask, &hmask); + sbi_remote_sfence_vma(cpumask_bits(&hmask), start, size); + } + + put_cpu(); } void flush_tlb_mm(struct mm_struct *mm) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 38d64030aacf..2e60c80395ab 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -62,7 +62,6 @@ CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y -CONFIG_REFCOUNT_FULL=y CONFIG_LOCK_EVENT_COUNTS=y CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 9803e96d2924..ead0b2c9881d 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -44,7 +44,7 @@ struct s390_aes_ctx { int key_len; unsigned long fc; union { - struct crypto_sync_skcipher *blk; + struct crypto_skcipher *skcipher; struct crypto_cipher *cip; } fallback; }; @@ -54,7 +54,7 @@ struct s390_xts_ctx { u8 pcc_key[32]; int key_len; unsigned long fc; - struct crypto_sync_skcipher *fallback; + struct crypto_skcipher *fallback; }; struct gcm_sg_walk { @@ -178,66 +178,41 @@ static struct crypto_alg aes_alg = { } }; -static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key, - unsigned int len) +static int setkey_fallback_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int len) { - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - unsigned int ret; - - crypto_sync_skcipher_clear_flags(sctx->fallback.blk, - CRYPTO_TFM_REQ_MASK); - crypto_sync_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags & - CRYPTO_TFM_REQ_MASK); - - ret = crypto_sync_skcipher_setkey(sctx->fallback.blk, key, len); - - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= crypto_sync_skcipher_get_flags(sctx->fallback.blk) & - CRYPTO_TFM_RES_MASK; - - return ret; -} - -static int fallback_blk_dec(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - unsigned int ret; - struct crypto_blkcipher *tfm = desc->tfm; - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm); - SYNC_SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk); - - skcipher_request_set_sync_tfm(req, sctx->fallback.blk); - skcipher_request_set_callback(req, desc->flags, NULL, NULL); - skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - - ret = crypto_skcipher_decrypt(req); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + int ret; - skcipher_request_zero(req); + crypto_skcipher_clear_flags(sctx->fallback.skcipher, + CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(sctx->fallback.skcipher, + crypto_skcipher_get_flags(tfm) & + CRYPTO_TFM_REQ_MASK); + ret = crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); + crypto_skcipher_set_flags(tfm, + crypto_skcipher_get_flags(sctx->fallback.skcipher) & + CRYPTO_TFM_RES_MASK); return ret; } -static int fallback_blk_enc(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int fallback_skcipher_crypt(struct s390_aes_ctx *sctx, + struct skcipher_request *req, + unsigned long modifier) { - unsigned int ret; - struct crypto_blkcipher *tfm = desc->tfm; - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm); - SYNC_SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk); - - skcipher_request_set_sync_tfm(req, sctx->fallback.blk); - skcipher_request_set_callback(req, desc->flags, NULL, NULL); - skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); + struct skcipher_request *subreq = skcipher_request_ctx(req); - ret = crypto_skcipher_encrypt(req); - return ret; + *subreq = *req; + skcipher_request_set_tfm(subreq, sctx->fallback.skcipher); + return (modifier & CPACF_DECRYPT) ? + crypto_skcipher_decrypt(subreq) : + crypto_skcipher_encrypt(subreq); } -static int ecb_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int ecb_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); unsigned long fc; /* Pick the correct function code based on the key length */ @@ -248,111 +223,92 @@ static int ecb_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, /* Check if the function code is available */ sctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; if (!sctx->fc) - return setkey_fallback_blk(tfm, in_key, key_len); + return setkey_fallback_skcipher(tfm, in_key, key_len); sctx->key_len = key_len; memcpy(sctx->key, in_key, key_len); return 0; } -static int ecb_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, - struct blkcipher_walk *walk) +static int ecb_aes_crypt(struct skcipher_request *req, unsigned long modifier) { - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int nbytes, n; int ret; - ret = blkcipher_walk_virt(desc, walk); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + if (unlikely(!sctx->fc)) + return fallback_skcipher_crypt(sctx, req, modifier); + + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); cpacf_km(sctx->fc | modifier, sctx->key, - walk->dst.virt.addr, walk->src.virt.addr, n); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + walk.dst.virt.addr, walk.src.virt.addr, n); + ret = skcipher_walk_done(&walk, nbytes - n); } - return ret; } -static int ecb_aes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_aes_encrypt(struct skcipher_request *req) { - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (unlikely(!sctx->fc)) - return fallback_blk_enc(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_aes_crypt(desc, 0, &walk); + return ecb_aes_crypt(req, 0); } -static int ecb_aes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_aes_decrypt(struct skcipher_request *req) { - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (unlikely(!sctx->fc)) - return fallback_blk_dec(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_aes_crypt(desc, CPACF_DECRYPT, &walk); + return ecb_aes_crypt(req, CPACF_DECRYPT); } -static int fallback_init_blk(struct crypto_tfm *tfm) +static int fallback_init_skcipher(struct crypto_skcipher *tfm) { - const char *name = tfm->__crt_alg->cra_name; - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + const char *name = crypto_tfm_alg_name(&tfm->base); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); - sctx->fallback.blk = crypto_alloc_sync_skcipher(name, 0, - CRYPTO_ALG_NEED_FALLBACK); + sctx->fallback.skcipher = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC); - if (IS_ERR(sctx->fallback.blk)) { + if (IS_ERR(sctx->fallback.skcipher)) { pr_err("Allocating AES fallback algorithm %s failed\n", name); - return PTR_ERR(sctx->fallback.blk); + return PTR_ERR(sctx->fallback.skcipher); } + crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + + crypto_skcipher_reqsize(sctx->fallback.skcipher)); return 0; } -static void fallback_exit_blk(struct crypto_tfm *tfm) +static void fallback_exit_skcipher(struct crypto_skcipher *tfm) { - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); - crypto_free_sync_skcipher(sctx->fallback.blk); + crypto_free_skcipher(sctx->fallback.skcipher); } -static struct crypto_alg ecb_aes_alg = { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-s390", - .cra_priority = 401, /* combo: aes + ecb + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_NEED_FALLBACK, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_aes_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = fallback_init_blk, - .cra_exit = fallback_exit_blk, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = ecb_aes_set_key, - .encrypt = ecb_aes_encrypt, - .decrypt = ecb_aes_decrypt, - } - } +static struct skcipher_alg ecb_aes_alg = { + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-aes-s390", + .base.cra_priority = 401, /* combo: aes + ecb + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_aes_ctx), + .base.cra_module = THIS_MODULE, + .init = fallback_init_skcipher, + .exit = fallback_exit_skcipher, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = ecb_aes_set_key, + .encrypt = ecb_aes_encrypt, + .decrypt = ecb_aes_decrypt, }; -static int cbc_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int cbc_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); unsigned long fc; /* Pick the correct function code based on the key length */ @@ -363,17 +319,18 @@ static int cbc_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, /* Check if the function code is available */ sctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; if (!sctx->fc) - return setkey_fallback_blk(tfm, in_key, key_len); + return setkey_fallback_skcipher(tfm, in_key, key_len); sctx->key_len = key_len; memcpy(sctx->key, in_key, key_len); return 0; } -static int cbc_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, - struct blkcipher_walk *walk) +static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier) { - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int nbytes, n; int ret; struct { @@ -381,134 +338,74 @@ static int cbc_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, u8 key[AES_MAX_KEY_SIZE]; } param; - ret = blkcipher_walk_virt(desc, walk); - memcpy(param.iv, walk->iv, AES_BLOCK_SIZE); + if (unlikely(!sctx->fc)) + return fallback_skcipher_crypt(sctx, req, modifier); + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); memcpy(param.key, sctx->key, sctx->key_len); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); cpacf_kmc(sctx->fc | modifier, ¶m, - walk->dst.virt.addr, walk->src.virt.addr, n); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + walk.dst.virt.addr, walk.src.virt.addr, n); + memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); } - memcpy(walk->iv, param.iv, AES_BLOCK_SIZE); return ret; } -static int cbc_aes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_aes_encrypt(struct skcipher_request *req) { - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (unlikely(!sctx->fc)) - return fallback_blk_enc(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, 0, &walk); + return cbc_aes_crypt(req, 0); } -static int cbc_aes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_aes_decrypt(struct skcipher_request *req) { - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (unlikely(!sctx->fc)) - return fallback_blk_dec(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, CPACF_DECRYPT, &walk); + return cbc_aes_crypt(req, CPACF_DECRYPT); } -static struct crypto_alg cbc_aes_alg = { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-s390", - .cra_priority = 402, /* ecb-aes-s390 + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_NEED_FALLBACK, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_aes_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = fallback_init_blk, - .cra_exit = fallback_exit_blk, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = cbc_aes_set_key, - .encrypt = cbc_aes_encrypt, - .decrypt = cbc_aes_decrypt, - } - } +static struct skcipher_alg cbc_aes_alg = { + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-aes-s390", + .base.cra_priority = 402, /* ecb-aes-s390 + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_aes_ctx), + .base.cra_module = THIS_MODULE, + .init = fallback_init_skcipher, + .exit = fallback_exit_skcipher, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = cbc_aes_set_key, + .encrypt = cbc_aes_encrypt, + .decrypt = cbc_aes_decrypt, }; -static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int len) -{ - struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); - unsigned int ret; - - crypto_sync_skcipher_clear_flags(xts_ctx->fallback, - CRYPTO_TFM_REQ_MASK); - crypto_sync_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags & - CRYPTO_TFM_REQ_MASK); - - ret = crypto_sync_skcipher_setkey(xts_ctx->fallback, key, len); - - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= crypto_sync_skcipher_get_flags(xts_ctx->fallback) & - CRYPTO_TFM_RES_MASK; - - return ret; -} - -static int xts_fallback_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct crypto_blkcipher *tfm = desc->tfm; - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm); - SYNC_SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback); - unsigned int ret; - - skcipher_request_set_sync_tfm(req, xts_ctx->fallback); - skcipher_request_set_callback(req, desc->flags, NULL, NULL); - skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - - ret = crypto_skcipher_decrypt(req); - - skcipher_request_zero(req); - return ret; -} - -static int xts_fallback_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int xts_fallback_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int len) { - struct crypto_blkcipher *tfm = desc->tfm; - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm); - SYNC_SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback); - unsigned int ret; - - skcipher_request_set_sync_tfm(req, xts_ctx->fallback); - skcipher_request_set_callback(req, desc->flags, NULL, NULL); - skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - - ret = crypto_skcipher_encrypt(req); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + int ret; - skcipher_request_zero(req); + crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(xts_ctx->fallback, + crypto_skcipher_get_flags(tfm) & + CRYPTO_TFM_REQ_MASK); + ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len); + crypto_skcipher_set_flags(tfm, + crypto_skcipher_get_flags(xts_ctx->fallback) & + CRYPTO_TFM_RES_MASK); return ret; } -static int xts_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { - struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); unsigned long fc; int err; @@ -518,7 +415,7 @@ static int xts_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, /* In fips mode only 128 bit or 256 bit keys are valid */ if (fips_enabled && key_len != 32 && key_len != 64) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -539,10 +436,11 @@ static int xts_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return 0; } -static int xts_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, - struct blkcipher_walk *walk) +static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier) { - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int offset, nbytes, n; int ret; struct { @@ -557,113 +455,100 @@ static int xts_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, u8 init[16]; } xts_param; - ret = blkcipher_walk_virt(desc, walk); + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + if (unlikely(!xts_ctx->fc || (req->cryptlen % AES_BLOCK_SIZE) != 0)) { + struct skcipher_request *subreq = skcipher_request_ctx(req); + + *subreq = *req; + skcipher_request_set_tfm(subreq, xts_ctx->fallback); + return (modifier & CPACF_DECRYPT) ? + crypto_skcipher_decrypt(subreq) : + crypto_skcipher_encrypt(subreq); + } + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; offset = xts_ctx->key_len & 0x10; memset(pcc_param.block, 0, sizeof(pcc_param.block)); memset(pcc_param.bit, 0, sizeof(pcc_param.bit)); memset(pcc_param.xts, 0, sizeof(pcc_param.xts)); - memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak)); + memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); memcpy(pcc_param.key + offset, xts_ctx->pcc_key, xts_ctx->key_len); cpacf_pcc(xts_ctx->fc, pcc_param.key + offset); memcpy(xts_param.key + offset, xts_ctx->key, xts_ctx->key_len); memcpy(xts_param.init, pcc_param.xts, 16); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); cpacf_km(xts_ctx->fc | modifier, xts_param.key + offset, - walk->dst.virt.addr, walk->src.virt.addr, n); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + walk.dst.virt.addr, walk.src.virt.addr, n); + ret = skcipher_walk_done(&walk, nbytes - n); } return ret; } -static int xts_aes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int xts_aes_encrypt(struct skcipher_request *req) { - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (!nbytes) - return -EINVAL; - - if (unlikely(!xts_ctx->fc || (nbytes % XTS_BLOCK_SIZE) != 0)) - return xts_fallback_encrypt(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return xts_aes_crypt(desc, 0, &walk); + return xts_aes_crypt(req, 0); } -static int xts_aes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int xts_aes_decrypt(struct skcipher_request *req) { - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (!nbytes) - return -EINVAL; - - if (unlikely(!xts_ctx->fc || (nbytes % XTS_BLOCK_SIZE) != 0)) - return xts_fallback_decrypt(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return xts_aes_crypt(desc, CPACF_DECRYPT, &walk); + return xts_aes_crypt(req, CPACF_DECRYPT); } -static int xts_fallback_init(struct crypto_tfm *tfm) +static int xts_fallback_init(struct crypto_skcipher *tfm) { - const char *name = tfm->__crt_alg->cra_name; - struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + const char *name = crypto_tfm_alg_name(&tfm->base); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); - xts_ctx->fallback = crypto_alloc_sync_skcipher(name, 0, - CRYPTO_ALG_NEED_FALLBACK); + xts_ctx->fallback = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC); if (IS_ERR(xts_ctx->fallback)) { pr_err("Allocating XTS fallback algorithm %s failed\n", name); return PTR_ERR(xts_ctx->fallback); } + crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + + crypto_skcipher_reqsize(xts_ctx->fallback)); return 0; } -static void xts_fallback_exit(struct crypto_tfm *tfm) +static void xts_fallback_exit(struct crypto_skcipher *tfm) { - struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); - crypto_free_sync_skcipher(xts_ctx->fallback); + crypto_free_skcipher(xts_ctx->fallback); } -static struct crypto_alg xts_aes_alg = { - .cra_name = "xts(aes)", - .cra_driver_name = "xts-aes-s390", - .cra_priority = 402, /* ecb-aes-s390 + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_NEED_FALLBACK, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_xts_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = xts_fallback_init, - .cra_exit = xts_fallback_exit, - .cra_u = { - .blkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_aes_set_key, - .encrypt = xts_aes_encrypt, - .decrypt = xts_aes_decrypt, - } - } +static struct skcipher_alg xts_aes_alg = { + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "xts-aes-s390", + .base.cra_priority = 402, /* ecb-aes-s390 + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_xts_ctx), + .base.cra_module = THIS_MODULE, + .init = xts_fallback_init, + .exit = xts_fallback_exit, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aes_set_key, + .encrypt = xts_aes_encrypt, + .decrypt = xts_aes_decrypt, }; -static int ctr_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int ctr_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); unsigned long fc; /* Pick the correct function code based on the key length */ @@ -674,7 +559,7 @@ static int ctr_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, /* Check if the function code is available */ sctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; if (!sctx->fc) - return setkey_fallback_blk(tfm, in_key, key_len); + return setkey_fallback_skcipher(tfm, in_key, key_len); sctx->key_len = key_len; memcpy(sctx->key, in_key, key_len); @@ -696,30 +581,34 @@ static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) return n; } -static int ctr_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, - struct blkcipher_walk *walk) +static int ctr_aes_crypt(struct skcipher_request *req) { - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); u8 buf[AES_BLOCK_SIZE], *ctrptr; + struct skcipher_walk walk; unsigned int n, nbytes; int ret, locked; + if (unlikely(!sctx->fc)) + return fallback_skcipher_crypt(sctx, req, 0); + locked = mutex_trylock(&ctrblk_lock); - ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { n = AES_BLOCK_SIZE; + if (nbytes >= 2*AES_BLOCK_SIZE && locked) - n = __ctrblk_init(ctrblk, walk->iv, nbytes); - ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv; - cpacf_kmctr(sctx->fc | modifier, sctx->key, - walk->dst.virt.addr, walk->src.virt.addr, - n, ctrptr); + n = __ctrblk_init(ctrblk, walk.iv, nbytes); + ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv; + cpacf_kmctr(sctx->fc, sctx->key, walk.dst.virt.addr, + walk.src.virt.addr, n, ctrptr); if (ctrptr == ctrblk) - memcpy(walk->iv, ctrptr + n - AES_BLOCK_SIZE, + memcpy(walk.iv, ctrptr + n - AES_BLOCK_SIZE, AES_BLOCK_SIZE); - crypto_inc(walk->iv, AES_BLOCK_SIZE); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); } if (locked) mutex_unlock(&ctrblk_lock); @@ -727,67 +616,33 @@ static int ctr_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, * final block may be < AES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { - cpacf_kmctr(sctx->fc | modifier, sctx->key, - buf, walk->src.virt.addr, - AES_BLOCK_SIZE, walk->iv); - memcpy(walk->dst.virt.addr, buf, nbytes); - crypto_inc(walk->iv, AES_BLOCK_SIZE); - ret = blkcipher_walk_done(desc, walk, 0); + cpacf_kmctr(sctx->fc, sctx->key, buf, walk.src.virt.addr, + AES_BLOCK_SIZE, walk.iv); + memcpy(walk.dst.virt.addr, buf, nbytes); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, 0); } return ret; } -static int ctr_aes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (unlikely(!sctx->fc)) - return fallback_blk_enc(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_aes_crypt(desc, 0, &walk); -} - -static int ctr_aes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - - if (unlikely(!sctx->fc)) - return fallback_blk_dec(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_aes_crypt(desc, CPACF_DECRYPT, &walk); -} - -static struct crypto_alg ctr_aes_alg = { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-s390", - .cra_priority = 402, /* ecb-aes-s390 + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_NEED_FALLBACK, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct s390_aes_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = fallback_init_blk, - .cra_exit = fallback_exit_blk, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ctr_aes_set_key, - .encrypt = ctr_aes_encrypt, - .decrypt = ctr_aes_decrypt, - } - } +static struct skcipher_alg ctr_aes_alg = { + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-s390", + .base.cra_priority = 402, /* ecb-aes-s390 + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_aes_ctx), + .base.cra_module = THIS_MODULE, + .init = fallback_init_skcipher, + .exit = fallback_exit_skcipher, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_aes_set_key, + .encrypt = ctr_aes_crypt, + .decrypt = ctr_aes_crypt, + .chunksize = AES_BLOCK_SIZE, }; static int gcm_aes_setkey(struct crypto_aead *tfm, const u8 *key, @@ -1116,24 +971,27 @@ static struct aead_alg gcm_aes_aead = { }, }; -static struct crypto_alg *aes_s390_algs_ptr[5]; -static int aes_s390_algs_num; +static struct crypto_alg *aes_s390_alg; +static struct skcipher_alg *aes_s390_skcipher_algs[4]; +static int aes_s390_skciphers_num; static struct aead_alg *aes_s390_aead_alg; -static int aes_s390_register_alg(struct crypto_alg *alg) +static int aes_s390_register_skcipher(struct skcipher_alg *alg) { int ret; - ret = crypto_register_alg(alg); + ret = crypto_register_skcipher(alg); if (!ret) - aes_s390_algs_ptr[aes_s390_algs_num++] = alg; + aes_s390_skcipher_algs[aes_s390_skciphers_num++] = alg; return ret; } static void aes_s390_fini(void) { - while (aes_s390_algs_num--) - crypto_unregister_alg(aes_s390_algs_ptr[aes_s390_algs_num]); + if (aes_s390_alg) + crypto_unregister_alg(aes_s390_alg); + while (aes_s390_skciphers_num--) + crypto_unregister_skcipher(aes_s390_skcipher_algs[aes_s390_skciphers_num]); if (ctrblk) free_page((unsigned long) ctrblk); @@ -1154,10 +1012,11 @@ static int __init aes_s390_init(void) if (cpacf_test_func(&km_functions, CPACF_KM_AES_128) || cpacf_test_func(&km_functions, CPACF_KM_AES_192) || cpacf_test_func(&km_functions, CPACF_KM_AES_256)) { - ret = aes_s390_register_alg(&aes_alg); + ret = crypto_register_alg(&aes_alg); if (ret) goto out_err; - ret = aes_s390_register_alg(&ecb_aes_alg); + aes_s390_alg = &aes_alg; + ret = aes_s390_register_skcipher(&ecb_aes_alg); if (ret) goto out_err; } @@ -1165,14 +1024,14 @@ static int __init aes_s390_init(void) if (cpacf_test_func(&kmc_functions, CPACF_KMC_AES_128) || cpacf_test_func(&kmc_functions, CPACF_KMC_AES_192) || cpacf_test_func(&kmc_functions, CPACF_KMC_AES_256)) { - ret = aes_s390_register_alg(&cbc_aes_alg); + ret = aes_s390_register_skcipher(&cbc_aes_alg); if (ret) goto out_err; } if (cpacf_test_func(&km_functions, CPACF_KM_XTS_128) || cpacf_test_func(&km_functions, CPACF_KM_XTS_256)) { - ret = aes_s390_register_alg(&xts_aes_alg); + ret = aes_s390_register_skcipher(&xts_aes_alg); if (ret) goto out_err; } @@ -1185,7 +1044,7 @@ static int __init aes_s390_init(void) ret = -ENOMEM; goto out_err; } - ret = aes_s390_register_alg(&ctr_aes_alg); + ret = aes_s390_register_skcipher(&ctr_aes_alg); if (ret) goto out_err; } diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index 439b100c6f2e..bfbafd35bcbd 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -17,6 +17,7 @@ #include <linux/mutex.h> #include <crypto/algapi.h> #include <crypto/internal/des.h> +#include <crypto/internal/skcipher.h> #include <asm/cpacf.h> #define DES3_KEY_SIZE (3 * DES_KEY_SIZE) @@ -45,6 +46,12 @@ static int des_setkey(struct crypto_tfm *tfm, const u8 *key, return 0; } +static int des_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + return des_setkey(crypto_skcipher_tfm(tfm), key, key_len); +} + static void s390_des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); @@ -79,28 +86,30 @@ static struct crypto_alg des_alg = { } }; -static int ecb_desall_crypt(struct blkcipher_desc *desc, unsigned long fc, - struct blkcipher_walk *walk) +static int ecb_desall_crypt(struct skcipher_request *req, unsigned long fc) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int nbytes, n; int ret; - ret = blkcipher_walk_virt(desc, walk); - while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(DES_BLOCK_SIZE - 1); - cpacf_km(fc, ctx->key, walk->dst.virt.addr, - walk->src.virt.addr, n); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + cpacf_km(fc, ctx->key, walk.dst.virt.addr, + walk.src.virt.addr, n); + ret = skcipher_walk_done(&walk, nbytes - n); } return ret; } -static int cbc_desall_crypt(struct blkcipher_desc *desc, unsigned long fc, - struct blkcipher_walk *walk) +static int cbc_desall_crypt(struct skcipher_request *req, unsigned long fc) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int nbytes, n; int ret; struct { @@ -108,99 +117,69 @@ static int cbc_desall_crypt(struct blkcipher_desc *desc, unsigned long fc, u8 key[DES3_KEY_SIZE]; } param; - ret = blkcipher_walk_virt(desc, walk); - memcpy(param.iv, walk->iv, DES_BLOCK_SIZE); + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + memcpy(param.iv, walk.iv, DES_BLOCK_SIZE); memcpy(param.key, ctx->key, DES3_KEY_SIZE); - while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(DES_BLOCK_SIZE - 1); - cpacf_kmc(fc, ¶m, walk->dst.virt.addr, - walk->src.virt.addr, n); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + cpacf_kmc(fc, ¶m, walk.dst.virt.addr, + walk.src.virt.addr, n); + memcpy(walk.iv, param.iv, DES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); } - memcpy(walk->iv, param.iv, DES_BLOCK_SIZE); return ret; } -static int ecb_des_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_des_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, CPACF_KM_DEA, &walk); + return ecb_desall_crypt(req, CPACF_KM_DEA); } -static int ecb_des_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_des_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, CPACF_KM_DEA | CPACF_DECRYPT, &walk); + return ecb_desall_crypt(req, CPACF_KM_DEA | CPACF_DECRYPT); } -static struct crypto_alg ecb_des_alg = { - .cra_name = "ecb(des)", - .cra_driver_name = "ecb-des-s390", - .cra_priority = 400, /* combo: des + ecb */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .setkey = des_setkey, - .encrypt = ecb_des_encrypt, - .decrypt = ecb_des_decrypt, - } - } +static struct skcipher_alg ecb_des_alg = { + .base.cra_name = "ecb(des)", + .base.cra_driver_name = "ecb-des-s390", + .base.cra_priority = 400, /* combo: des + ecb */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .setkey = des_setkey_skcipher, + .encrypt = ecb_des_encrypt, + .decrypt = ecb_des_decrypt, }; -static int cbc_des_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_des_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, CPACF_KMC_DEA, &walk); + return cbc_desall_crypt(req, CPACF_KMC_DEA); } -static int cbc_des_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_des_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, CPACF_KMC_DEA | CPACF_DECRYPT, &walk); + return cbc_desall_crypt(req, CPACF_KMC_DEA | CPACF_DECRYPT); } -static struct crypto_alg cbc_des_alg = { - .cra_name = "cbc(des)", - .cra_driver_name = "cbc-des-s390", - .cra_priority = 400, /* combo: des + cbc */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des_setkey, - .encrypt = cbc_des_encrypt, - .decrypt = cbc_des_decrypt, - } - } +static struct skcipher_alg cbc_des_alg = { + .base.cra_name = "cbc(des)", + .base.cra_driver_name = "cbc-des-s390", + .base.cra_priority = 400, /* combo: des + cbc */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des_setkey_skcipher, + .encrypt = cbc_des_encrypt, + .decrypt = cbc_des_decrypt, }; /* @@ -232,6 +211,12 @@ static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, return 0; } +static int des3_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + return des3_setkey(crypto_skcipher_tfm(tfm), key, key_len); +} + static void des3_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); @@ -266,87 +251,53 @@ static struct crypto_alg des3_alg = { } }; -static int ecb_des3_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_des3_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, CPACF_KM_TDEA_192, &walk); + return ecb_desall_crypt(req, CPACF_KM_TDEA_192); } -static int ecb_des3_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_des3_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, CPACF_KM_TDEA_192 | CPACF_DECRYPT, - &walk); + return ecb_desall_crypt(req, CPACF_KM_TDEA_192 | CPACF_DECRYPT); } -static struct crypto_alg ecb_des3_alg = { - .cra_name = "ecb(des3_ede)", - .cra_driver_name = "ecb-des3_ede-s390", - .cra_priority = 400, /* combo: des3 + ecb */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_KEY_SIZE, - .max_keysize = DES3_KEY_SIZE, - .setkey = des3_setkey, - .encrypt = ecb_des3_encrypt, - .decrypt = ecb_des3_decrypt, - } - } +static struct skcipher_alg ecb_des3_alg = { + .base.cra_name = "ecb(des3_ede)", + .base.cra_driver_name = "ecb-des3_ede-s390", + .base.cra_priority = 400, /* combo: des3 + ecb */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .setkey = des3_setkey_skcipher, + .encrypt = ecb_des3_encrypt, + .decrypt = ecb_des3_decrypt, }; -static int cbc_des3_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_des3_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, CPACF_KMC_TDEA_192, &walk); + return cbc_desall_crypt(req, CPACF_KMC_TDEA_192); } -static int cbc_des3_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_des3_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, CPACF_KMC_TDEA_192 | CPACF_DECRYPT, - &walk); + return cbc_desall_crypt(req, CPACF_KMC_TDEA_192 | CPACF_DECRYPT); } -static struct crypto_alg cbc_des3_alg = { - .cra_name = "cbc(des3_ede)", - .cra_driver_name = "cbc-des3_ede-s390", - .cra_priority = 400, /* combo: des3 + cbc */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_KEY_SIZE, - .max_keysize = DES3_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des3_setkey, - .encrypt = cbc_des3_encrypt, - .decrypt = cbc_des3_decrypt, - } - } +static struct skcipher_alg cbc_des3_alg = { + .base.cra_name = "cbc(des3_ede)", + .base.cra_driver_name = "cbc-des3_ede-s390", + .base.cra_priority = 400, /* combo: des3 + cbc */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey_skcipher, + .encrypt = cbc_des3_encrypt, + .decrypt = cbc_des3_decrypt, }; static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) @@ -364,128 +315,90 @@ static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) return n; } -static int ctr_desall_crypt(struct blkcipher_desc *desc, unsigned long fc, - struct blkcipher_walk *walk) +static int ctr_desall_crypt(struct skcipher_request *req, unsigned long fc) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); u8 buf[DES_BLOCK_SIZE], *ctrptr; + struct skcipher_walk walk; unsigned int n, nbytes; int ret, locked; locked = mutex_trylock(&ctrblk_lock); - ret = blkcipher_walk_virt_block(desc, walk, DES_BLOCK_SIZE); - while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) >= DES_BLOCK_SIZE) { n = DES_BLOCK_SIZE; if (nbytes >= 2*DES_BLOCK_SIZE && locked) - n = __ctrblk_init(ctrblk, walk->iv, nbytes); - ctrptr = (n > DES_BLOCK_SIZE) ? ctrblk : walk->iv; - cpacf_kmctr(fc, ctx->key, walk->dst.virt.addr, - walk->src.virt.addr, n, ctrptr); + n = __ctrblk_init(ctrblk, walk.iv, nbytes); + ctrptr = (n > DES_BLOCK_SIZE) ? ctrblk : walk.iv; + cpacf_kmctr(fc, ctx->key, walk.dst.virt.addr, + walk.src.virt.addr, n, ctrptr); if (ctrptr == ctrblk) - memcpy(walk->iv, ctrptr + n - DES_BLOCK_SIZE, + memcpy(walk.iv, ctrptr + n - DES_BLOCK_SIZE, DES_BLOCK_SIZE); - crypto_inc(walk->iv, DES_BLOCK_SIZE); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + crypto_inc(walk.iv, DES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); } if (locked) mutex_unlock(&ctrblk_lock); /* final block may be < DES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { - cpacf_kmctr(fc, ctx->key, buf, walk->src.virt.addr, - DES_BLOCK_SIZE, walk->iv); - memcpy(walk->dst.virt.addr, buf, nbytes); - crypto_inc(walk->iv, DES_BLOCK_SIZE); - ret = blkcipher_walk_done(desc, walk, 0); + cpacf_kmctr(fc, ctx->key, buf, walk.src.virt.addr, + DES_BLOCK_SIZE, walk.iv); + memcpy(walk.dst.virt.addr, buf, nbytes); + crypto_inc(walk.iv, DES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, 0); } return ret; } -static int ctr_des_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, CPACF_KMCTR_DEA, &walk); -} - -static int ctr_des_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ctr_des_crypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, CPACF_KMCTR_DEA | CPACF_DECRYPT, &walk); + return ctr_desall_crypt(req, CPACF_KMCTR_DEA); } -static struct crypto_alg ctr_des_alg = { - .cra_name = "ctr(des)", - .cra_driver_name = "ctr-des-s390", - .cra_priority = 400, /* combo: des + ctr */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des_setkey, - .encrypt = ctr_des_encrypt, - .decrypt = ctr_des_decrypt, - } - } +static struct skcipher_alg ctr_des_alg = { + .base.cra_name = "ctr(des)", + .base.cra_driver_name = "ctr-des-s390", + .base.cra_priority = 400, /* combo: des + ctr */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des_setkey_skcipher, + .encrypt = ctr_des_crypt, + .decrypt = ctr_des_crypt, + .chunksize = DES_BLOCK_SIZE, }; -static int ctr_des3_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, CPACF_KMCTR_TDEA_192, &walk); -} - -static int ctr_des3_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ctr_des3_crypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, CPACF_KMCTR_TDEA_192 | CPACF_DECRYPT, - &walk); + return ctr_desall_crypt(req, CPACF_KMCTR_TDEA_192); } -static struct crypto_alg ctr_des3_alg = { - .cra_name = "ctr(des3_ede)", - .cra_driver_name = "ctr-des3_ede-s390", - .cra_priority = 400, /* combo: des3 + ede */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct s390_des_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_KEY_SIZE, - .max_keysize = DES3_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des3_setkey, - .encrypt = ctr_des3_encrypt, - .decrypt = ctr_des3_decrypt, - } - } +static struct skcipher_alg ctr_des3_alg = { + .base.cra_name = "ctr(des3_ede)", + .base.cra_driver_name = "ctr-des3_ede-s390", + .base.cra_priority = 400, /* combo: des3 + ede */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey_skcipher, + .encrypt = ctr_des3_crypt, + .decrypt = ctr_des3_crypt, + .chunksize = DES_BLOCK_SIZE, }; -static struct crypto_alg *des_s390_algs_ptr[8]; +static struct crypto_alg *des_s390_algs_ptr[2]; static int des_s390_algs_num; +static struct skcipher_alg *des_s390_skciphers_ptr[6]; +static int des_s390_skciphers_num; static int des_s390_register_alg(struct crypto_alg *alg) { @@ -497,10 +410,22 @@ static int des_s390_register_alg(struct crypto_alg *alg) return ret; } +static int des_s390_register_skcipher(struct skcipher_alg *alg) +{ + int ret; + + ret = crypto_register_skcipher(alg); + if (!ret) + des_s390_skciphers_ptr[des_s390_skciphers_num++] = alg; + return ret; +} + static void des_s390_exit(void) { while (des_s390_algs_num--) crypto_unregister_alg(des_s390_algs_ptr[des_s390_algs_num]); + while (des_s390_skciphers_num--) + crypto_unregister_skcipher(des_s390_skciphers_ptr[des_s390_skciphers_num]); if (ctrblk) free_page((unsigned long) ctrblk); } @@ -518,12 +443,12 @@ static int __init des_s390_init(void) ret = des_s390_register_alg(&des_alg); if (ret) goto out_err; - ret = des_s390_register_alg(&ecb_des_alg); + ret = des_s390_register_skcipher(&ecb_des_alg); if (ret) goto out_err; } if (cpacf_test_func(&kmc_functions, CPACF_KMC_DEA)) { - ret = des_s390_register_alg(&cbc_des_alg); + ret = des_s390_register_skcipher(&cbc_des_alg); if (ret) goto out_err; } @@ -531,12 +456,12 @@ static int __init des_s390_init(void) ret = des_s390_register_alg(&des3_alg); if (ret) goto out_err; - ret = des_s390_register_alg(&ecb_des3_alg); + ret = des_s390_register_skcipher(&ecb_des3_alg); if (ret) goto out_err; } if (cpacf_test_func(&kmc_functions, CPACF_KMC_TDEA_192)) { - ret = des_s390_register_alg(&cbc_des3_alg); + ret = des_s390_register_skcipher(&cbc_des3_alg); if (ret) goto out_err; } @@ -551,12 +476,12 @@ static int __init des_s390_init(void) } if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_DEA)) { - ret = des_s390_register_alg(&ctr_des_alg); + ret = des_s390_register_skcipher(&ctr_des_alg); if (ret) goto out_err; } if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_TDEA_192)) { - ret = des_s390_register_alg(&ctr_des3_alg); + ret = des_s390_register_skcipher(&ctr_des3_alg); if (ret) goto out_err; } diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index 6184dceed340..c7119c617b6e 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -21,6 +21,7 @@ #include <linux/cpufeature.h> #include <linux/init.h> #include <linux/spinlock.h> +#include <crypto/internal/skcipher.h> #include <crypto/xts.h> #include <asm/cpacf.h> #include <asm/pkey.h> @@ -123,27 +124,27 @@ static int __paes_set_key(struct s390_paes_ctx *ctx) return ctx->fc ? 0 : -EINVAL; } -static int ecb_paes_init(struct crypto_tfm *tfm) +static int ecb_paes_init(struct crypto_skcipher *tfm) { - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; return 0; } -static void ecb_paes_exit(struct crypto_tfm *tfm) +static void ecb_paes_exit(struct crypto_skcipher *tfm) { - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); } -static int ecb_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { int rc; - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); @@ -151,91 +152,75 @@ static int ecb_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return rc; if (__paes_set_key(ctx)) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } return 0; } -static int ecb_paes_crypt(struct blkcipher_desc *desc, - unsigned long modifier, - struct blkcipher_walk *walk) +static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) { - struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int nbytes, n, k; int ret; - ret = blkcipher_walk_virt(desc, walk); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); k = cpacf_km(ctx->fc | modifier, ctx->pk.protkey, - walk->dst.virt.addr, walk->src.virt.addr, n); + walk.dst.virt.addr, walk.src.virt.addr, n); if (k) - ret = blkcipher_walk_done(desc, walk, nbytes - k); + ret = skcipher_walk_done(&walk, nbytes - k); if (k < n) { if (__paes_set_key(ctx) != 0) - return blkcipher_walk_done(desc, walk, -EIO); + return skcipher_walk_done(&walk, -EIO); } } return ret; } -static int ecb_paes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_paes_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_paes_crypt(desc, CPACF_ENCRYPT, &walk); + return ecb_paes_crypt(req, 0); } -static int ecb_paes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_paes_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_paes_crypt(desc, CPACF_DECRYPT, &walk); + return ecb_paes_crypt(req, CPACF_DECRYPT); } -static struct crypto_alg ecb_paes_alg = { - .cra_name = "ecb(paes)", - .cra_driver_name = "ecb-paes-s390", - .cra_priority = 401, /* combo: aes + ecb + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_paes_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ecb_paes_alg.cra_list), - .cra_init = ecb_paes_init, - .cra_exit = ecb_paes_exit, - .cra_u = { - .blkcipher = { - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .setkey = ecb_paes_set_key, - .encrypt = ecb_paes_encrypt, - .decrypt = ecb_paes_decrypt, - } - } +static struct skcipher_alg ecb_paes_alg = { + .base.cra_name = "ecb(paes)", + .base.cra_driver_name = "ecb-paes-s390", + .base.cra_priority = 401, /* combo: aes + ecb + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ecb_paes_alg.base.cra_list), + .init = ecb_paes_init, + .exit = ecb_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .setkey = ecb_paes_set_key, + .encrypt = ecb_paes_encrypt, + .decrypt = ecb_paes_decrypt, }; -static int cbc_paes_init(struct crypto_tfm *tfm) +static int cbc_paes_init(struct crypto_skcipher *tfm) { - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; return 0; } -static void cbc_paes_exit(struct crypto_tfm *tfm) +static void cbc_paes_exit(struct crypto_skcipher *tfm) { - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); } @@ -258,11 +243,11 @@ static int __cbc_paes_set_key(struct s390_paes_ctx *ctx) return ctx->fc ? 0 : -EINVAL; } -static int cbc_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { int rc; - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); @@ -270,16 +255,17 @@ static int cbc_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return rc; if (__cbc_paes_set_key(ctx)) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } return 0; } -static int cbc_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier, - struct blkcipher_walk *walk) +static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) { - struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int nbytes, n, k; int ret; struct { @@ -287,73 +273,60 @@ static int cbc_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier, u8 key[MAXPROTKEYSIZE]; } param; - ret = blkcipher_walk_virt(desc, walk); - memcpy(param.iv, walk->iv, AES_BLOCK_SIZE); + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); k = cpacf_kmc(ctx->fc | modifier, ¶m, - walk->dst.virt.addr, walk->src.virt.addr, n); - if (k) - ret = blkcipher_walk_done(desc, walk, nbytes - k); + walk.dst.virt.addr, walk.src.virt.addr, n); + if (k) { + memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - k); + } if (k < n) { if (__cbc_paes_set_key(ctx) != 0) - return blkcipher_walk_done(desc, walk, -EIO); + return skcipher_walk_done(&walk, -EIO); memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); } } - memcpy(walk->iv, param.iv, AES_BLOCK_SIZE); return ret; } -static int cbc_paes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_paes_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_paes_crypt(desc, 0, &walk); + return cbc_paes_crypt(req, 0); } -static int cbc_paes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_paes_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_paes_crypt(desc, CPACF_DECRYPT, &walk); + return cbc_paes_crypt(req, CPACF_DECRYPT); } -static struct crypto_alg cbc_paes_alg = { - .cra_name = "cbc(paes)", - .cra_driver_name = "cbc-paes-s390", - .cra_priority = 402, /* ecb-paes-s390 + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_paes_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(cbc_paes_alg.cra_list), - .cra_init = cbc_paes_init, - .cra_exit = cbc_paes_exit, - .cra_u = { - .blkcipher = { - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = cbc_paes_set_key, - .encrypt = cbc_paes_encrypt, - .decrypt = cbc_paes_decrypt, - } - } +static struct skcipher_alg cbc_paes_alg = { + .base.cra_name = "cbc(paes)", + .base.cra_driver_name = "cbc-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(cbc_paes_alg.base.cra_list), + .init = cbc_paes_init, + .exit = cbc_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = cbc_paes_set_key, + .encrypt = cbc_paes_encrypt, + .decrypt = cbc_paes_decrypt, }; -static int xts_paes_init(struct crypto_tfm *tfm) +static int xts_paes_init(struct crypto_skcipher *tfm) { - struct s390_pxts_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb[0].key = NULL; ctx->kb[1].key = NULL; @@ -361,9 +334,9 @@ static int xts_paes_init(struct crypto_tfm *tfm) return 0; } -static void xts_paes_exit(struct crypto_tfm *tfm) +static void xts_paes_exit(struct crypto_skcipher *tfm) { - struct s390_pxts_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb[0]); _free_kb_keybuf(&ctx->kb[1]); @@ -391,11 +364,11 @@ static int __xts_paes_set_key(struct s390_pxts_ctx *ctx) return ctx->fc ? 0 : -EINVAL; } -static int xts_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int xts_key_len) { int rc; - struct s390_pxts_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); u8 ckey[2 * AES_MAX_KEY_SIZE]; unsigned int ckey_len, key_len; @@ -414,7 +387,7 @@ static int xts_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return rc; if (__xts_paes_set_key(ctx)) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -427,13 +400,14 @@ static int xts_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, AES_KEYSIZE_128 : AES_KEYSIZE_256; memcpy(ckey, ctx->pk[0].protkey, ckey_len); memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len); - return xts_check_key(tfm, ckey, 2*ckey_len); + return xts_verify_key(tfm, ckey, 2*ckey_len); } -static int xts_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier, - struct blkcipher_walk *walk) +static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) { - struct s390_pxts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int keylen, offset, nbytes, n, k; int ret; struct { @@ -448,90 +422,76 @@ static int xts_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier, u8 init[16]; } xts_param; - ret = blkcipher_walk_virt(desc, walk); + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64; offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0; retry: memset(&pcc_param, 0, sizeof(pcc_param)); - memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak)); + memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); cpacf_pcc(ctx->fc, pcc_param.key + offset); memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen); memcpy(xts_param.init, pcc_param.xts, 16); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); k = cpacf_km(ctx->fc | modifier, xts_param.key + offset, - walk->dst.virt.addr, walk->src.virt.addr, n); + walk.dst.virt.addr, walk.src.virt.addr, n); if (k) - ret = blkcipher_walk_done(desc, walk, nbytes - k); + ret = skcipher_walk_done(&walk, nbytes - k); if (k < n) { if (__xts_paes_set_key(ctx) != 0) - return blkcipher_walk_done(desc, walk, -EIO); + return skcipher_walk_done(&walk, -EIO); goto retry; } } return ret; } -static int xts_paes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int xts_paes_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return xts_paes_crypt(desc, 0, &walk); + return xts_paes_crypt(req, 0); } -static int xts_paes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int xts_paes_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return xts_paes_crypt(desc, CPACF_DECRYPT, &walk); + return xts_paes_crypt(req, CPACF_DECRYPT); } -static struct crypto_alg xts_paes_alg = { - .cra_name = "xts(paes)", - .cra_driver_name = "xts-paes-s390", - .cra_priority = 402, /* ecb-paes-s390 + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_pxts_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(xts_paes_alg.cra_list), - .cra_init = xts_paes_init, - .cra_exit = xts_paes_exit, - .cra_u = { - .blkcipher = { - .min_keysize = 2 * PAES_MIN_KEYSIZE, - .max_keysize = 2 * PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_paes_set_key, - .encrypt = xts_paes_encrypt, - .decrypt = xts_paes_decrypt, - } - } +static struct skcipher_alg xts_paes_alg = { + .base.cra_name = "xts(paes)", + .base.cra_driver_name = "xts-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_pxts_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(xts_paes_alg.base.cra_list), + .init = xts_paes_init, + .exit = xts_paes_exit, + .min_keysize = 2 * PAES_MIN_KEYSIZE, + .max_keysize = 2 * PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_paes_set_key, + .encrypt = xts_paes_encrypt, + .decrypt = xts_paes_decrypt, }; -static int ctr_paes_init(struct crypto_tfm *tfm) +static int ctr_paes_init(struct crypto_skcipher *tfm) { - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; return 0; } -static void ctr_paes_exit(struct crypto_tfm *tfm) +static void ctr_paes_exit(struct crypto_skcipher *tfm) { - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); } @@ -555,11 +515,11 @@ static int __ctr_paes_set_key(struct s390_paes_ctx *ctx) return ctx->fc ? 0 : -EINVAL; } -static int ctr_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { int rc; - struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); @@ -567,7 +527,7 @@ static int ctr_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return rc; if (__ctr_paes_set_key(ctx)) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } return 0; @@ -588,37 +548,37 @@ static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) return n; } -static int ctr_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier, - struct blkcipher_walk *walk) +static int ctr_paes_crypt(struct skcipher_request *req) { - struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); u8 buf[AES_BLOCK_SIZE], *ctrptr; + struct skcipher_walk walk; unsigned int nbytes, n, k; int ret, locked; locked = spin_trylock(&ctrblk_lock); - ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE); - while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { n = AES_BLOCK_SIZE; if (nbytes >= 2*AES_BLOCK_SIZE && locked) - n = __ctrblk_init(ctrblk, walk->iv, nbytes); - ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv; - k = cpacf_kmctr(ctx->fc | modifier, ctx->pk.protkey, - walk->dst.virt.addr, walk->src.virt.addr, - n, ctrptr); + n = __ctrblk_init(ctrblk, walk.iv, nbytes); + ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv; + k = cpacf_kmctr(ctx->fc, ctx->pk.protkey, walk.dst.virt.addr, + walk.src.virt.addr, n, ctrptr); if (k) { if (ctrptr == ctrblk) - memcpy(walk->iv, ctrptr + k - AES_BLOCK_SIZE, + memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE, AES_BLOCK_SIZE); - crypto_inc(walk->iv, AES_BLOCK_SIZE); - ret = blkcipher_walk_done(desc, walk, nbytes - n); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); } if (k < n) { if (__ctr_paes_set_key(ctx) != 0) { if (locked) spin_unlock(&ctrblk_lock); - return blkcipher_walk_done(desc, walk, -EIO); + return skcipher_walk_done(&walk, -EIO); } } } @@ -629,80 +589,54 @@ static int ctr_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier, */ if (nbytes) { while (1) { - if (cpacf_kmctr(ctx->fc | modifier, - ctx->pk.protkey, buf, - walk->src.virt.addr, AES_BLOCK_SIZE, - walk->iv) == AES_BLOCK_SIZE) + if (cpacf_kmctr(ctx->fc, ctx->pk.protkey, buf, + walk.src.virt.addr, AES_BLOCK_SIZE, + walk.iv) == AES_BLOCK_SIZE) break; if (__ctr_paes_set_key(ctx) != 0) - return blkcipher_walk_done(desc, walk, -EIO); + return skcipher_walk_done(&walk, -EIO); } - memcpy(walk->dst.virt.addr, buf, nbytes); - crypto_inc(walk->iv, AES_BLOCK_SIZE); - ret = blkcipher_walk_done(desc, walk, 0); + memcpy(walk.dst.virt.addr, buf, nbytes); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, 0); } return ret; } -static int ctr_paes_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_paes_crypt(desc, 0, &walk); -} - -static int ctr_paes_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_paes_crypt(desc, CPACF_DECRYPT, &walk); -} - -static struct crypto_alg ctr_paes_alg = { - .cra_name = "ctr(paes)", - .cra_driver_name = "ctr-paes-s390", - .cra_priority = 402, /* ecb-paes-s390 + 1 */ - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct s390_paes_ctx), - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ctr_paes_alg.cra_list), - .cra_init = ctr_paes_init, - .cra_exit = ctr_paes_exit, - .cra_u = { - .blkcipher = { - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ctr_paes_set_key, - .encrypt = ctr_paes_encrypt, - .decrypt = ctr_paes_decrypt, - } - } +static struct skcipher_alg ctr_paes_alg = { + .base.cra_name = "ctr(paes)", + .base.cra_driver_name = "ctr-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ctr_paes_alg.base.cra_list), + .init = ctr_paes_init, + .exit = ctr_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_paes_set_key, + .encrypt = ctr_paes_crypt, + .decrypt = ctr_paes_crypt, + .chunksize = AES_BLOCK_SIZE, }; -static inline void __crypto_unregister_alg(struct crypto_alg *alg) +static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg) { - if (!list_empty(&alg->cra_list)) - crypto_unregister_alg(alg); + if (!list_empty(&alg->base.cra_list)) + crypto_unregister_skcipher(alg); } static void paes_s390_fini(void) { if (ctrblk) free_page((unsigned long) ctrblk); - __crypto_unregister_alg(&ctr_paes_alg); - __crypto_unregister_alg(&xts_paes_alg); - __crypto_unregister_alg(&cbc_paes_alg); - __crypto_unregister_alg(&ecb_paes_alg); + __crypto_unregister_skcipher(&ctr_paes_alg); + __crypto_unregister_skcipher(&xts_paes_alg); + __crypto_unregister_skcipher(&cbc_paes_alg); + __crypto_unregister_skcipher(&ecb_paes_alg); } static int __init paes_s390_init(void) @@ -717,7 +651,7 @@ static int __init paes_s390_init(void) if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) || cpacf_test_func(&km_functions, CPACF_KM_PAES_192) || cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) { - ret = crypto_register_alg(&ecb_paes_alg); + ret = crypto_register_skcipher(&ecb_paes_alg); if (ret) goto out_err; } @@ -725,14 +659,14 @@ static int __init paes_s390_init(void) if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) || cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) || cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) { - ret = crypto_register_alg(&cbc_paes_alg); + ret = crypto_register_skcipher(&cbc_paes_alg); if (ret) goto out_err; } if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) || cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) { - ret = crypto_register_alg(&xts_paes_alg); + ret = crypto_register_skcipher(&xts_paes_alg); if (ret) goto out_err; } @@ -740,7 +674,7 @@ static int __init paes_s390_init(void) if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) { - ret = crypto_register_alg(&ctr_paes_alg); + ret = crypto_register_skcipher(&ctr_paes_alg); if (ret) goto out_err; ctrblk = (u8 *) __get_free_page(GFP_KERNEL); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 7e0eb4020917..37695499717d 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -15,6 +15,8 @@ /* Handle ro_after_init data on our own. */ #define RO_AFTER_INIT_DATA +#define EMITS_PT_NOTE + #include <asm-generic/vmlinux.lds.h> #include <asm/vmlinux.lds.h> @@ -50,11 +52,7 @@ SECTIONS _etext = .; /* End of text section */ } :text = 0x0700 - NOTES :text :note - - .dummy : { *(.dummy) } :data - - RO_DATA_SECTION(PAGE_SIZE) + RO_DATA(PAGE_SIZE) . = ALIGN(PAGE_SIZE); _sdata = .; /* Start of data section */ @@ -64,12 +62,12 @@ SECTIONS .data..ro_after_init : { *(.data..ro_after_init) JUMP_TABLE_DATA - } + } :data EXCEPTION_TABLE(16) . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; - RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE) + RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) BOOT_DATA_PRESERVED _edata = .; /* End of data section */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index c475ca49cfc6..8df10d3c8f6c 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -247,9 +247,9 @@ void vtime_account_irq_enter(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -void vtime_account_system(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) __attribute__((alias("vtime_account_irq_enter"))); -EXPORT_SYMBOL_GPL(vtime_account_system); +EXPORT_SYMBOL_GPL(vtime_account_kernel); /* * Sorted add to a list. List is linear searched until first bigger diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index ce88211b9c6c..8d2134136290 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -23,6 +23,8 @@ #include <linux/filter.h> #include <linux/init.h> #include <linux/bpf.h> +#include <linux/mm.h> +#include <linux/kernel.h> #include <asm/cacheflush.h> #include <asm/dis.h> #include <asm/facility.h> @@ -38,10 +40,11 @@ struct bpf_jit { int size; /* Size of program and literal pool */ int size_prg; /* Size of program */ int prg; /* Current position in program */ - int lit_start; /* Start of literal pool */ - int lit; /* Current position in literal pool */ + int lit32_start; /* Start of 32-bit literal pool */ + int lit32; /* Current position in 32-bit literal pool */ + int lit64_start; /* Start of 64-bit literal pool */ + int lit64; /* Current position in 64-bit literal pool */ int base_ip; /* Base address for literal pool */ - int ret0_ip; /* Address of return 0 */ int exit_ip; /* Address of exit */ int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ @@ -49,14 +52,10 @@ struct bpf_jit { int labels[1]; /* Labels for local jumps */ }; -#define BPF_SIZE_MAX 0xffff /* Max size for program (16 bit branches) */ - -#define SEEN_MEM (1 << 0) /* use mem[] for temporary storage */ -#define SEEN_RET0 (1 << 1) /* ret0_ip points to a valid return 0 */ -#define SEEN_LITERAL (1 << 2) /* code uses literals */ -#define SEEN_FUNC (1 << 3) /* calls C functions */ -#define SEEN_TAIL_CALL (1 << 4) /* code uses tail calls */ -#define SEEN_REG_AX (1 << 5) /* code uses constant blinding */ +#define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ +#define SEEN_LITERAL BIT(1) /* code uses literals */ +#define SEEN_FUNC BIT(2) /* calls C functions */ +#define SEEN_TAIL_CALL BIT(3) /* code uses tail calls */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM) /* @@ -131,13 +130,13 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define _EMIT2(op) \ ({ \ if (jit->prg_buf) \ - *(u16 *) (jit->prg_buf + jit->prg) = op; \ + *(u16 *) (jit->prg_buf + jit->prg) = (op); \ jit->prg += 2; \ }) #define EMIT2(op, b1, b2) \ ({ \ - _EMIT2(op | reg(b1, b2)); \ + _EMIT2((op) | reg(b1, b2)); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) @@ -145,20 +144,20 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define _EMIT4(op) \ ({ \ if (jit->prg_buf) \ - *(u32 *) (jit->prg_buf + jit->prg) = op; \ + *(u32 *) (jit->prg_buf + jit->prg) = (op); \ jit->prg += 4; \ }) #define EMIT4(op, b1, b2) \ ({ \ - _EMIT4(op | reg(b1, b2)); \ + _EMIT4((op) | reg(b1, b2)); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) #define EMIT4_RRF(op, b1, b2, b3) \ ({ \ - _EMIT4(op | reg_high(b3) << 8 | reg(b1, b2)); \ + _EMIT4((op) | reg_high(b3) << 8 | reg(b1, b2)); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ REG_SET_SEEN(b3); \ @@ -167,13 +166,13 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define _EMIT4_DISP(op, disp) \ ({ \ unsigned int __disp = (disp) & 0xfff; \ - _EMIT4(op | __disp); \ + _EMIT4((op) | __disp); \ }) #define EMIT4_DISP(op, b1, b2, disp) \ ({ \ - _EMIT4_DISP(op | reg_high(b1) << 16 | \ - reg_high(b2) << 8, disp); \ + _EMIT4_DISP((op) | reg_high(b1) << 16 | \ + reg_high(b2) << 8, (disp)); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) @@ -181,21 +180,27 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT4_IMM(op, b1, imm) \ ({ \ unsigned int __imm = (imm) & 0xffff; \ - _EMIT4(op | reg_high(b1) << 16 | __imm); \ + _EMIT4((op) | reg_high(b1) << 16 | __imm); \ REG_SET_SEEN(b1); \ }) #define EMIT4_PCREL(op, pcrel) \ ({ \ long __pcrel = ((pcrel) >> 1) & 0xffff; \ - _EMIT4(op | __pcrel); \ + _EMIT4((op) | __pcrel); \ +}) + +#define EMIT4_PCREL_RIC(op, mask, target) \ +({ \ + int __rel = ((target) - jit->prg) / 2; \ + _EMIT4((op) | (mask) << 20 | (__rel & 0xffff)); \ }) #define _EMIT6(op1, op2) \ ({ \ if (jit->prg_buf) { \ - *(u32 *) (jit->prg_buf + jit->prg) = op1; \ - *(u16 *) (jit->prg_buf + jit->prg + 4) = op2; \ + *(u32 *) (jit->prg_buf + jit->prg) = (op1); \ + *(u16 *) (jit->prg_buf + jit->prg + 4) = (op2); \ } \ jit->prg += 6; \ }) @@ -203,20 +208,20 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define _EMIT6_DISP(op1, op2, disp) \ ({ \ unsigned int __disp = (disp) & 0xfff; \ - _EMIT6(op1 | __disp, op2); \ + _EMIT6((op1) | __disp, op2); \ }) #define _EMIT6_DISP_LH(op1, op2, disp) \ ({ \ - u32 _disp = (u32) disp; \ + u32 _disp = (u32) (disp); \ unsigned int __disp_h = _disp & 0xff000; \ unsigned int __disp_l = _disp & 0x00fff; \ - _EMIT6(op1 | __disp_l, op2 | __disp_h >> 4); \ + _EMIT6((op1) | __disp_l, (op2) | __disp_h >> 4); \ }) #define EMIT6_DISP_LH(op1, op2, b1, b2, b3, disp) \ ({ \ - _EMIT6_DISP_LH(op1 | reg(b1, b2) << 16 | \ + _EMIT6_DISP_LH((op1) | reg(b1, b2) << 16 | \ reg_high(b3) << 8, op2, disp); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ @@ -226,8 +231,8 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \ ({ \ int rel = (jit->labels[label] - jit->prg) >> 1; \ - _EMIT6(op1 | reg(b1, b2) << 16 | (rel & 0xffff), \ - op2 | mask << 12); \ + _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ + (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) @@ -235,68 +240,83 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \ ({ \ int rel = (jit->labels[label] - jit->prg) >> 1; \ - _EMIT6(op1 | (reg_high(b1) | mask) << 16 | \ - (rel & 0xffff), op2 | (imm & 0xff) << 8); \ + _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ + (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ - BUILD_BUG_ON(((unsigned long) imm) > 0xff); \ + BUILD_BUG_ON(((unsigned long) (imm)) > 0xff); \ }) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ /* Branch instruction needs 6 bytes */ \ - int rel = (addrs[i + off + 1] - (addrs[i + 1] - 6)) / 2;\ - _EMIT6(op1 | reg(b1, b2) << 16 | (rel & 0xffff), op2 | mask); \ + int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\ + _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) #define EMIT6_PCREL_RILB(op, b, target) \ ({ \ - int rel = (target - jit->prg) / 2; \ - _EMIT6(op | reg_high(b) << 16 | rel >> 16, rel & 0xffff); \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ + _EMIT6((op) | reg_high(b) << 16 | rel >> 16, rel & 0xffff);\ REG_SET_SEEN(b); \ }) #define EMIT6_PCREL_RIL(op, target) \ ({ \ - int rel = (target - jit->prg) / 2; \ - _EMIT6(op | rel >> 16, rel & 0xffff); \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ + _EMIT6((op) | rel >> 16, rel & 0xffff); \ +}) + +#define EMIT6_PCREL_RILC(op, mask, target) \ +({ \ + EMIT6_PCREL_RIL((op) | (mask) << 20, (target)); \ }) #define _EMIT6_IMM(op, imm) \ ({ \ unsigned int __imm = (imm); \ - _EMIT6(op | (__imm >> 16), __imm & 0xffff); \ + _EMIT6((op) | (__imm >> 16), __imm & 0xffff); \ }) #define EMIT6_IMM(op, b1, imm) \ ({ \ - _EMIT6_IMM(op | reg_high(b1) << 16, imm); \ + _EMIT6_IMM((op) | reg_high(b1) << 16, imm); \ REG_SET_SEEN(b1); \ }) -#define EMIT_CONST_U32(val) \ +#define _EMIT_CONST_U32(val) \ ({ \ unsigned int ret; \ - ret = jit->lit - jit->base_ip; \ - jit->seen |= SEEN_LITERAL; \ + ret = jit->lit32; \ if (jit->prg_buf) \ - *(u32 *) (jit->prg_buf + jit->lit) = (u32) val; \ - jit->lit += 4; \ + *(u32 *)(jit->prg_buf + jit->lit32) = (u32)(val);\ + jit->lit32 += 4; \ ret; \ }) -#define EMIT_CONST_U64(val) \ +#define EMIT_CONST_U32(val) \ ({ \ - unsigned int ret; \ - ret = jit->lit - jit->base_ip; \ jit->seen |= SEEN_LITERAL; \ + _EMIT_CONST_U32(val) - jit->base_ip; \ +}) + +#define _EMIT_CONST_U64(val) \ +({ \ + unsigned int ret; \ + ret = jit->lit64; \ if (jit->prg_buf) \ - *(u64 *) (jit->prg_buf + jit->lit) = (u64) val; \ - jit->lit += 8; \ + *(u64 *)(jit->prg_buf + jit->lit64) = (u64)(val);\ + jit->lit64 += 8; \ ret; \ }) +#define EMIT_CONST_U64(val) \ +({ \ + jit->seen |= SEEN_LITERAL; \ + _EMIT_CONST_U64(val) - jit->base_ip; \ +}) + #define EMIT_ZERO(b1) \ ({ \ if (!fp->aux->verifier_zext) { \ @@ -307,6 +327,67 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) }) /* + * Return whether this is the first pass. The first pass is special, since we + * don't know any sizes yet, and thus must be conservative. + */ +static bool is_first_pass(struct bpf_jit *jit) +{ + return jit->size == 0; +} + +/* + * Return whether this is the code generation pass. The code generation pass is + * special, since we should change as little as possible. + */ +static bool is_codegen_pass(struct bpf_jit *jit) +{ + return jit->prg_buf; +} + +/* + * Return whether "rel" can be encoded as a short PC-relative offset + */ +static bool is_valid_rel(int rel) +{ + return rel >= -65536 && rel <= 65534; +} + +/* + * Return whether "off" can be reached using a short PC-relative offset + */ +static bool can_use_rel(struct bpf_jit *jit, int off) +{ + return is_valid_rel(off - jit->prg); +} + +/* + * Return whether given displacement can be encoded using + * Long-Displacement Facility + */ +static bool is_valid_ldisp(int disp) +{ + return disp >= -524288 && disp <= 524287; +} + +/* + * Return whether the next 32-bit literal pool entry can be referenced using + * Long-Displacement Facility + */ +static bool can_use_ldisp_for_lit32(struct bpf_jit *jit) +{ + return is_valid_ldisp(jit->lit32 - jit->base_ip); +} + +/* + * Return whether the next 64-bit literal pool entry can be referenced using + * Long-Displacement Facility + */ +static bool can_use_ldisp_for_lit64(struct bpf_jit *jit) +{ + return is_valid_ldisp(jit->lit64 - jit->base_ip); +} + +/* * Fill whole space with illegal instructions */ static void jit_fill_hole(void *area, unsigned int size) @@ -383,9 +464,18 @@ static int get_end(struct bpf_jit *jit, int start) */ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) { - + const int last = 15, save_restore_size = 6; int re = 6, rs; + if (is_first_pass(jit)) { + /* + * We don't know yet which registers are used. Reserve space + * conservatively. + */ + jit->prg += (last - re + 1) * save_restore_size; + return; + } + do { rs = get_start(jit, re); if (!rs) @@ -396,7 +486,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) else restore_regs(jit, rs, re, stack_depth); re++; - } while (re <= 15); + } while (re <= last); } /* @@ -420,21 +510,28 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) /* Save registers */ save_restore_regs(jit, REGS_SAVE, stack_depth); /* Setup literal pool */ - if (jit->seen & SEEN_LITERAL) { - /* basr %r13,0 */ - EMIT2(0x0d00, REG_L, REG_0); - jit->base_ip = jit->prg; + if (is_first_pass(jit) || (jit->seen & SEEN_LITERAL)) { + if (!is_first_pass(jit) && + is_valid_ldisp(jit->size - (jit->prg + 2))) { + /* basr %l,0 */ + EMIT2(0x0d00, REG_L, REG_0); + jit->base_ip = jit->prg; + } else { + /* larl %l,lit32_start */ + EMIT6_PCREL_RILB(0xc0000000, REG_L, jit->lit32_start); + jit->base_ip = jit->lit32_start; + } } /* Setup stack and backchain */ - if (jit->seen & SEEN_STACK) { - if (jit->seen & SEEN_FUNC) + if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) { + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) /* lgr %w1,%r15 (backchain) */ EMIT4(0xb9040000, REG_W1, REG_15); /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */ EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED); /* aghi %r15,-STK_OFF */ EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth)); - if (jit->seen & SEEN_FUNC) + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) /* stg %w1,152(%r15) (backchain) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, 152); @@ -446,12 +543,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) */ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) { - /* Return 0 */ - if (jit->seen & SEEN_RET0) { - jit->ret0_ip = jit->prg; - /* lghi %b0,0 */ - EMIT4_IMM(0xa7090000, BPF_REG_0, 0); - } jit->exit_ip = jit->prg; /* Load exit code: lgr %r2,%b0 */ EMIT4(0xb9040000, REG_2, BPF_REG_0); @@ -476,7 +567,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) _EMIT2(0x07fe); if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && - (jit->seen & SEEN_FUNC)) { + (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) { jit->r1_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r1 thunk */ if (test_facility(35)) { @@ -506,16 +597,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i, bool extra_pass) { struct bpf_insn *insn = &fp->insnsi[i]; - int jmp_off, last, insn_count = 1; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; + int last, insn_count = 1; u32 *addrs = jit->addrs; s32 imm = insn->imm; s16 off = insn->off; unsigned int mask; - if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) - jit->seen |= SEEN_REG_AX; switch (insn->code) { /* * BPF_MOV @@ -549,9 +638,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, u64 imm64; imm64 = (u64)(u32) insn[0].imm | ((u64)(u32) insn[1].imm) << 32; - /* lg %dst,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, REG_0, REG_L, - EMIT_CONST_U64(imm64)); + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, _EMIT_CONST_U64(imm64)); insn_count = 2; break; } @@ -680,9 +768,18 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7080000, REG_W0, 0); /* lr %w1,%dst */ EMIT2(0x1800, REG_W1, dst_reg); - /* dl %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L, - EMIT_CONST_U32(imm)); + if (!is_first_pass(jit) && can_use_ldisp_for_lit32(jit)) { + /* dl %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L, + EMIT_CONST_U32(imm)); + } else { + /* lgfrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40c0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dlr %w0,%dst */ + EMIT4(0xb9970000, REG_W0, dst_reg); + } /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); if (insn_is_zext(&insn[1])) @@ -704,9 +801,18 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7090000, REG_W0, 0); /* lgr %w1,%dst */ EMIT4(0xb9040000, REG_W1, dst_reg); - /* dlg %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L, - EMIT_CONST_U64(imm)); + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* dlg %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dlgr %w0,%dst */ + EMIT4(0xb9870000, REG_W0, dst_reg); + } /* lgr %dst,%rc */ EMIT4(0xb9040000, dst_reg, rc_reg); break; @@ -729,9 +835,19 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_AND | BPF_K: /* dst = dst & imm */ - /* ng %dst,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0080, dst_reg, REG_0, REG_L, - EMIT_CONST_U64(imm)); + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* ng %dst,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0080, + dst_reg, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %w0,imm */ + EMIT6_PCREL_RILB(0xc4080000, REG_W0, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* ngr %dst,%w0 */ + EMIT4(0xb9800000, dst_reg, REG_W0); + } break; /* * BPF_OR @@ -751,9 +867,19 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_OR | BPF_K: /* dst = dst | imm */ - /* og %dst,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0081, dst_reg, REG_0, REG_L, - EMIT_CONST_U64(imm)); + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* og %dst,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0081, + dst_reg, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %w0,imm */ + EMIT6_PCREL_RILB(0xc4080000, REG_W0, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* ogr %dst,%w0 */ + EMIT4(0xb9810000, dst_reg, REG_W0); + } break; /* * BPF_XOR @@ -775,9 +901,19 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */ - /* xg %dst,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0082, dst_reg, REG_0, REG_L, - EMIT_CONST_U64(imm)); + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* xg %dst,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0082, + dst_reg, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %w0,imm */ + EMIT6_PCREL_RILB(0xc4080000, REG_W0, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* xgr %dst,%w0 */ + EMIT4(0xb9820000, dst_reg, REG_W0); + } break; /* * BPF_LSH @@ -1023,9 +1159,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; - /* lg %w1,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L, - EMIT_CONST_U64(func)); + /* lgrl %w1,func */ + EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { /* brasl %r14,__s390_indirect_jump_r1 */ EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); @@ -1054,9 +1189,17 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* llgf %w1,map.max_entries(%b2) */ EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, offsetof(struct bpf_array, map.max_entries)); - /* clrj %b3,%w1,0xa,label0: if (u32)%b3 >= (u32)%w1 goto out */ - EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, - REG_W1, 0, 0xa); + /* if ((u32)%b3 >= (u32)%w1) goto out; */ + if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { + /* clrj %b3,%w1,0xa,label0 */ + EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, + REG_W1, 0, 0xa); + } else { + /* clr %b3,%w1 */ + EMIT2(0x1500, BPF_REG_3, REG_W1); + /* brcl 0xa,label0 */ + EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]); + } /* * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) @@ -1071,9 +1214,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */ - EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1, - MAX_TAIL_CALL_CNT, 0, 0x2); + if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { + /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */ + EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1, + MAX_TAIL_CALL_CNT, 0, 0x2); + } else { + /* clfi %w1,MAX_TAIL_CALL_CNT */ + EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT); + /* brcl 0x2,label0 */ + EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]); + } /* * prog = array->ptrs[index]; @@ -1085,11 +1235,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9160000, REG_1, BPF_REG_3); /* sllg %r1,%r1,3: %r1 *= 8 */ EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, REG_1, REG_0, 3); - /* lg %r1,prog(%b2,%r1) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, BPF_REG_2, + /* ltg %r1,prog(%b2,%r1) */ + EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2, REG_1, offsetof(struct bpf_array, ptrs)); - /* clgij %r1,0,0x8,label0 */ - EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007d, REG_1, 0, 0, 0x8); + if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { + /* brc 0x8,label0 */ + EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]); + } else { + /* brcl 0x8,label0 */ + EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]); + } /* * Restore registers before calling function @@ -1110,7 +1265,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, break; case BPF_JMP | BPF_EXIT: /* return b0 */ last = (i == fp->len - 1) ? 1 : 0; - if (last && !(jit->seen & SEEN_RET0)) + if (last) break; /* j <exit> */ EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg); @@ -1246,36 +1401,83 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, goto branch_oc; branch_ks: is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - /* lgfi %w1,imm (load sign extend imm) */ - EMIT6_IMM(0xc0010000, REG_W1, imm); - /* crj or cgrj %dst,%w1,mask,off */ - EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0076 : 0x0064), - dst_reg, REG_W1, i, off, mask); + /* cfi or cgfi %dst,imm */ + EMIT6_IMM(is_jmp32 ? 0xc20d0000 : 0xc20c0000, + dst_reg, imm); + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* brc mask,off */ + EMIT4_PCREL_RIC(0xa7040000, + mask >> 12, addrs[i + off + 1]); + } else { + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } break; branch_ku: is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - /* lgfi %w1,imm (load sign extend imm) */ - EMIT6_IMM(0xc0010000, REG_W1, imm); - /* clrj or clgrj %dst,%w1,mask,off */ - EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0077 : 0x0065), - dst_reg, REG_W1, i, off, mask); + /* clfi or clgfi %dst,imm */ + EMIT6_IMM(is_jmp32 ? 0xc20f0000 : 0xc20e0000, + dst_reg, imm); + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* brc mask,off */ + EMIT4_PCREL_RIC(0xa7040000, + mask >> 12, addrs[i + off + 1]); + } else { + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } break; branch_xs: is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - /* crj or cgrj %dst,%src,mask,off */ - EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0076 : 0x0064), - dst_reg, src_reg, i, off, mask); + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* crj or cgrj %dst,%src,mask,off */ + EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0076 : 0x0064), + dst_reg, src_reg, i, off, mask); + } else { + /* cr or cgr %dst,%src */ + if (is_jmp32) + EMIT2(0x1900, dst_reg, src_reg); + else + EMIT4(0xb9200000, dst_reg, src_reg); + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } break; branch_xu: is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - /* clrj or clgrj %dst,%src,mask,off */ - EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0077 : 0x0065), - dst_reg, src_reg, i, off, mask); + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* clrj or clgrj %dst,%src,mask,off */ + EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0077 : 0x0065), + dst_reg, src_reg, i, off, mask); + } else { + /* clr or clgr %dst,%src */ + if (is_jmp32) + EMIT2(0x1500, dst_reg, src_reg); + else + EMIT4(0xb9210000, dst_reg, src_reg); + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } break; branch_oc: - /* brc mask,jmp_off (branch instruction needs 4 bytes) */ - jmp_off = addrs[i + off + 1] - (addrs[i + 1] - 4); - EMIT4_PCREL(0xa7040000 | mask << 8, jmp_off); + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* brc mask,off */ + EMIT4_PCREL_RIC(0xa7040000, + mask >> 12, addrs[i + off + 1]); + } else { + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } break; } default: /* too complex, give up */ @@ -1286,28 +1488,67 @@ branch_oc: } /* + * Return whether new i-th instruction address does not violate any invariant + */ +static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i) +{ + /* On the first pass anything goes */ + if (is_first_pass(jit)) + return true; + + /* The codegen pass must not change anything */ + if (is_codegen_pass(jit)) + return jit->addrs[i] == jit->prg; + + /* Passes in between must not increase code size */ + return jit->addrs[i] >= jit->prg; +} + +/* + * Update the address of i-th instruction + */ +static int bpf_set_addr(struct bpf_jit *jit, int i) +{ + if (!bpf_is_new_addr_sane(jit, i)) + return -1; + jit->addrs[i] = jit->prg; + return 0; +} + +/* * Compile eBPF program into s390x code */ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, bool extra_pass) { - int i, insn_count; + int i, insn_count, lit32_size, lit64_size; - jit->lit = jit->lit_start; + jit->lit32 = jit->lit32_start; + jit->lit64 = jit->lit64_start; jit->prg = 0; bpf_jit_prologue(jit, fp->aux->stack_depth); + if (bpf_set_addr(jit, 0) < 0) + return -1; for (i = 0; i < fp->len; i += insn_count) { insn_count = bpf_jit_insn(jit, fp, i, extra_pass); if (insn_count < 0) return -1; /* Next instruction address */ - jit->addrs[i + insn_count] = jit->prg; + if (bpf_set_addr(jit, i + insn_count) < 0) + return -1; } bpf_jit_epilogue(jit, fp->aux->stack_depth); - jit->lit_start = jit->prg; - jit->size = jit->lit; + lit32_size = jit->lit32 - jit->lit32_start; + lit64_size = jit->lit64 - jit->lit64_start; + jit->lit32_start = jit->prg; + if (lit32_size) + jit->lit32_start = ALIGN(jit->lit32_start, 4); + jit->lit64_start = jit->lit32_start + lit32_size; + if (lit64_size) + jit->lit64_start = ALIGN(jit->lit64_start, 8); + jit->size = jit->lit64_start + lit64_size; jit->size_prg = jit->prg; return 0; } @@ -1369,7 +1610,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) } memset(&jit, 0, sizeof(jit)); - jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); + jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); if (jit.addrs == NULL) { fp = orig_fp; goto out; @@ -1388,12 +1629,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* * Final pass: Allocate and generate program */ - if (jit.size >= BPF_SIZE_MAX) { - fp = orig_fp; - goto free_addrs; - } - - header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 2, jit_fill_hole); + header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole); if (!header) { fp = orig_fp; goto free_addrs; @@ -1422,7 +1658,7 @@ skip_init_ctx: if (!fp->is_func || extra_pass) { bpf_prog_fill_jited_linfo(fp, jit.addrs + 1); free_addrs: - kfree(jit.addrs); + kvfree(jit.addrs); kfree(jit_data); fp->aux->jit_data = NULL; } diff --git a/arch/sh/boards/mach-sdk7786/nmi.c b/arch/sh/boards/mach-sdk7786/nmi.c index c2e09d798537..afba49679a12 100644 --- a/arch/sh/boards/mach-sdk7786/nmi.c +++ b/arch/sh/boards/mach-sdk7786/nmi.c @@ -37,7 +37,7 @@ static int __init nmi_mode_setup(char *str) nmi_mode = NMI_MODE_ANY; else { nmi_mode = NMI_MODE_UNKNOWN; - pr_warning("Unknown NMI mode %s\n", str); + pr_warn("Unknown NMI mode %s\n", str); } printk("Set NMI mode to %d\n", nmi_mode); diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index c15cac9251b9..e69ec12cbbe6 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -111,6 +111,11 @@ void __stack_chk_fail(void) error("stack-protector: Kernel stack is corrupted\n"); } +/* Needed because vmlinux.lds.h references this */ +void ftrace_stub(void) +{ +} + #ifdef CONFIG_SUPERH64 #define stackalign 8 #else diff --git a/arch/sh/drivers/Makefile b/arch/sh/drivers/Makefile index 3e93b434e604..56b0acace6e7 100644 --- a/arch/sh/drivers/Makefile +++ b/arch/sh/drivers/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux SuperH-specific device drivers. # -obj-y += dma/ +obj-y += dma/ platform_early.o obj-$(CONFIG_PCI) += pci/ obj-$(CONFIG_SUPERHYWAY) += superhyway/ diff --git a/arch/sh/drivers/pci/fixups-sdk7786.c b/arch/sh/drivers/pci/fixups-sdk7786.c index 8cbfa5310a4b..6972af7b4e93 100644 --- a/arch/sh/drivers/pci/fixups-sdk7786.c +++ b/arch/sh/drivers/pci/fixups-sdk7786.c @@ -53,7 +53,7 @@ static int __init sdk7786_pci_init(void) /* Warn about forced rerouting if slot#3 is occupied */ if ((data & PCIECR_PRST3) == 0) { - pr_warning("Unreachable card detected in slot#3\n"); + pr_warn("Unreachable card detected in slot#3\n"); return -EBUSY; } } else diff --git a/arch/sh/drivers/platform_early.c b/arch/sh/drivers/platform_early.c new file mode 100644 index 000000000000..f6d148451dfc --- /dev/null +++ b/arch/sh/drivers/platform_early.c @@ -0,0 +1,347 @@ +// SPDX--License-Identifier: GPL-2.0 + +#include <asm/platform_early.h> +#include <linux/mod_devicetable.h> +#include <linux/pm.h> + +static __initdata LIST_HEAD(sh_early_platform_driver_list); +static __initdata LIST_HEAD(sh_early_platform_device_list); + +static const struct platform_device_id * +platform_match_id(const struct platform_device_id *id, + struct platform_device *pdev) +{ + while (id->name[0]) { + if (strcmp(pdev->name, id->name) == 0) { + pdev->id_entry = id; + return id; + } + id++; + } + return NULL; +} + +static int platform_match(struct device *dev, struct device_driver *drv) +{ + struct platform_device *pdev = to_platform_device(dev); + struct platform_driver *pdrv = to_platform_driver(drv); + + /* When driver_override is set, only bind to the matching driver */ + if (pdev->driver_override) + return !strcmp(pdev->driver_override, drv->name); + + /* Then try to match against the id table */ + if (pdrv->id_table) + return platform_match_id(pdrv->id_table, pdev) != NULL; + + /* fall-back to driver name match */ + return (strcmp(pdev->name, drv->name) == 0); +} + +#ifdef CONFIG_PM +static void device_pm_init_common(struct device *dev) +{ + if (!dev->power.early_init) { + spin_lock_init(&dev->power.lock); + dev->power.qos = NULL; + dev->power.early_init = true; + } +} + +static void pm_runtime_early_init(struct device *dev) +{ + dev->power.disable_depth = 1; + device_pm_init_common(dev); +} +#else +static void pm_runtime_early_init(struct device *dev) {} +#endif + +/** + * sh_early_platform_driver_register - register early platform driver + * @epdrv: sh_early_platform driver structure + * @buf: string passed from early_param() + * + * Helper function for sh_early_platform_init() / sh_early_platform_init_buffer() + */ +int __init sh_early_platform_driver_register(struct sh_early_platform_driver *epdrv, + char *buf) +{ + char *tmp; + int n; + + /* Simply add the driver to the end of the global list. + * Drivers will by default be put on the list in compiled-in order. + */ + if (!epdrv->list.next) { + INIT_LIST_HEAD(&epdrv->list); + list_add_tail(&epdrv->list, &sh_early_platform_driver_list); + } + + /* If the user has specified device then make sure the driver + * gets prioritized. The driver of the last device specified on + * command line will be put first on the list. + */ + n = strlen(epdrv->pdrv->driver.name); + if (buf && !strncmp(buf, epdrv->pdrv->driver.name, n)) { + list_move(&epdrv->list, &sh_early_platform_driver_list); + + /* Allow passing parameters after device name */ + if (buf[n] == '\0' || buf[n] == ',') + epdrv->requested_id = -1; + else { + epdrv->requested_id = simple_strtoul(&buf[n + 1], + &tmp, 10); + + if (buf[n] != '.' || (tmp == &buf[n + 1])) { + epdrv->requested_id = EARLY_PLATFORM_ID_ERROR; + n = 0; + } else + n += strcspn(&buf[n + 1], ",") + 1; + } + + if (buf[n] == ',') + n++; + + if (epdrv->bufsize) { + memcpy(epdrv->buffer, &buf[n], + min_t(int, epdrv->bufsize, strlen(&buf[n]) + 1)); + epdrv->buffer[epdrv->bufsize - 1] = '\0'; + } + } + + return 0; +} + +/** + * sh_early_platform_add_devices - adds a number of early platform devices + * @devs: array of early platform devices to add + * @num: number of early platform devices in array + * + * Used by early architecture code to register early platform devices and + * their platform data. + */ +void __init sh_early_platform_add_devices(struct platform_device **devs, int num) +{ + struct device *dev; + int i; + + /* simply add the devices to list */ + for (i = 0; i < num; i++) { + dev = &devs[i]->dev; + + if (!dev->devres_head.next) { + pm_runtime_early_init(dev); + INIT_LIST_HEAD(&dev->devres_head); + list_add_tail(&dev->devres_head, + &sh_early_platform_device_list); + } + } +} + +/** + * sh_early_platform_driver_register_all - register early platform drivers + * @class_str: string to identify early platform driver class + * + * Used by architecture code to register all early platform drivers + * for a certain class. If omitted then only early platform drivers + * with matching kernel command line class parameters will be registered. + */ +void __init sh_early_platform_driver_register_all(char *class_str) +{ + /* The "class_str" parameter may or may not be present on the kernel + * command line. If it is present then there may be more than one + * matching parameter. + * + * Since we register our early platform drivers using early_param() + * we need to make sure that they also get registered in the case + * when the parameter is missing from the kernel command line. + * + * We use parse_early_options() to make sure the early_param() gets + * called at least once. The early_param() may be called more than + * once since the name of the preferred device may be specified on + * the kernel command line. sh_early_platform_driver_register() handles + * this case for us. + */ + parse_early_options(class_str); +} + +/** + * sh_early_platform_match - find early platform device matching driver + * @epdrv: early platform driver structure + * @id: id to match against + */ +static struct platform_device * __init +sh_early_platform_match(struct sh_early_platform_driver *epdrv, int id) +{ + struct platform_device *pd; + + list_for_each_entry(pd, &sh_early_platform_device_list, dev.devres_head) + if (platform_match(&pd->dev, &epdrv->pdrv->driver)) + if (pd->id == id) + return pd; + + return NULL; +} + +/** + * sh_early_platform_left - check if early platform driver has matching devices + * @epdrv: early platform driver structure + * @id: return true if id or above exists + */ +static int __init sh_early_platform_left(struct sh_early_platform_driver *epdrv, + int id) +{ + struct platform_device *pd; + + list_for_each_entry(pd, &sh_early_platform_device_list, dev.devres_head) + if (platform_match(&pd->dev, &epdrv->pdrv->driver)) + if (pd->id >= id) + return 1; + + return 0; +} + +/** + * sh_early_platform_driver_probe_id - probe drivers matching class_str and id + * @class_str: string to identify early platform driver class + * @id: id to match against + * @nr_probe: number of platform devices to successfully probe before exiting + */ +static int __init sh_early_platform_driver_probe_id(char *class_str, + int id, + int nr_probe) +{ + struct sh_early_platform_driver *epdrv; + struct platform_device *match; + int match_id; + int n = 0; + int left = 0; + + list_for_each_entry(epdrv, &sh_early_platform_driver_list, list) { + /* only use drivers matching our class_str */ + if (strcmp(class_str, epdrv->class_str)) + continue; + + if (id == -2) { + match_id = epdrv->requested_id; + left = 1; + + } else { + match_id = id; + left += sh_early_platform_left(epdrv, id); + + /* skip requested id */ + switch (epdrv->requested_id) { + case EARLY_PLATFORM_ID_ERROR: + case EARLY_PLATFORM_ID_UNSET: + break; + default: + if (epdrv->requested_id == id) + match_id = EARLY_PLATFORM_ID_UNSET; + } + } + + switch (match_id) { + case EARLY_PLATFORM_ID_ERROR: + pr_warn("%s: unable to parse %s parameter\n", + class_str, epdrv->pdrv->driver.name); + /* fall-through */ + case EARLY_PLATFORM_ID_UNSET: + match = NULL; + break; + default: + match = sh_early_platform_match(epdrv, match_id); + } + + if (match) { + /* + * Set up a sensible init_name to enable + * dev_name() and others to be used before the + * rest of the driver core is initialized. + */ + if (!match->dev.init_name && slab_is_available()) { + if (match->id != -1) + match->dev.init_name = + kasprintf(GFP_KERNEL, "%s.%d", + match->name, + match->id); + else + match->dev.init_name = + kasprintf(GFP_KERNEL, "%s", + match->name); + + if (!match->dev.init_name) + return -ENOMEM; + } + + if (epdrv->pdrv->probe(match)) + pr_warn("%s: unable to probe %s early.\n", + class_str, match->name); + else + n++; + } + + if (n >= nr_probe) + break; + } + + if (left) + return n; + else + return -ENODEV; +} + +/** + * sh_early_platform_driver_probe - probe a class of registered drivers + * @class_str: string to identify early platform driver class + * @nr_probe: number of platform devices to successfully probe before exiting + * @user_only: only probe user specified early platform devices + * + * Used by architecture code to probe registered early platform drivers + * within a certain class. For probe to happen a registered early platform + * device matching a registered early platform driver is needed. + */ +int __init sh_early_platform_driver_probe(char *class_str, + int nr_probe, + int user_only) +{ + int k, n, i; + + n = 0; + for (i = -2; n < nr_probe; i++) { + k = sh_early_platform_driver_probe_id(class_str, i, nr_probe - n); + + if (k < 0) + break; + + n += k; + + if (user_only) + break; + } + + return n; +} + +/** + * sh_early_platform_cleanup - clean up early platform code + */ +static int __init sh_early_platform_cleanup(void) +{ + struct platform_device *pd, *pd2; + + /* clean up the devres list used to chain devices */ + list_for_each_entry_safe(pd, pd2, &sh_early_platform_device_list, + dev.devres_head) { + list_del(&pd->dev.devres_head); + memset(&pd->dev.devres_head, 0, sizeof(pd->dev.devres_head)); + } + + return 0; +} +/* + * This must happen once after all early devices are probed but before probing + * real platform devices. + */ +subsys_initcall(sh_early_platform_cleanup); diff --git a/arch/sh/include/asm/platform_early.h b/arch/sh/include/asm/platform_early.h new file mode 100644 index 000000000000..fc802137c37d --- /dev/null +++ b/arch/sh/include/asm/platform_early.h @@ -0,0 +1,61 @@ +/* SPDX--License-Identifier: GPL-2.0 */ + +#ifndef __PLATFORM_EARLY__ +#define __PLATFORM_EARLY__ + +#include <linux/types.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> +#include <linux/slab.h> + +struct sh_early_platform_driver { + const char *class_str; + struct platform_driver *pdrv; + struct list_head list; + int requested_id; + char *buffer; + int bufsize; +}; + +#define EARLY_PLATFORM_ID_UNSET -2 +#define EARLY_PLATFORM_ID_ERROR -3 + +extern int sh_early_platform_driver_register(struct sh_early_platform_driver *epdrv, + char *buf); +extern void sh_early_platform_add_devices(struct platform_device **devs, int num); + +static inline int is_sh_early_platform_device(struct platform_device *pdev) +{ + return !pdev->dev.driver; +} + +extern void sh_early_platform_driver_register_all(char *class_str); +extern int sh_early_platform_driver_probe(char *class_str, + int nr_probe, int user_only); + +#define sh_early_platform_init(class_string, platdrv) \ + sh_early_platform_init_buffer(class_string, platdrv, NULL, 0) + +#ifndef MODULE +#define sh_early_platform_init_buffer(class_string, platdrv, buf, bufsiz) \ +static __initdata struct sh_early_platform_driver early_driver = { \ + .class_str = class_string, \ + .buffer = buf, \ + .bufsize = bufsiz, \ + .pdrv = platdrv, \ + .requested_id = EARLY_PLATFORM_ID_UNSET, \ +}; \ +static int __init sh_early_platform_driver_setup_func(char *buffer) \ +{ \ + return sh_early_platform_driver_register(&early_driver, buffer); \ +} \ +early_param(class_string, sh_early_platform_driver_setup_func) +#else /* MODULE */ +#define sh_early_platform_init_buffer(class_string, platdrv, buf, bufsiz) \ +static inline char *sh_early_platform_driver_setup_func(void) \ +{ \ + return bufsiz ? buf : NULL; \ +} +#endif /* MODULE */ + +#endif /* __PLATFORM_EARLY__ */ diff --git a/arch/sh/include/cpu-sh4/cpu/sh7734.h b/arch/sh/include/cpu-sh4/cpu/sh7734.h index 96f0246ad2f2..82b63208135a 100644 --- a/arch/sh/include/cpu-sh4/cpu/sh7734.h +++ b/arch/sh/include/cpu-sh4/cpu/sh7734.h @@ -134,7 +134,7 @@ enum { GPIO_FN_EX_WAIT1, GPIO_FN_SD1_DAT0_A, GPIO_FN_DREQ2, GPIO_FN_CAN1_TX_C, GPIO_FN_ET0_LINK_C, GPIO_FN_ET0_ETXD5_A, GPIO_FN_EX_WAIT0, GPIO_FN_TCLK1_B, - GPIO_FN_RD_WR, GPIO_FN_TCLK0, + GPIO_FN_RD_WR, GPIO_FN_TCLK0, GPIO_FN_CAN_CLK_B, GPIO_FN_ET0_ETXD4, GPIO_FN_EX_CS5, GPIO_FN_SD1_CMD_A, GPIO_FN_ATADIR, GPIO_FN_QSSL_B, GPIO_FN_ET0_ETXD3_A, GPIO_FN_EX_CS4, GPIO_FN_SD1_WP_A, GPIO_FN_ATAWR, GPIO_FN_QMI_QIO1_B, diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index f5b6841ef7e1..b1c877b6a420 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -12,6 +12,7 @@ #include <linux/sh_eth.h> #include <linux/sh_timer.h> #include <linux/io.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -199,6 +200,6 @@ void __init plat_early_device_setup(void) /* enable CMT clock */ __raw_writeb(__raw_readb(STBCR3) & ~0x10, STBCR3); - early_platform_add_devices(sh7619_early_devices, + sh_early_platform_add_devices(sh7619_early_devices, ARRAY_SIZE(sh7619_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c index 52350ad0b0a2..cefa07924c16 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c +++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c @@ -9,6 +9,7 @@ #include <linux/serial.h> #include <linux/serial_sci.h> #include <linux/sh_timer.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -169,6 +170,6 @@ static struct platform_device *mxg_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(mxg_early_devices, + sh_early_platform_add_devices(mxg_early_devices, ARRAY_SIZE(mxg_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c index b51ed761ae08..28f1bebf3405 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c @@ -11,6 +11,7 @@ #include <linux/serial_sci.h> #include <linux/sh_timer.h> #include <linux/io.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -412,6 +413,6 @@ void __init plat_early_device_setup(void) /* enable MTU2 clock */ __raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3); - early_platform_add_devices(sh7201_early_devices, + sh_early_platform_add_devices(sh7201_early_devices, ARRAY_SIZE(sh7201_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index 89b3e49fc250..4839f3aaeb4c 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -10,6 +10,7 @@ #include <linux/serial_sci.h> #include <linux/sh_timer.h> #include <linux/io.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -349,6 +350,6 @@ void __init plat_early_device_setup(void) /* enable MTU2 clock */ __raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3); - early_platform_add_devices(sh7203_early_devices, + sh_early_platform_add_devices(sh7203_early_devices, ARRAY_SIZE(sh7203_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index 36ff3a3139da..68add5af4cc5 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -11,6 +11,7 @@ #include <linux/serial_sci.h> #include <linux/sh_timer.h> #include <linux/io.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -285,6 +286,6 @@ void __init plat_early_device_setup(void) /* enable MTU2 clock */ __raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3); - early_platform_add_devices(sh7206_early_devices, + sh_early_platform_add_devices(sh7206_early_devices, ARRAY_SIZE(sh7206_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7264.c b/arch/sh/kernel/cpu/sh2a/setup-sh7264.c index d199618d877c..8a1cb613dd2e 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7264.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7264.c @@ -11,6 +11,7 @@ #include <linux/usb/r8a66597.h> #include <linux/sh_timer.h> #include <linux/io.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -546,6 +547,6 @@ static struct platform_device *sh7264_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7264_early_devices, + sh_early_platform_add_devices(sh7264_early_devices, ARRAY_SIZE(sh7264_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7269.c b/arch/sh/kernel/cpu/sh2a/setup-sh7269.c index 9095c960b455..8b1ef3028320 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7269.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7269.c @@ -12,6 +12,7 @@ #include <linux/usb/r8a66597.h> #include <linux/sh_timer.h> #include <linux/io.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -562,6 +563,6 @@ static struct platform_device *sh7269_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7269_early_devices, + sh_early_platform_add_devices(sh7269_early_devices, ARRAY_SIZE(sh7269_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh3/setup-sh3.c b/arch/sh/kernel/cpu/sh3/setup-sh3.c index 8058c01cf09d..cf2a3f09fee4 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh3.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh3.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/irq.h> #include <linux/io.h> +#include <asm/platform_early.h> /* All SH3 devices are equipped with IRQ0->5 (except sh7708) */ diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c index e19d1ce7b6ad..0544134b3f20 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c @@ -14,6 +14,7 @@ #include <linux/sh_intc.h> #include <asm/rtc.h> #include <cpu/serial.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -178,7 +179,7 @@ static struct platform_device *sh7705_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7705_early_devices, + sh_early_platform_add_devices(sh7705_early_devices, ARRAY_SIZE(sh7705_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c index 5c5144bee6bc..4947f57748bc 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c @@ -18,6 +18,7 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <cpu/serial.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -230,7 +231,7 @@ static struct platform_device *sh770x_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh770x_early_devices, + sh_early_platform_add_devices(sh770x_early_devices, ARRAY_SIZE(sh770x_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c index 4776e2495738..381910761579 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c @@ -13,6 +13,7 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <asm/rtc.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -177,7 +178,7 @@ static struct platform_device *sh7710_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7710_early_devices, + sh_early_platform_add_devices(sh7710_early_devices, ARRAY_SIZE(sh7710_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c index 1d4c34e7b7db..425d067dae9b 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c @@ -19,6 +19,7 @@ #include <linux/sh_intc.h> #include <linux/usb/ohci_pdriver.h> #include <asm/rtc.h> +#include <asm/platform_early.h> #include <cpu/serial.h> static struct resource rtc_resources[] = { @@ -211,7 +212,7 @@ static struct platform_device *sh7720_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7720_early_devices, + sh_early_platform_add_devices(sh7720_early_devices, ARRAY_SIZE(sh7720_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c index a40ef35d101a..e6737f3d0df2 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c @@ -12,6 +12,7 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <linux/io.h> +#include <asm/platform_early.h> static struct plat_sci_port scif0_platform_data = { .scscr = SCSCR_REIE, @@ -76,7 +77,7 @@ static struct platform_device *sh4202_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh4202_early_devices, + sh_early_platform_add_devices(sh4202_early_devices, ARRAY_SIZE(sh4202_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c index b37bda66a532..19c8f1d69071 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c @@ -13,6 +13,7 @@ #include <linux/sh_intc.h> #include <linux/serial_sci.h> #include <generated/machtypes.h> +#include <asm/platform_early.h> static struct resource rtc_resources[] = { [0] = { @@ -161,15 +162,15 @@ void __init plat_early_device_setup(void) if (mach_is_rts7751r2d()) { scif_platform_data.scscr |= SCSCR_CKE1; dev[0] = &scif_device; - early_platform_add_devices(dev, 1); + sh_early_platform_add_devices(dev, 1); } else { dev[0] = &sci_device; - early_platform_add_devices(dev, 1); + sh_early_platform_add_devices(dev, 1); dev[0] = &scif_device; - early_platform_add_devices(dev, 1); + sh_early_platform_add_devices(dev, 1); } - early_platform_add_devices(sh7750_early_devices, + sh_early_platform_add_devices(sh7750_early_devices, ARRAY_SIZE(sh7750_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c index 86845da85997..14212f5d803c 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c @@ -11,6 +11,7 @@ #include <linux/sh_intc.h> #include <linux/serial_sci.h> #include <linux/io.h> +#include <asm/platform_early.h> enum { UNUSED = 0, @@ -271,7 +272,7 @@ static struct platform_device *sh7760_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7760_early_devices, + sh_early_platform_add_devices(sh7760_early_devices, ARRAY_SIZE(sh7760_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index a15e25690b5f..b6015188fab1 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -12,6 +12,7 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <asm/clock.h> +#include <asm/platform_early.h> /* Serial */ static struct plat_sci_port scif0_platform_data = { @@ -296,7 +297,7 @@ static struct platform_device *sh7343_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7343_early_devices, + sh_early_platform_add_devices(sh7343_early_devices, ARRAY_SIZE(sh7343_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index 7bd2776441ba..6676beef053e 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -15,6 +15,7 @@ #include <linux/sh_intc.h> #include <linux/usb/r8a66597.h> #include <asm/clock.h> +#include <asm/platform_early.h> static struct plat_sci_port scif0_platform_data = { .scscr = SCSCR_REIE, @@ -240,7 +241,7 @@ static struct platform_device *sh7366_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7366_early_devices, + sh_early_platform_add_devices(sh7366_early_devices, ARRAY_SIZE(sh7366_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index 1ce65f88f060..0c6757ef63f4 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -18,6 +18,7 @@ #include <asm/clock.h> #include <asm/mmzone.h> #include <asm/siu.h> +#include <asm/platform_early.h> #include <cpu/dma-register.h> #include <cpu/sh7722.h> @@ -512,7 +513,7 @@ static struct platform_device *sh7722_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7722_early_devices, + sh_early_platform_add_devices(sh7722_early_devices, ARRAY_SIZE(sh7722_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index edb649950662..83ae1ad4a86e 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -16,6 +16,7 @@ #include <linux/io.h> #include <asm/clock.h> #include <asm/mmzone.h> +#include <asm/platform_early.h> #include <cpu/sh7723.h> /* Serial */ @@ -410,7 +411,7 @@ static struct platform_device *sh7723_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7723_early_devices, + sh_early_platform_add_devices(sh7723_early_devices, ARRAY_SIZE(sh7723_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 3e9825031d3d..0d990ab1ba2a 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -24,6 +24,7 @@ #include <asm/suspend.h> #include <asm/clock.h> #include <asm/mmzone.h> +#include <asm/platform_early.h> #include <cpu/dma-register.h> #include <cpu/sh7724.h> @@ -830,7 +831,7 @@ static struct platform_device *sh7724_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7724_early_devices, + sh_early_platform_add_devices(sh7724_early_devices, ARRAY_SIZE(sh7724_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7734.c b/arch/sh/kernel/cpu/sh4a/setup-sh7734.c index 06a91569697a..9911da794358 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7734.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7734.c @@ -18,6 +18,7 @@ #include <linux/io.h> #include <asm/clock.h> #include <asm/irq.h> +#include <asm/platform_early.h> #include <cpu/sh7734.h> /* SCIF */ @@ -280,7 +281,7 @@ static struct platform_device *sh7734_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7734_early_devices, + sh_early_platform_add_devices(sh7734_early_devices, ARRAY_SIZE(sh7734_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c index 2501ce656511..67e330b7ea46 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c @@ -19,6 +19,7 @@ #include <linux/usb/ohci_pdriver.h> #include <cpu/dma-register.h> #include <cpu/sh7757.h> +#include <asm/platform_early.h> static struct plat_sci_port scif2_platform_data = { .scscr = SCSCR_REIE, @@ -767,7 +768,7 @@ static struct platform_device *sh7757_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7757_early_devices, + sh_early_platform_add_devices(sh7757_early_devices, ARRAY_SIZE(sh7757_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c index 419c5efe4a17..b0608664785f 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c @@ -14,6 +14,7 @@ #include <linux/io.h> #include <linux/serial_sci.h> #include <linux/usb/ohci_pdriver.h> +#include <asm/platform_early.h> static struct plat_sci_port scif0_platform_data = { .scscr = SCSCR_REIE, @@ -221,7 +222,7 @@ static struct platform_device *sh7763_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7763_early_devices, + sh_early_platform_add_devices(sh7763_early_devices, ARRAY_SIZE(sh7763_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c index 5fb4cf9b58c6..5efec6ceb04d 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c @@ -11,6 +11,7 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <linux/io.h> +#include <asm/platform_early.h> static struct plat_sci_port scif0_platform_data = { .scscr = SCSCR_REIE | SCSCR_TOIE, @@ -316,7 +317,7 @@ static struct platform_device *sh7770_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7770_early_devices, + sh_early_platform_add_devices(sh7770_early_devices, ARRAY_SIZE(sh7770_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c index ab7d6b715865..c818b788ecb0 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c @@ -13,6 +13,7 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <cpu/dma-register.h> +#include <asm/platform_early.h> static struct plat_sci_port scif0_platform_data = { .scscr = SCSCR_REIE | SCSCR_CKE1, @@ -285,7 +286,7 @@ void __init plat_early_device_setup(void) scif1_platform_data.scscr &= ~SCSCR_CKE1; } - early_platform_add_devices(sh7780_early_devices, + sh_early_platform_add_devices(sh7780_early_devices, ARRAY_SIZE(sh7780_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c index a438da47285d..3b4a414d60a9 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c @@ -14,6 +14,7 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <asm/mmzone.h> +#include <asm/platform_early.h> #include <cpu/dma-register.h> static struct plat_sci_port scif0_platform_data = { @@ -353,7 +354,7 @@ static struct platform_device *sh7785_early_devices[] __initdata = { void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7785_early_devices, + sh_early_platform_add_devices(sh7785_early_devices, ARRAY_SIZE(sh7785_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c index d894165a0ef6..4b0db8259e3d 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c @@ -23,6 +23,7 @@ #include <linux/usb/ohci_pdriver.h> #include <cpu/dma-register.h> #include <asm/mmzone.h> +#include <asm/platform_early.h> static struct plat_sci_port scif0_platform_data = { .scscr = SCSCR_REIE | SCSCR_CKE1, @@ -834,6 +835,6 @@ arch_initcall(sh7786_devices_setup); void __init plat_early_device_setup(void) { - early_platform_add_devices(sh7786_early_devices, + sh_early_platform_add_devices(sh7786_early_devices, ARRAY_SIZE(sh7786_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c index 14aa4552bc45..7014d6d199b3 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c @@ -14,6 +14,7 @@ #include <linux/sh_intc.h> #include <cpu/shx3.h> #include <asm/mmzone.h> +#include <asm/platform_early.h> /* * This intentionally only registers SCIF ports 0, 1, and 3. SCIF 2 @@ -152,7 +153,7 @@ arch_initcall(shx3_devices_setup); void __init plat_early_device_setup(void) { - early_platform_add_devices(shx3_early_devices, + sh_early_platform_add_devices(shx3_early_devices, ARRAY_SIZE(shx3_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c index 41c1673afc0b..dc8476d67244 100644 --- a/arch/sh/kernel/cpu/sh5/setup-sh5.c +++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/sh_timer.h> #include <asm/addrspace.h> +#include <asm/platform_early.h> static struct plat_sci_port scif0_platform_data = { .flags = UPF_IOREMAP, @@ -115,6 +116,6 @@ arch_initcall(sh5_devices_setup); void __init plat_early_device_setup(void) { - early_platform_add_devices(sh5_early_devices, + sh_early_platform_add_devices(sh5_early_devices, ARRAY_SIZE(sh5_early_devices)); } diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index bacad6da4fe4..60c828a2b8a2 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -99,7 +99,7 @@ int register_trapped_io(struct trapped_io *tiop) return 0; bad: - pr_warning("unable to install trapped io filter\n"); + pr_warn("unable to install trapped io filter\n"); return -1; } EXPORT_SYMBOL_GPL(register_trapped_io); diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 2c0e0f37a318..d232cfa01877 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -44,6 +44,7 @@ #include <asm/mmu_context.h> #include <asm/mmzone.h> #include <asm/sparsemem.h> +#include <asm/platform_early.h> /* * Initialize loops_per_jiffy as 10000000 (1000MIPS). @@ -328,7 +329,7 @@ void __init setup_arch(char **cmdline_p) sh_mv_setup(); /* Let earlyprintk output early console messages */ - early_platform_driver_probe("earlyprintk", 1, 1); + sh_early_platform_driver_probe("earlyprintk", 1, 1); #ifdef CONFIG_OF_FLATTREE #ifdef CONFIG_USE_BUILTIN_DTB @@ -354,7 +355,7 @@ void __init setup_arch(char **cmdline_p) /* processor boot mode configuration */ int generic_mode_pins(void) { - pr_warning("generic_mode_pins(): missing mode pin configuration\n"); + pr_warn("generic_mode_pins(): missing mode pin configuration\n"); return 0; } diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index e16b2cd269a3..821a09cbd605 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -18,6 +18,7 @@ #include <linux/rtc.h> #include <asm/clock.h> #include <asm/rtc.h> +#include <asm/platform_early.h> static void __init sh_late_time_init(void) { @@ -30,8 +31,8 @@ static void __init sh_late_time_init(void) * clocksource and the jiffies clocksource is used transparently * instead. No error handling is necessary here. */ - early_platform_driver_register_all("earlytimer"); - early_platform_driver_probe("earlytimer", 2, 0); + sh_early_platform_driver_register_all("earlytimer"); + sh_early_platform_driver_probe("earlytimer", 2, 0); } void __init time_init(void) diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 77a59d8c6b4d..c60b19958c35 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -48,11 +48,10 @@ SECTIONS } = 0x0009 EXCEPTION_TABLE(16) - NOTES _sdata = .; RO_DATA(PAGE_SIZE) - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; DWARF_EH_FRAME diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c index 792f36129062..3169a343a5ab 100644 --- a/arch/sh/mm/consistent.c +++ b/arch/sh/mm/consistent.c @@ -43,8 +43,7 @@ int __init platform_resource_setup_memory(struct platform_device *pdev, r = pdev->resource + pdev->num_resources - 1; if (r->flags) { - pr_warning("%s: unable to find empty space for resource\n", - name); + pr_warn("%s: unable to find empty space for resource\n", name); return -EINVAL; } @@ -54,7 +53,7 @@ int __init platform_resource_setup_memory(struct platform_device *pdev, buf = dma_alloc_coherent(&pdev->dev, memsize, &dma_handle, GFP_KERNEL); if (!buf) { - pr_warning("%s: unable to allocate memory\n", name); + pr_warn("%s: unable to allocate memory\n", name); return -ENOMEM; } diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c index 7b946b3dee9d..0f5a501c95a9 100644 --- a/arch/sparc/crypto/aes_glue.c +++ b/arch/sparc/crypto/aes_glue.c @@ -24,6 +24,7 @@ #include <linux/types.h> #include <crypto/algapi.h> #include <crypto/aes.h> +#include <crypto/internal/skcipher.h> #include <asm/fpumacro.h> #include <asm/pstate.h> @@ -197,6 +198,12 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return 0; } +static int aes_set_key_skcipher(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + return aes_set_key(crypto_skcipher_tfm(tfm), in_key, key_len); +} + static void crypto_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm); @@ -211,131 +218,108 @@ static void crypto_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) ctx->ops->decrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst); } -#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) - -static int ecb_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_sparc64_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; ctx->ops->load_encrypt_keys(&ctx->key[0]); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & AES_BLOCK_MASK; - - if (likely(block_len)) { - ctx->ops->ecb_encrypt(&ctx->key[0], - (const u64 *)walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, - block_len); - } - nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + ctx->ops->ecb_encrypt(&ctx->key[0], walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, AES_BLOCK_SIZE)); + err = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE); } fprs_write(0); return err; } -static int ecb_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - u64 *key_end; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_sparc64_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + const u64 *key_end; + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; ctx->ops->load_decrypt_keys(&ctx->key[0]); key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)]; - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & AES_BLOCK_MASK; - - if (likely(block_len)) { - ctx->ops->ecb_decrypt(key_end, - (const u64 *) walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, block_len); - } - nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + ctx->ops->ecb_decrypt(key_end, walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, AES_BLOCK_SIZE)); + err = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE); } fprs_write(0); return err; } -static int cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_sparc64_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; ctx->ops->load_encrypt_keys(&ctx->key[0]); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & AES_BLOCK_MASK; - - if (likely(block_len)) { - ctx->ops->cbc_encrypt(&ctx->key[0], - (const u64 *)walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, - block_len, (u64 *) walk.iv); - } - nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + ctx->ops->cbc_encrypt(&ctx->key[0], walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, AES_BLOCK_SIZE), + walk.iv); + err = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE); } fprs_write(0); return err; } -static int cbc_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - u64 *key_end; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_sparc64_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + const u64 *key_end; + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; ctx->ops->load_decrypt_keys(&ctx->key[0]); key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)]; - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & AES_BLOCK_MASK; - - if (likely(block_len)) { - ctx->ops->cbc_decrypt(key_end, - (const u64 *) walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, - block_len, (u64 *) walk.iv); - } - nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + ctx->ops->cbc_decrypt(key_end, walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, AES_BLOCK_SIZE), + walk.iv); + err = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE); } fprs_write(0); return err; } -static void ctr_crypt_final(struct crypto_sparc64_aes_ctx *ctx, - struct blkcipher_walk *walk) +static void ctr_crypt_final(const struct crypto_sparc64_aes_ctx *ctx, + struct skcipher_walk *walk) { u8 *ctrblk = walk->iv; u64 keystream[AES_BLOCK_SIZE / sizeof(u64)]; @@ -349,40 +333,35 @@ static void ctr_crypt_final(struct crypto_sparc64_aes_ctx *ctx, crypto_inc(ctrblk, AES_BLOCK_SIZE); } -static int ctr_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_sparc64_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; ctx->ops->load_encrypt_keys(&ctx->key[0]); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - unsigned int block_len = nbytes & AES_BLOCK_MASK; - - if (likely(block_len)) { - ctx->ops->ctr_crypt(&ctx->key[0], - (const u64 *)walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, - block_len, (u64 *) walk.iv); - } - nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + ctx->ops->ctr_crypt(&ctx->key[0], walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, AES_BLOCK_SIZE), + walk.iv); + err = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE); } if (walk.nbytes) { ctr_crypt_final(ctx, &walk); - err = blkcipher_walk_done(desc, &walk, 0); + err = skcipher_walk_done(&walk, 0); } fprs_write(0); return err; } -static struct crypto_alg algs[] = { { +static struct crypto_alg cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-sparc64", .cra_priority = SPARC_CR_OPCODE_PRIORITY, @@ -400,66 +379,53 @@ static struct crypto_alg algs[] = { { .cia_decrypt = crypto_aes_decrypt } } -}, { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_set_key, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -} }; +}; + +static struct skcipher_alg skcipher_algs[] = { + { + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-aes-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-aes-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aes_set_key_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aes_set_key_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + .chunksize = AES_BLOCK_SIZE, + } +}; static bool __init sparc64_has_aes_opcode(void) { @@ -477,17 +443,27 @@ static bool __init sparc64_has_aes_opcode(void) static int __init aes_sparc64_mod_init(void) { - if (sparc64_has_aes_opcode()) { - pr_info("Using sparc64 aes opcodes optimized AES implementation\n"); - return crypto_register_algs(algs, ARRAY_SIZE(algs)); + int err; + + if (!sparc64_has_aes_opcode()) { + pr_info("sparc64 aes opcodes not available.\n"); + return -ENODEV; } - pr_info("sparc64 aes opcodes not available.\n"); - return -ENODEV; + pr_info("Using sparc64 aes opcodes optimized AES implementation\n"); + err = crypto_register_alg(&cipher_alg); + if (err) + return err; + err = crypto_register_skciphers(skcipher_algs, + ARRAY_SIZE(skcipher_algs)); + if (err) + crypto_unregister_alg(&cipher_alg); + return err; } static void __exit aes_sparc64_mod_fini(void) { - crypto_unregister_algs(algs, ARRAY_SIZE(algs)); + crypto_unregister_alg(&cipher_alg); + crypto_unregister_skciphers(skcipher_algs, ARRAY_SIZE(skcipher_algs)); } module_init(aes_sparc64_mod_init); diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c index 3823f9491a72..1700f863748c 100644 --- a/arch/sparc/crypto/camellia_glue.c +++ b/arch/sparc/crypto/camellia_glue.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/types.h> #include <crypto/algapi.h> +#include <crypto/internal/skcipher.h> #include <asm/fpumacro.h> #include <asm/pstate.h> @@ -52,6 +53,12 @@ static int camellia_set_key(struct crypto_tfm *tfm, const u8 *_in_key, return 0; } +static int camellia_set_key_skcipher(struct crypto_skcipher *tfm, + const u8 *in_key, unsigned int key_len) +{ + return camellia_set_key(crypto_skcipher_tfm(tfm), in_key, key_len); +} + extern void camellia_sparc64_crypt(const u64 *key, const u32 *input, u32 *output, unsigned int key_len); @@ -81,61 +88,46 @@ typedef void ecb_crypt_op(const u64 *input, u64 *output, unsigned int len, extern ecb_crypt_op camellia_sparc64_ecb_crypt_3_grand_rounds; extern ecb_crypt_op camellia_sparc64_ecb_crypt_4_grand_rounds; -#define CAMELLIA_BLOCK_MASK (~(CAMELLIA_BLOCK_SIZE - 1)) - -static int __ecb_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes, bool encrypt) +static int __ecb_crypt(struct skcipher_request *req, bool encrypt) { - struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct camellia_sparc64_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; ecb_crypt_op *op; const u64 *key; + unsigned int nbytes; int err; op = camellia_sparc64_ecb_crypt_3_grand_rounds; if (ctx->key_len != 16) op = camellia_sparc64_ecb_crypt_4_grand_rounds; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; if (encrypt) key = &ctx->encrypt_key[0]; else key = &ctx->decrypt_key[0]; camellia_sparc64_load_keys(key, ctx->key_len); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK; - - if (likely(block_len)) { - const u64 *src64; - u64 *dst64; - - src64 = (const u64 *)walk.src.virt.addr; - dst64 = (u64 *) walk.dst.virt.addr; - op(src64, dst64, block_len, key); - } - nbytes &= CAMELLIA_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + op(walk.src.virt.addr, walk.dst.virt.addr, + round_down(nbytes, CAMELLIA_BLOCK_SIZE), key); + err = skcipher_walk_done(&walk, nbytes % CAMELLIA_BLOCK_SIZE); } fprs_write(0); return err; } -static int ecb_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return __ecb_crypt(desc, dst, src, nbytes, true); + return __ecb_crypt(req, true); } -static int ecb_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return __ecb_crypt(desc, dst, src, nbytes, false); + return __ecb_crypt(req, false); } typedef void cbc_crypt_op(const u64 *input, u64 *output, unsigned int len, @@ -146,85 +138,65 @@ extern cbc_crypt_op camellia_sparc64_cbc_encrypt_4_grand_rounds; extern cbc_crypt_op camellia_sparc64_cbc_decrypt_3_grand_rounds; extern cbc_crypt_op camellia_sparc64_cbc_decrypt_4_grand_rounds; -static int cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct camellia_sparc64_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; cbc_crypt_op *op; const u64 *key; + unsigned int nbytes; int err; op = camellia_sparc64_cbc_encrypt_3_grand_rounds; if (ctx->key_len != 16) op = camellia_sparc64_cbc_encrypt_4_grand_rounds; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; key = &ctx->encrypt_key[0]; camellia_sparc64_load_keys(key, ctx->key_len); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK; - - if (likely(block_len)) { - const u64 *src64; - u64 *dst64; - - src64 = (const u64 *)walk.src.virt.addr; - dst64 = (u64 *) walk.dst.virt.addr; - op(src64, dst64, block_len, key, - (u64 *) walk.iv); - } - nbytes &= CAMELLIA_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + op(walk.src.virt.addr, walk.dst.virt.addr, + round_down(nbytes, CAMELLIA_BLOCK_SIZE), key, walk.iv); + err = skcipher_walk_done(&walk, nbytes % CAMELLIA_BLOCK_SIZE); } fprs_write(0); return err; } -static int cbc_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct camellia_sparc64_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; cbc_crypt_op *op; const u64 *key; + unsigned int nbytes; int err; op = camellia_sparc64_cbc_decrypt_3_grand_rounds; if (ctx->key_len != 16) op = camellia_sparc64_cbc_decrypt_4_grand_rounds; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; key = &ctx->decrypt_key[0]; camellia_sparc64_load_keys(key, ctx->key_len); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK; - - if (likely(block_len)) { - const u64 *src64; - u64 *dst64; - - src64 = (const u64 *)walk.src.virt.addr; - dst64 = (u64 *) walk.dst.virt.addr; - op(src64, dst64, block_len, key, - (u64 *) walk.iv); - } - nbytes &= CAMELLIA_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + op(walk.src.virt.addr, walk.dst.virt.addr, + round_down(nbytes, CAMELLIA_BLOCK_SIZE), key, walk.iv); + err = skcipher_walk_done(&walk, nbytes % CAMELLIA_BLOCK_SIZE); } fprs_write(0); return err; } -static struct crypto_alg algs[] = { { +static struct crypto_alg cipher_alg = { .cra_name = "camellia", .cra_driver_name = "camellia-sparc64", .cra_priority = SPARC_CR_OPCODE_PRIORITY, @@ -242,46 +214,37 @@ static struct crypto_alg algs[] = { { .cia_decrypt = camellia_decrypt } } -}, { - .cra_name = "ecb(camellia)", - .cra_driver_name = "ecb-camellia-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_sparc64_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = camellia_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "cbc(camellia)", - .cra_driver_name = "cbc-camellia-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_sparc64_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -} +}; + +static struct skcipher_alg skcipher_algs[] = { + { + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_sparc64_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_set_key_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_sparc64_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_set_key_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + } }; static bool __init sparc64_has_camellia_opcode(void) @@ -300,17 +263,27 @@ static bool __init sparc64_has_camellia_opcode(void) static int __init camellia_sparc64_mod_init(void) { - if (sparc64_has_camellia_opcode()) { - pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n"); - return crypto_register_algs(algs, ARRAY_SIZE(algs)); + int err; + + if (!sparc64_has_camellia_opcode()) { + pr_info("sparc64 camellia opcodes not available.\n"); + return -ENODEV; } - pr_info("sparc64 camellia opcodes not available.\n"); - return -ENODEV; + pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n"); + err = crypto_register_alg(&cipher_alg); + if (err) + return err; + err = crypto_register_skciphers(skcipher_algs, + ARRAY_SIZE(skcipher_algs)); + if (err) + crypto_unregister_alg(&cipher_alg); + return err; } static void __exit camellia_sparc64_mod_fini(void) { - crypto_unregister_algs(algs, ARRAY_SIZE(algs)); + crypto_unregister_alg(&cipher_alg); + crypto_unregister_skciphers(skcipher_algs, ARRAY_SIZE(skcipher_algs)); } module_init(camellia_sparc64_mod_init); diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c index db6010b4e52e..a499102bf706 100644 --- a/arch/sparc/crypto/des_glue.c +++ b/arch/sparc/crypto/des_glue.c @@ -13,6 +13,7 @@ #include <linux/types.h> #include <crypto/algapi.h> #include <crypto/internal/des.h> +#include <crypto/internal/skcipher.h> #include <asm/fpumacro.h> #include <asm/pstate.h> @@ -61,6 +62,12 @@ static int des_set_key(struct crypto_tfm *tfm, const u8 *key, return 0; } +static int des_set_key_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + return des_set_key(crypto_skcipher_tfm(tfm), key, keylen); +} + extern void des_sparc64_crypt(const u64 *key, const u64 *input, u64 *output); @@ -85,113 +92,90 @@ extern void des_sparc64_load_keys(const u64 *key); extern void des_sparc64_ecb_crypt(const u64 *input, u64 *output, unsigned int len); -#define DES_BLOCK_MASK (~(DES_BLOCK_SIZE - 1)) - -static int __ecb_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes, bool encrypt) +static int __ecb_crypt(struct skcipher_request *req, bool encrypt) { - struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct des_sparc64_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; if (encrypt) des_sparc64_load_keys(&ctx->encrypt_expkey[0]); else des_sparc64_load_keys(&ctx->decrypt_expkey[0]); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & DES_BLOCK_MASK; - - if (likely(block_len)) { - des_sparc64_ecb_crypt((const u64 *)walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, - block_len); - } - nbytes &= DES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + des_sparc64_ecb_crypt(walk.src.virt.addr, walk.dst.virt.addr, + round_down(nbytes, DES_BLOCK_SIZE)); + err = skcipher_walk_done(&walk, nbytes % DES_BLOCK_SIZE); } fprs_write(0); return err; } -static int ecb_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return __ecb_crypt(desc, dst, src, nbytes, true); + return __ecb_crypt(req, true); } -static int ecb_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return __ecb_crypt(desc, dst, src, nbytes, false); + return __ecb_crypt(req, false); } extern void des_sparc64_cbc_encrypt(const u64 *input, u64 *output, unsigned int len, u64 *iv); -static int cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +extern void des_sparc64_cbc_decrypt(const u64 *input, u64 *output, + unsigned int len, u64 *iv); + +static int __cbc_crypt(struct skcipher_request *req, bool encrypt) { - struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct des_sparc64_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - des_sparc64_load_keys(&ctx->encrypt_expkey[0]); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & DES_BLOCK_MASK; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; - if (likely(block_len)) { - des_sparc64_cbc_encrypt((const u64 *)walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, - block_len, (u64 *) walk.iv); - } - nbytes &= DES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + if (encrypt) + des_sparc64_load_keys(&ctx->encrypt_expkey[0]); + else + des_sparc64_load_keys(&ctx->decrypt_expkey[0]); + while ((nbytes = walk.nbytes) != 0) { + if (encrypt) + des_sparc64_cbc_encrypt(walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, + DES_BLOCK_SIZE), + walk.iv); + else + des_sparc64_cbc_decrypt(walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, + DES_BLOCK_SIZE), + walk.iv); + err = skcipher_walk_done(&walk, nbytes % DES_BLOCK_SIZE); } fprs_write(0); return err; } -extern void des_sparc64_cbc_decrypt(const u64 *input, u64 *output, - unsigned int len, u64 *iv); - -static int cbc_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - des_sparc64_load_keys(&ctx->decrypt_expkey[0]); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & DES_BLOCK_MASK; + return __cbc_crypt(req, true); +} - if (likely(block_len)) { - des_sparc64_cbc_decrypt((const u64 *)walk.src.virt.addr, - (u64 *) walk.dst.virt.addr, - block_len, (u64 *) walk.iv); - } - nbytes &= DES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); - } - fprs_write(0); - return err; +static int cbc_decrypt(struct skcipher_request *req) +{ + return __cbc_crypt(req, false); } static int des3_ede_set_key(struct crypto_tfm *tfm, const u8 *key, @@ -227,6 +211,12 @@ static int des3_ede_set_key(struct crypto_tfm *tfm, const u8 *key, return 0; } +static int des3_ede_set_key_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + return des3_ede_set_key(crypto_skcipher_tfm(tfm), key, keylen); +} + extern void des3_ede_sparc64_crypt(const u64 *key, const u64 *input, u64 *output); @@ -251,241 +241,196 @@ extern void des3_ede_sparc64_load_keys(const u64 *key); extern void des3_ede_sparc64_ecb_crypt(const u64 *expkey, const u64 *input, u64 *output, unsigned int len); -static int __ecb3_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes, bool encrypt) +static int __ecb3_crypt(struct skcipher_request *req, bool encrypt) { - struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct des3_ede_sparc64_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; const u64 *K; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; if (encrypt) K = &ctx->encrypt_expkey[0]; else K = &ctx->decrypt_expkey[0]; des3_ede_sparc64_load_keys(K); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & DES_BLOCK_MASK; - - if (likely(block_len)) { - const u64 *src64 = (const u64 *)walk.src.virt.addr; - des3_ede_sparc64_ecb_crypt(K, src64, - (u64 *) walk.dst.virt.addr, - block_len); - } - nbytes &= DES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + des3_ede_sparc64_ecb_crypt(K, walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, DES_BLOCK_SIZE)); + err = skcipher_walk_done(&walk, nbytes % DES_BLOCK_SIZE); } fprs_write(0); return err; } -static int ecb3_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb3_encrypt(struct skcipher_request *req) { - return __ecb3_crypt(desc, dst, src, nbytes, true); + return __ecb3_crypt(req, true); } -static int ecb3_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb3_decrypt(struct skcipher_request *req) { - return __ecb3_crypt(desc, dst, src, nbytes, false); + return __ecb3_crypt(req, false); } extern void des3_ede_sparc64_cbc_encrypt(const u64 *expkey, const u64 *input, u64 *output, unsigned int len, u64 *iv); -static int cbc3_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; - const u64 *K; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - K = &ctx->encrypt_expkey[0]; - des3_ede_sparc64_load_keys(K); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & DES_BLOCK_MASK; - - if (likely(block_len)) { - const u64 *src64 = (const u64 *)walk.src.virt.addr; - des3_ede_sparc64_cbc_encrypt(K, src64, - (u64 *) walk.dst.virt.addr, - block_len, - (u64 *) walk.iv); - } - nbytes &= DES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); - } - fprs_write(0); - return err; -} - extern void des3_ede_sparc64_cbc_decrypt(const u64 *expkey, const u64 *input, u64 *output, unsigned int len, u64 *iv); -static int cbc3_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int __cbc3_crypt(struct skcipher_request *req, bool encrypt) { - struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct des3_ede_sparc64_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; const u64 *K; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; - K = &ctx->decrypt_expkey[0]; + if (encrypt) + K = &ctx->encrypt_expkey[0]; + else + K = &ctx->decrypt_expkey[0]; des3_ede_sparc64_load_keys(K); - while ((nbytes = walk.nbytes)) { - unsigned int block_len = nbytes & DES_BLOCK_MASK; - - if (likely(block_len)) { - const u64 *src64 = (const u64 *)walk.src.virt.addr; - des3_ede_sparc64_cbc_decrypt(K, src64, - (u64 *) walk.dst.virt.addr, - block_len, - (u64 *) walk.iv); - } - nbytes &= DES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + while ((nbytes = walk.nbytes) != 0) { + if (encrypt) + des3_ede_sparc64_cbc_encrypt(K, walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, + DES_BLOCK_SIZE), + walk.iv); + else + des3_ede_sparc64_cbc_decrypt(K, walk.src.virt.addr, + walk.dst.virt.addr, + round_down(nbytes, + DES_BLOCK_SIZE), + walk.iv); + err = skcipher_walk_done(&walk, nbytes % DES_BLOCK_SIZE); } fprs_write(0); return err; } -static struct crypto_alg algs[] = { { - .cra_name = "des", - .cra_driver_name = "des-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des_sparc64_ctx), - .cra_alignmask = 7, - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = DES_KEY_SIZE, - .cia_max_keysize = DES_KEY_SIZE, - .cia_setkey = des_set_key, - .cia_encrypt = sparc_des_encrypt, - .cia_decrypt = sparc_des_decrypt +static int cbc3_encrypt(struct skcipher_request *req) +{ + return __cbc3_crypt(req, true); +} + +static int cbc3_decrypt(struct skcipher_request *req) +{ + return __cbc3_crypt(req, false); +} + +static struct crypto_alg cipher_algs[] = { + { + .cra_name = "des", + .cra_driver_name = "des-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des_sparc64_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES_KEY_SIZE, + .cia_max_keysize = DES_KEY_SIZE, + .cia_setkey = des_set_key, + .cia_encrypt = sparc_des_encrypt, + .cia_decrypt = sparc_des_decrypt + } } - } -}, { - .cra_name = "ecb(des)", - .cra_driver_name = "ecb-des-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des_sparc64_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .setkey = des_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "cbc(des)", - .cra_driver_name = "cbc-des-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des_sparc64_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .ivsize = DES_BLOCK_SIZE, - .setkey = des_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "des3_ede", - .cra_driver_name = "des3_ede-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES3_EDE_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), - .cra_alignmask = 7, - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = DES3_EDE_KEY_SIZE, - .cia_max_keysize = DES3_EDE_KEY_SIZE, - .cia_setkey = des3_ede_set_key, - .cia_encrypt = sparc_des3_ede_encrypt, - .cia_decrypt = sparc_des3_ede_decrypt + }, { + .cra_name = "des3_ede", + .cra_driver_name = "des3_ede-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES3_EDE_KEY_SIZE, + .cia_max_keysize = DES3_EDE_KEY_SIZE, + .cia_setkey = des3_ede_set_key, + .cia_encrypt = sparc_des3_ede_encrypt, + .cia_decrypt = sparc_des3_ede_decrypt + } } } -}, { - .cra_name = "ecb(des3_ede)", - .cra_driver_name = "ecb-des3_ede-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_EDE_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .setkey = des3_ede_set_key, - .encrypt = ecb3_encrypt, - .decrypt = ecb3_decrypt, - }, - }, -}, { - .cra_name = "cbc(des3_ede)", - .cra_driver_name = "cbc-des3_ede-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_EDE_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .setkey = des3_ede_set_key, - .encrypt = cbc3_encrypt, - .decrypt = cbc3_decrypt, - }, - }, -} }; +}; + +static struct skcipher_alg skcipher_algs[] = { + { + .base.cra_name = "ecb(des)", + .base.cra_driver_name = "ecb-des-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct des_sparc64_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .setkey = des_set_key_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "cbc(des)", + .base.cra_driver_name = "cbc-des-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct des_sparc64_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des_set_key_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "ecb(des3_ede)", + .base.cra_driver_name = "ecb-des3_ede-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .setkey = des3_ede_set_key_skcipher, + .encrypt = ecb3_encrypt, + .decrypt = ecb3_decrypt, + }, { + .base.cra_name = "cbc(des3_ede)", + .base.cra_driver_name = "cbc-des3_ede-sparc64", + .base.cra_priority = SPARC_CR_OPCODE_PRIORITY, + .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), + .base.cra_alignmask = 7, + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = des3_ede_set_key_skcipher, + .encrypt = cbc3_encrypt, + .decrypt = cbc3_decrypt, + } +}; static bool __init sparc64_has_des_opcode(void) { @@ -503,17 +448,27 @@ static bool __init sparc64_has_des_opcode(void) static int __init des_sparc64_mod_init(void) { - if (sparc64_has_des_opcode()) { - pr_info("Using sparc64 des opcodes optimized DES implementation\n"); - return crypto_register_algs(algs, ARRAY_SIZE(algs)); + int err; + + if (!sparc64_has_des_opcode()) { + pr_info("sparc64 des opcodes not available.\n"); + return -ENODEV; } - pr_info("sparc64 des opcodes not available.\n"); - return -ENODEV; + pr_info("Using sparc64 des opcodes optimized DES implementation\n"); + err = crypto_register_algs(cipher_algs, ARRAY_SIZE(cipher_algs)); + if (err) + return err; + err = crypto_register_skciphers(skcipher_algs, + ARRAY_SIZE(skcipher_algs)); + if (err) + crypto_unregister_algs(cipher_algs, ARRAY_SIZE(cipher_algs)); + return err; } static void __exit des_sparc64_mod_fini(void) { - crypto_unregister_algs(algs, ARRAY_SIZE(algs)); + crypto_unregister_algs(cipher_algs, ARRAY_SIZE(cipher_algs)); + crypto_unregister_skciphers(skcipher_algs, ARRAY_SIZE(skcipher_algs)); } module_init(des_sparc64_mod_init); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a8275fea4b70..9b4506373353 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1673,9 +1673,9 @@ void __init setup_per_cpu_areas(void) pcpu_alloc_bootmem, pcpu_free_bootmem); if (rc) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); + pr_warn("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index 61afd787bd0c..7ec79918b566 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -67,7 +67,7 @@ SECTIONS .data1 : { *(.data1) } - RW_DATA_SECTION(SMP_CACHE_BYTES, 0, THREAD_SIZE) + RW_DATA(SMP_CACHE_BYTES, 0, THREAD_SIZE) /* End of data section */ _edata = .; @@ -78,7 +78,6 @@ SECTIONS __stop___fixup = .; } EXCEPTION_TABLE(16) - NOTES . = ALIGN(PAGE_SIZE); __init_begin = ALIGN(PAGE_SIZE); diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index d7086b985f27..7145ce699982 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -9,14 +9,13 @@ _sdata = .; PROVIDE (sdata = .); - RODATA + RO_DATA(4096) .unprotected : { *(.unprotected) } . = ALIGN(4096); PROVIDE (_unprotected_end = .); . = ALIGN(4096); - NOTES EXCEPTION_TABLE(0) BUG_TABLE diff --git a/arch/unicore32/kernel/vmlinux.lds.S b/arch/unicore32/kernel/vmlinux.lds.S index 7abf90537cd5..6fb320b337ef 100644 --- a/arch/unicore32/kernel/vmlinux.lds.S +++ b/arch/unicore32/kernel/vmlinux.lds.S @@ -43,12 +43,11 @@ SECTIONS _etext = .; _sdata = .; - RO_DATA_SECTION(PAGE_SIZE) - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RO_DATA(PAGE_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; EXCEPTION_TABLE(L1_CACHE_BYTES) - NOTES BSS_SECTION(0, 0, 0) _end = .; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ef85139553f..58b31ee198d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86_64 depends on 64BIT # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE - select ARCH_SUPPORTS_INT128 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA @@ -73,7 +73,6 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL - select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY @@ -158,6 +157,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA @@ -932,36 +932,6 @@ config GART_IOMMU If unsure, say Y. -config CALGARY_IOMMU - bool "IBM Calgary IOMMU support" - select IOMMU_HELPER - select SWIOTLB - depends on X86_64 && PCI - ---help--- - Support for hardware IOMMUs in IBM's xSeries x366 and x460 - systems. Needed to run systems with more than 3GB of memory - properly with 32-bit PCI devices that do not support DAC - (Double Address Cycle). Calgary also supports bus level - isolation, where all DMAs pass through the IOMMU. This - prevents them from going anywhere except their intended - destination. This catches hard-to-find kernel bugs and - mis-behaving drivers and devices that do not use the DMA-API - properly to set up their DMA buffers. The IOMMU can be - turned off at boot time with the iommu=off parameter. - Normally the kernel will make the right choice by itself. - If unsure, say Y. - -config CALGARY_IOMMU_ENABLED_BY_DEFAULT - def_bool y - prompt "Should Calgary be enabled by default?" - depends on CALGARY_IOMMU - ---help--- - Should Calgary be enabled by default? if you choose 'y', Calgary - will be used (if it exists). If you choose 'n', Calgary will not be - used even if it exists. If you choose 'n' and would like to use - Calgary anyway, pass 'iommu=calgary' on the kernel command line. - If unsure, say Y. - config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL @@ -1000,8 +970,8 @@ config NR_CPUS_RANGE_END config NR_CPUS_RANGE_END int depends on X86_64 - default 8192 if SMP && ( MAXSMP || CPUMASK_OFFSTACK) - default 512 if SMP && (!MAXSMP && !CPUMASK_OFFSTACK) + default 8192 if SMP && CPUMASK_OFFSTACK + default 512 if SMP && !CPUMASK_OFFSTACK default 1 if !SMP config NR_CPUS_DEFAULT @@ -1254,6 +1224,24 @@ config X86_VSYSCALL_EMULATION Disabling this option saves about 7K of kernel size and possibly 4K of additional runtime pagetable memory. +config X86_IOPL_IOPERM + bool "IOPERM and IOPL Emulation" + default y + ---help--- + This enables the ioperm() and iopl() syscalls which are necessary + for legacy applications. + + Legacy IOPL support is an overbroad mechanism which allows user + space aside of accessing all 65536 I/O ports also to disable + interrupts. To gain this access the caller needs CAP_SYS_RAWIO + capabilities and permission from potentially active security + modules. + + The emulation restricts the functionality of the syscall to + only allowing the full range I/O port access, but prevents the + ability to disable interrupts from user space which would be + granted if the hardware IOPL mechanism would be used. + config TOSHIBA tristate "Toshiba Laptop support" depends on X86_32 @@ -1492,6 +1480,7 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" + default y select DYNAMIC_MEMORY_LAYOUT select SPARSEMEM_VMEMMAP depends on X86_64 @@ -1751,7 +1740,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool depends on MODIFY_LDT_SYSCALL - prompt "Math emulation" if X86_32 + prompt "Math emulation" if X86_32 && (M486SX || MELAN) ---help--- Linux can emulate a math coprocessor (used for floating point operations) if you don't have one. 486DX and Pentium processors have @@ -1880,16 +1869,16 @@ config X86_SMAP If unsure, say Y. -config X86_INTEL_UMIP +config X86_UMIP def_bool y - depends on CPU_SUP_INTEL - prompt "Intel User Mode Instruction Prevention" if EXPERT + depends on CPU_SUP_INTEL || CPU_SUP_AMD + prompt "User Mode Instruction Prevention" if EXPERT ---help--- - The User Mode Instruction Prevention (UMIP) is a security - feature in newer Intel processors. If enabled, a general - protection fault is issued if the SGDT, SLDT, SIDT, SMSW - or STR instructions are executed in user mode. These instructions - unnecessarily expose information about the hardware state. + User Mode Instruction Prevention (UMIP) is a security feature in + some x86 processors. If enabled, a general protection fault is + issued if the SGDT, SLDT, SIDT, SMSW or STR instructions are + executed in user mode. These instructions unnecessarily expose + information about the hardware state. The vast majority of applications do not use these instructions. For the very few that do, software emulation is provided in diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8e29c991ba3e..af9c967782f6 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -50,12 +50,19 @@ choice See each option's help text for additional details. If you don't know what to do, choose "486". +config M486SX + bool "486SX" + depends on X86_32 + ---help--- + Select this for an 486-class CPU without an FPU such as + AMD/Cyrix/IBM/Intel SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5S. + config M486 - bool "486" + bool "486DX" depends on X86_32 ---help--- Select this for an 486-class CPU such as AMD/Cyrix/IBM/Intel - 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. + 486DX/DX2/DX4 and UMC U5D. config M586 bool "586/K5/5x86/6x86/6x86MX" @@ -312,20 +319,20 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if MELAN || M486 || MGEODEGX1 + default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_F00F_BUG def_bool y - depends on M586MMX || M586TSC || M586 || M486 + depends on M586MMX || M586TSC || M586 || M486SX || M486 config X86_INVD_BUG def_bool y - depends on M486 + depends on M486SX || M486 config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y @@ -378,7 +385,7 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486) && !UML + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EXPERT @@ -402,7 +409,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) + depends on M486SX || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -470,7 +477,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on M486 || (EXPERT && !64BIT) + depends on M486SX || M486 || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for UMC processors diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf9cd83de777..409c00f74e60 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -316,10 +316,6 @@ config UNWINDER_FRAME_POINTER unwinder, but the kernel text size will grow by ~3% and the kernel's overall performance will degrade by roughly 5-10%. - This option is recommended if you want to use the livepatch - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 1f5faf8606b4..cd3056759880 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -10,6 +10,7 @@ else tune = $(call cc-option,-mcpu=$(1),$(2)) endif +cflags-$(CONFIG_M486SX) += -march=i486 cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index e2839b5c246c..95410d6ee2ff 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -67,6 +67,7 @@ clean-files += cpustr.h KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) GCOV_PROFILE := n UBSAN_SANITIZE := n @@ -87,7 +88,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6b84afdd7538..aa976adb7094 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -38,6 +38,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) KBUILD_CFLAGS += -Wno-pointer-sign +KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n @@ -72,8 +73,8 @@ $(obj)/../voffset.h: vmlinux FORCE $(obj)/misc.o: $(obj)/../voffset.h -vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ +vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \ + $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 82bc60c8acb2..72b08fde6de6 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -554,7 +554,11 @@ setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_s case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: - e820_type = E820_TYPE_RAM; + if (efi_soft_reserve_enabled() && + (d->attribute & EFI_MEMORY_SP)) + e820_type = E820_TYPE_SOFT_RESERVED; + else + e820_type = E820_TYPE_RAM; break; case EFI_ACPI_MEMORY_NVS: @@ -782,6 +786,9 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) /* Ask the firmware to clear memory on unclean shutdown */ efi_enable_reset_attack_mitigation(sys_table); + + efi_random_get_seed(sys_table); + efi_retrieve_tpm2_eventlog(sys_table); setup_graphics(boot_params); diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S index 257e341fd2c8..ed6c351d34ed 100644 --- a/arch/x86/boot/compressed/efi_stub_32.S +++ b/arch/x86/boot/compressed/efi_stub_32.S @@ -24,7 +24,7 @@ */ .text -ENTRY(efi_call_phys) +SYM_FUNC_START(efi_call_phys) /* * 0. The function can only be called in Linux kernel. So CS has been * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found @@ -77,7 +77,7 @@ ENTRY(efi_call_phys) movl saved_return_addr(%edx), %ecx pushl %ecx ret -ENDPROC(efi_call_phys) +SYM_FUNC_END(efi_call_phys) .previous .data diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index bff9ab7c6317..593913692d16 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -23,7 +23,7 @@ .code64 .text -ENTRY(efi64_thunk) +SYM_FUNC_START(efi64_thunk) push %rbp push %rbx @@ -97,14 +97,14 @@ ENTRY(efi64_thunk) pop %rbx pop %rbp ret -ENDPROC(efi64_thunk) +SYM_FUNC_END(efi64_thunk) -ENTRY(efi_exit32) +SYM_FUNC_START_LOCAL(efi_exit32) movq func_rt_ptr(%rip), %rax push %rax mov %rdi, %rax ret -ENDPROC(efi_exit32) +SYM_FUNC_END(efi_exit32) .code32 /* @@ -112,7 +112,7 @@ ENDPROC(efi_exit32) * * The stack should represent the 32-bit calling convention. */ -ENTRY(efi_enter32) +SYM_FUNC_START_LOCAL(efi_enter32) movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %es @@ -172,20 +172,23 @@ ENTRY(efi_enter32) btsl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 lret -ENDPROC(efi_enter32) +SYM_FUNC_END(efi_enter32) .data .balign 8 - .global efi32_boot_gdt -efi32_boot_gdt: .word 0 - .quad 0 +SYM_DATA_START(efi32_boot_gdt) + .word 0 + .quad 0 +SYM_DATA_END(efi32_boot_gdt) + +SYM_DATA_START_LOCAL(save_gdt) + .word 0 + .quad 0 +SYM_DATA_END(save_gdt) -save_gdt: .word 0 - .quad 0 -func_rt_ptr: .quad 0 +SYM_DATA_LOCAL(func_rt_ptr, .quad 0) - .global efi_gdt64 -efi_gdt64: +SYM_DATA_START(efi_gdt64) .word efi_gdt64_end - efi_gdt64 .long 0 /* Filled out by user */ .word 0 @@ -194,4 +197,4 @@ efi_gdt64: .quad 0x00cf92000000ffff /* __KERNEL_DS */ .quad 0x0080890000000000 /* TS descriptor */ .quad 0x0000000000000000 /* TS continued */ -efi_gdt64_end: +SYM_DATA_END_LABEL(efi_gdt64, SYM_L_LOCAL, efi_gdt64_end) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 5e30eaaf8576..f2dfd6d083ef 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -61,7 +61,7 @@ .hidden _egot __HEAD -ENTRY(startup_32) +SYM_FUNC_START(startup_32) cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -142,14 +142,14 @@ ENTRY(startup_32) */ leal .Lrelocated(%ebx), %eax jmp *%eax -ENDPROC(startup_32) +SYM_FUNC_END(startup_32) #ifdef CONFIG_EFI_STUB /* * We don't need the return address, so set up the stack so efi_main() can find * its arguments. */ -ENTRY(efi_pe_entry) +SYM_FUNC_START(efi_pe_entry) add $0x4, %esp call 1f @@ -174,9 +174,9 @@ ENTRY(efi_pe_entry) pushl %eax pushl %ecx jmp 2f /* Skip efi_config initialization */ -ENDPROC(efi_pe_entry) +SYM_FUNC_END(efi_pe_entry) -ENTRY(efi32_stub_entry) +SYM_FUNC_START(efi32_stub_entry) add $0x4, %esp popl %ecx popl %edx @@ -205,11 +205,11 @@ fail: movl BP_code32_start(%esi), %eax leal startup_32(%eax), %eax jmp *%eax -ENDPROC(efi32_stub_entry) +SYM_FUNC_END(efi32_stub_entry) #endif .text -.Lrelocated: +SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) /* * Clear BSS (stack is currently empty) @@ -260,6 +260,7 @@ ENDPROC(efi32_stub_entry) */ xorl %ebx, %ebx jmp *%eax +SYM_FUNC_END(.Lrelocated) #ifdef CONFIG_EFI_STUB .data diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d98cd483377e..58a512e33d8d 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -45,7 +45,7 @@ __HEAD .code32 -ENTRY(startup_32) +SYM_FUNC_START(startup_32) /* * 32bit entry is 0 and it is ABI so immutable! * If we come here directly from a bootloader, @@ -222,11 +222,11 @@ ENTRY(startup_32) /* Jump from 32bit compatibility mode into 64bit mode. */ lret -ENDPROC(startup_32) +SYM_FUNC_END(startup_32) #ifdef CONFIG_EFI_MIXED .org 0x190 -ENTRY(efi32_stub_entry) +SYM_FUNC_START(efi32_stub_entry) add $0x4, %esp /* Discard return address */ popl %ecx popl %edx @@ -245,12 +245,12 @@ ENTRY(efi32_stub_entry) movl %eax, efi_config(%ebp) jmp startup_32 -ENDPROC(efi32_stub_entry) +SYM_FUNC_END(efi32_stub_entry) #endif .code64 .org 0x200 -ENTRY(startup_64) +SYM_CODE_START(startup_64) /* * 64bit entry is 0x200 and it is ABI so immutable! * We come here either from startup_32 or directly from a @@ -442,11 +442,12 @@ trampoline_return: */ leaq .Lrelocated(%rbx), %rax jmp *%rax +SYM_CODE_END(startup_64) #ifdef CONFIG_EFI_STUB /* The entry point for the PE/COFF executable is efi_pe_entry. */ -ENTRY(efi_pe_entry) +SYM_FUNC_START(efi_pe_entry) movq %rcx, efi64_config(%rip) /* Handle */ movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ @@ -495,10 +496,10 @@ fail: movl BP_code32_start(%esi), %eax leaq startup_64(%rax), %rax jmp *%rax -ENDPROC(efi_pe_entry) +SYM_FUNC_END(efi_pe_entry) .org 0x390 -ENTRY(efi64_stub_entry) +SYM_FUNC_START(efi64_stub_entry) movq %rdi, efi64_config(%rip) /* Handle */ movq %rsi, efi64_config+8(%rip) /* EFI System table pointer */ @@ -507,11 +508,11 @@ ENTRY(efi64_stub_entry) movq %rdx, %rsi jmp handover_entry -ENDPROC(efi64_stub_entry) +SYM_FUNC_END(efi64_stub_entry) #endif .text -.Lrelocated: +SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) /* * Clear BSS (stack is currently empty) @@ -540,6 +541,7 @@ ENDPROC(efi64_stub_entry) * Jump to the decompressed kernel. */ jmp *%rax +SYM_FUNC_END(.Lrelocated) /* * Adjust the global offset table @@ -570,7 +572,7 @@ ENDPROC(efi64_stub_entry) * ECX contains the base address of the trampoline memory. * Non zero RDX means trampoline needs to enable 5-level paging. */ -ENTRY(trampoline_32bit_src) +SYM_CODE_START(trampoline_32bit_src) /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds @@ -633,11 +635,13 @@ ENTRY(trampoline_32bit_src) movl %eax, %cr0 lret +SYM_CODE_END(trampoline_32bit_src) .code64 -.Lpaging_enabled: +SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled) /* Return from the trampoline */ jmp *%rdi +SYM_FUNC_END(.Lpaging_enabled) /* * The trampoline code has a size limit. @@ -647,20 +651,22 @@ ENTRY(trampoline_32bit_src) .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE .code32 -.Lno_longmode: +SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b +SYM_FUNC_END(.Lno_longmode) #include "../../kernel/verify_cpu.S" .data -gdt64: +SYM_DATA_START_LOCAL(gdt64) .word gdt_end - gdt .quad 0 +SYM_DATA_END(gdt64) .balign 8 -gdt: +SYM_DATA_START_LOCAL(gdt) .word gdt_end - gdt .long gdt .word 0 @@ -669,25 +675,24 @@ gdt: .quad 0x00cf92000000ffff /* __KERNEL_DS */ .quad 0x0080890000000000 /* TS descriptor */ .quad 0x0000000000000000 /* TS continued */ -gdt_end: +SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) #ifdef CONFIG_EFI_STUB -efi_config: - .quad 0 +SYM_DATA_LOCAL(efi_config, .quad 0) #ifdef CONFIG_EFI_MIXED - .global efi32_config -efi32_config: +SYM_DATA_START(efi32_config) .fill 5,8,0 .quad efi64_thunk .byte 0 +SYM_DATA_END(efi32_config) #endif - .global efi64_config -efi64_config: +SYM_DATA_START(efi64_config) .fill 5,8,0 .quad efi_call .byte 1 +SYM_DATA_END(efi64_config) #endif /* CONFIG_EFI_STUB */ /* @@ -695,23 +700,21 @@ efi64_config: */ .bss .balign 4 -boot_heap: - .fill BOOT_HEAP_SIZE, 1, 0 -boot_stack: +SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0) + +SYM_DATA_START_LOCAL(boot_stack) .fill BOOT_STACK_SIZE, 1, 0 -boot_stack_end: +SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end) /* * Space for page tables (not in .bss so not zeroed) */ .section ".pgtable","a",@nobits .balign 4096 -pgtable: - .fill BOOT_PGT_SIZE, 1, 0 +SYM_DATA_LOCAL(pgtable, .fill BOOT_PGT_SIZE, 1, 0) /* * The page table is going to be used instead of page table in the trampoline * memory. */ -top_pgtable: - .fill PAGE_SIZE, 1, 0 +SYM_DATA_LOCAL(top_pgtable, .fill PAGE_SIZE, 1, 0) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 2e53c056ba20..d7408af55738 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -132,8 +132,14 @@ char *skip_spaces(const char *str) #include "../../../../lib/ctype.c" #include "../../../../lib/cmdline.c" +enum parse_mode { + PARSE_MEMMAP, + PARSE_EFI, +}; + static int -parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +parse_memmap(char *p, unsigned long long *start, unsigned long long *size, + enum parse_mode mode) { char *oldp; @@ -156,8 +162,29 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) *start = memparse(p + 1, &p); return 0; case '@': - /* memmap=nn@ss specifies usable region, should be skipped */ - *size = 0; + if (mode == PARSE_MEMMAP) { + /* + * memmap=nn@ss specifies usable region, should + * be skipped + */ + *size = 0; + } else { + unsigned long long flags; + + /* + * efi_fake_mem=nn@ss:attr the attr specifies + * flags that might imply a soft-reservation. + */ + *start = memparse(p + 1, &p); + if (p && *p == ':') { + p++; + if (kstrtoull(p, 0, &flags) < 0) + *size = 0; + else if (flags & EFI_MEMORY_SP) + return 0; + } + *size = 0; + } /* Fall through */ default: /* @@ -172,7 +199,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) return -EINVAL; } -static void mem_avoid_memmap(char *str) +static void mem_avoid_memmap(enum parse_mode mode, char *str) { static int i; @@ -187,7 +214,7 @@ static void mem_avoid_memmap(char *str) if (k) *k++ = 0; - rc = parse_memmap(str, &start, &size); + rc = parse_memmap(str, &start, &size, mode); if (rc < 0) break; str = k; @@ -238,7 +265,6 @@ static void parse_gb_huge_pages(char *param, char *val) } } - static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); @@ -271,7 +297,7 @@ static void handle_mem_options(void) } if (!strcmp(param, "memmap")) { - mem_avoid_memmap(val); + mem_avoid_memmap(PARSE_MEMMAP, val); } else if (strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { @@ -284,6 +310,8 @@ static void handle_mem_options(void) goto out; mem_limit = mem_size; + } else if (!strcmp(param, "efi_fake_mem")) { + mem_avoid_memmap(PARSE_EFI, val); } } @@ -459,6 +487,18 @@ static bool mem_avoid_overlap(struct mem_vector *img, is_overlapping = true; } + if (ptr->type == SETUP_INDIRECT && + ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) { + avoid.start = ((struct setup_indirect *)ptr->data)->addr; + avoid.size = ((struct setup_indirect *)ptr->data)->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + earliest = overlap->start; + is_overlapping = true; + } + } + ptr = (struct setup_data *)(unsigned long)ptr->next; } @@ -760,6 +800,10 @@ process_efi_entries(unsigned long minimum, unsigned long image_size) if (md->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + continue; + if (efi_mirror_found && !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) continue; diff --git a/arch/x86/boot/compressed/kernel_info.S b/arch/x86/boot/compressed/kernel_info.S new file mode 100644 index 000000000000..f818ee8fba38 --- /dev/null +++ b/arch/x86/boot/compressed/kernel_info.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/bootparam.h> + + .section ".rodata.kernel_info", "a" + + .global kernel_info + +kernel_info: + /* Header, Linux top (structure). */ + .ascii "LToP" + /* Size. */ + .long kernel_info_var_len_data - kernel_info + /* Size total. */ + .long kernel_info_end - kernel_info + + /* Maximal allowed type for setup_data and setup_indirect structs. */ + .long SETUP_TYPE_MAX + +kernel_info_var_len_data: + /* Empty for time being... */ +kernel_info_end: diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 6afb7130a387..dd07e7b41b11 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -15,7 +15,7 @@ .text .code32 -ENTRY(get_sev_encryption_bit) +SYM_FUNC_START(get_sev_encryption_bit) xor %eax, %eax #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -65,10 +65,10 @@ ENTRY(get_sev_encryption_bit) #endif /* CONFIG_AMD_MEM_ENCRYPT */ ret -ENDPROC(get_sev_encryption_bit) +SYM_FUNC_END(get_sev_encryption_bit) .code64 -ENTRY(set_sev_encryption_mask) +SYM_FUNC_START(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp push %rdx @@ -90,12 +90,11 @@ ENTRY(set_sev_encryption_mask) xor %rax, %rax ret -ENDPROC(set_sev_encryption_mask) +SYM_FUNC_END(set_sev_encryption_mask) .data #ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 -GLOBAL(sme_me_mask) - .quad 0 +SYM_DATA(sme_me_mask, .quad 0) #endif diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 4c5f4f4ad035..6afd05e819d2 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -15,7 +15,7 @@ .code16 .text -GLOBAL(memcpy) +SYM_FUNC_START_NOALIGN(memcpy) pushw %si pushw %di movw %ax, %di @@ -29,9 +29,9 @@ GLOBAL(memcpy) popw %di popw %si retl -ENDPROC(memcpy) +SYM_FUNC_END(memcpy) -GLOBAL(memset) +SYM_FUNC_START_NOALIGN(memset) pushw %di movw %ax, %di movzbl %dl, %eax @@ -44,22 +44,22 @@ GLOBAL(memset) rep; stosb popw %di retl -ENDPROC(memset) +SYM_FUNC_END(memset) -GLOBAL(copy_from_fs) +SYM_FUNC_START_NOALIGN(copy_from_fs) pushw %ds pushw %fs popw %ds calll memcpy popw %ds retl -ENDPROC(copy_from_fs) +SYM_FUNC_END(copy_from_fs) -GLOBAL(copy_to_fs) +SYM_FUNC_START_NOALIGN(copy_to_fs) pushw %es pushw %fs popw %es calll memcpy popw %es retl -ENDPROC(copy_to_fs) +SYM_FUNC_END(copy_to_fs) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2c11c0f45d49..97d9b6d6c1af 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -300,7 +300,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020d # header version number (>= 0x0105) + .word 0x020f # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -567,6 +567,7 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c +kernel_info_offset: .long 0 # Filled in by build.c # End of setup header ##################################################### diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S index c22f9a7d1aeb..cbec8bd0841f 100644 --- a/arch/x86/boot/pmjump.S +++ b/arch/x86/boot/pmjump.S @@ -21,7 +21,7 @@ /* * void protected_mode_jump(u32 entrypoint, u32 bootparams); */ -GLOBAL(protected_mode_jump) +SYM_FUNC_START_NOALIGN(protected_mode_jump) movl %edx, %esi # Pointer to boot_params table xorl %ebx, %ebx @@ -40,13 +40,13 @@ GLOBAL(protected_mode_jump) # Transition to 32-bit mode .byte 0x66, 0xea # ljmpl opcode -2: .long in_pm32 # offset +2: .long .Lin_pm32 # offset .word __BOOT_CS # segment -ENDPROC(protected_mode_jump) +SYM_FUNC_END(protected_mode_jump) .code32 .section ".text32","ax" -GLOBAL(in_pm32) +SYM_FUNC_START_LOCAL_NOALIGN(.Lin_pm32) # Set up data segments for flat 32-bit mode movl %ecx, %ds movl %ecx, %es @@ -72,4 +72,4 @@ GLOBAL(in_pm32) lldt %cx jmpl *%eax # Jump to the 32-bit entrypoint -ENDPROC(in_pm32) +SYM_FUNC_END(.Lin_pm32) diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a93d44e58f9c..55e669d29e54 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -56,6 +56,7 @@ u8 buf[SETUP_SECT_MAX*512]; unsigned long efi32_stub_entry; unsigned long efi64_stub_entry; unsigned long efi_pe_entry; +unsigned long kernel_info; unsigned long startup_64; /*----------------------------------------------------------------------*/ @@ -321,6 +322,7 @@ static void parse_zoffset(char *fname) PARSE_ZOFS(p, efi32_stub_entry); PARSE_ZOFS(p, efi64_stub_entry); PARSE_ZOFS(p, efi_pe_entry); + PARSE_ZOFS(p, kernel_info); PARSE_ZOFS(p, startup_64); p = strchr(p, '\n'); @@ -410,6 +412,9 @@ int main(int argc, char ** argv) efi_stub_entry_update(); + /* Update kernel_info offset. */ + put_unaligned_le32(kernel_info, &buf[0x268]); + crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, dest) != i) die("Writing setup failed"); diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index d0a5ffeae8df..0b9654c7a05c 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -25,7 +25,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_SMP=y -CONFIG_CALGARY_IOMMU=y CONFIG_NR_CPUS=64 CONFIG_SCHED_SMT=y CONFIG_PREEMPT_VOLUNTARY=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 759b1a927826..958440eae27e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o +obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -48,6 +49,7 @@ ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o + obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o endif # These modules require assembler to support AVX2. @@ -70,6 +72,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 4434607e366d..51d46d93efbc 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -71,7 +71,7 @@ * %r8 * %r9 */ -__load_partial: +SYM_FUNC_START_LOCAL(__load_partial) xor %r9d, %r9d pxor MSG, MSG @@ -123,7 +123,7 @@ __load_partial: .Lld_partial_8: ret -ENDPROC(__load_partial) +SYM_FUNC_END(__load_partial) /* * __store_partial: internal ABI @@ -137,7 +137,7 @@ ENDPROC(__load_partial) * %r9 * %r10 */ -__store_partial: +SYM_FUNC_START_LOCAL(__store_partial) mov LEN, %r8 mov DST, %r9 @@ -181,12 +181,12 @@ __store_partial: .Lst_partial_1: ret -ENDPROC(__store_partial) +SYM_FUNC_END(__store_partial) /* * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); */ -ENTRY(crypto_aegis128_aesni_init) +SYM_FUNC_START(crypto_aegis128_aesni_init) FRAME_BEGIN /* load IV: */ @@ -226,13 +226,13 @@ ENTRY(crypto_aegis128_aesni_init) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_init) +SYM_FUNC_END(crypto_aegis128_aesni_init) /* * void crypto_aegis128_aesni_ad(void *state, unsigned int length, * const void *data); */ -ENTRY(crypto_aegis128_aesni_ad) +SYM_FUNC_START(crypto_aegis128_aesni_ad) FRAME_BEGIN cmp $0x10, LEN @@ -378,7 +378,7 @@ ENTRY(crypto_aegis128_aesni_ad) .Lad_out: FRAME_END ret -ENDPROC(crypto_aegis128_aesni_ad) +SYM_FUNC_END(crypto_aegis128_aesni_ad) .macro encrypt_block a s0 s1 s2 s3 s4 i movdq\a (\i * 0x10)(SRC), MSG @@ -402,7 +402,7 @@ ENDPROC(crypto_aegis128_aesni_ad) * void crypto_aegis128_aesni_enc(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_enc) +SYM_FUNC_START(crypto_aegis128_aesni_enc) FRAME_BEGIN cmp $0x10, LEN @@ -493,13 +493,13 @@ ENTRY(crypto_aegis128_aesni_enc) .Lenc_out: FRAME_END ret -ENDPROC(crypto_aegis128_aesni_enc) +SYM_FUNC_END(crypto_aegis128_aesni_enc) /* * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) FRAME_BEGIN /* load the state: */ @@ -533,7 +533,7 @@ ENTRY(crypto_aegis128_aesni_enc_tail) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) .macro decrypt_block a s0 s1 s2 s3 s4 i movdq\a (\i * 0x10)(SRC), MSG @@ -556,7 +556,7 @@ ENDPROC(crypto_aegis128_aesni_enc_tail) * void crypto_aegis128_aesni_dec(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_dec) +SYM_FUNC_START(crypto_aegis128_aesni_dec) FRAME_BEGIN cmp $0x10, LEN @@ -647,13 +647,13 @@ ENTRY(crypto_aegis128_aesni_dec) .Ldec_out: FRAME_END ret -ENDPROC(crypto_aegis128_aesni_dec) +SYM_FUNC_END(crypto_aegis128_aesni_dec) /* * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) FRAME_BEGIN /* load the state: */ @@ -697,13 +697,13 @@ ENTRY(crypto_aegis128_aesni_dec_tail) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) /* * void crypto_aegis128_aesni_final(void *state, void *tag_xor, * u64 assoclen, u64 cryptlen); */ -ENTRY(crypto_aegis128_aesni_final) +SYM_FUNC_START(crypto_aegis128_aesni_final) FRAME_BEGIN /* load the state: */ @@ -744,4 +744,4 @@ ENTRY(crypto_aegis128_aesni_final) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_final) +SYM_FUNC_END(crypto_aegis128_aesni_final) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index 5f6a5af9c489..ec437db1fa54 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -544,11 +544,11 @@ ddq_add_8: * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, * unsigned int num_bytes) */ -ENTRY(aes_ctr_enc_128_avx_by8) +SYM_FUNC_START(aes_ctr_enc_128_avx_by8) /* call the aes main loop */ do_aes_ctrmain KEY_128 -ENDPROC(aes_ctr_enc_128_avx_by8) +SYM_FUNC_END(aes_ctr_enc_128_avx_by8) /* * routine to do AES192 CTR enc/decrypt "by8" @@ -557,11 +557,11 @@ ENDPROC(aes_ctr_enc_128_avx_by8) * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, * unsigned int num_bytes) */ -ENTRY(aes_ctr_enc_192_avx_by8) +SYM_FUNC_START(aes_ctr_enc_192_avx_by8) /* call the aes main loop */ do_aes_ctrmain KEY_192 -ENDPROC(aes_ctr_enc_192_avx_by8) +SYM_FUNC_END(aes_ctr_enc_192_avx_by8) /* * routine to do AES256 CTR enc/decrypt "by8" @@ -570,8 +570,8 @@ ENDPROC(aes_ctr_enc_192_avx_by8) * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, * unsigned int num_bytes) */ -ENTRY(aes_ctr_enc_256_avx_by8) +SYM_FUNC_START(aes_ctr_enc_256_avx_by8) /* call the aes main loop */ do_aes_ctrmain KEY_256 -ENDPROC(aes_ctr_enc_256_avx_by8) +SYM_FUNC_END(aes_ctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index e40bdf024ba7..d28503f99f58 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -1592,7 +1592,7 @@ _esb_loop_\@: * poly = x^128 + x^127 + x^126 + x^121 + 1 * *****************************************************************************/ -ENTRY(aesni_gcm_dec) +SYM_FUNC_START(aesni_gcm_dec) FUNC_SAVE GCM_INIT %arg6, arg7, arg8, arg9 @@ -1600,7 +1600,7 @@ ENTRY(aesni_gcm_dec) GCM_COMPLETE arg10, arg11 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec) +SYM_FUNC_END(aesni_gcm_dec) /***************************************************************************** @@ -1680,7 +1680,7 @@ ENDPROC(aesni_gcm_dec) * * poly = x^128 + x^127 + x^126 + x^121 + 1 ***************************************************************************/ -ENTRY(aesni_gcm_enc) +SYM_FUNC_START(aesni_gcm_enc) FUNC_SAVE GCM_INIT %arg6, arg7, arg8, arg9 @@ -1689,7 +1689,7 @@ ENTRY(aesni_gcm_enc) GCM_COMPLETE arg10, arg11 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc) +SYM_FUNC_END(aesni_gcm_enc) /***************************************************************************** * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1702,12 +1702,12 @@ ENDPROC(aesni_gcm_enc) * const u8 *aad, // Additional Authentication Data (AAD) * u64 aad_len) // Length of AAD in bytes. */ -ENTRY(aesni_gcm_init) +SYM_FUNC_START(aesni_gcm_init) FUNC_SAVE GCM_INIT %arg3, %arg4,%arg5, %arg6 FUNC_RESTORE ret -ENDPROC(aesni_gcm_init) +SYM_FUNC_END(aesni_gcm_init) /***************************************************************************** * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1717,12 +1717,12 @@ ENDPROC(aesni_gcm_init) * const u8 *in, // Plaintext input * u64 plaintext_len, // Length of data in bytes for encryption. */ -ENTRY(aesni_gcm_enc_update) +SYM_FUNC_START(aesni_gcm_enc_update) FUNC_SAVE GCM_ENC_DEC enc FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_update) +SYM_FUNC_END(aesni_gcm_enc_update) /***************************************************************************** * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1732,12 +1732,12 @@ ENDPROC(aesni_gcm_enc_update) * const u8 *in, // Plaintext input * u64 plaintext_len, // Length of data in bytes for encryption. */ -ENTRY(aesni_gcm_dec_update) +SYM_FUNC_START(aesni_gcm_dec_update) FUNC_SAVE GCM_ENC_DEC dec FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_update) +SYM_FUNC_END(aesni_gcm_dec_update) /***************************************************************************** * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1747,19 +1747,18 @@ ENDPROC(aesni_gcm_dec_update) * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), * // 12 or 8. */ -ENTRY(aesni_gcm_finalize) +SYM_FUNC_START(aesni_gcm_finalize) FUNC_SAVE GCM_COMPLETE %arg3 %arg4 FUNC_RESTORE ret -ENDPROC(aesni_gcm_finalize) +SYM_FUNC_END(aesni_gcm_finalize) #endif -.align 4 -_key_expansion_128: -_key_expansion_256a: +SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128) +SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 @@ -1769,11 +1768,10 @@ _key_expansion_256a: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret -ENDPROC(_key_expansion_128) -ENDPROC(_key_expansion_256a) +SYM_FUNC_END(_key_expansion_256a) +SYM_FUNC_END_ALIAS(_key_expansion_128) -.align 4 -_key_expansion_192a: +SYM_FUNC_START_LOCAL(_key_expansion_192a) pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 @@ -1795,10 +1793,9 @@ _key_expansion_192a: movaps %xmm1, 0x10(TKEYP) add $0x20, TKEYP ret -ENDPROC(_key_expansion_192a) +SYM_FUNC_END(_key_expansion_192a) -.align 4 -_key_expansion_192b: +SYM_FUNC_START_LOCAL(_key_expansion_192b) pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 @@ -1815,10 +1812,9 @@ _key_expansion_192b: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret -ENDPROC(_key_expansion_192b) +SYM_FUNC_END(_key_expansion_192b) -.align 4 -_key_expansion_256b: +SYM_FUNC_START_LOCAL(_key_expansion_256b) pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 pxor %xmm4, %xmm2 @@ -1828,13 +1824,13 @@ _key_expansion_256b: movaps %xmm2, (TKEYP) add $0x10, TKEYP ret -ENDPROC(_key_expansion_256b) +SYM_FUNC_END(_key_expansion_256b) /* * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, * unsigned int key_len) */ -ENTRY(aesni_set_key) +SYM_FUNC_START(aesni_set_key) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP @@ -1943,12 +1939,12 @@ ENTRY(aesni_set_key) #endif FRAME_END ret -ENDPROC(aesni_set_key) +SYM_FUNC_END(aesni_set_key) /* * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ -ENTRY(aesni_enc) +SYM_FUNC_START(aesni_enc) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP @@ -1967,7 +1963,7 @@ ENTRY(aesni_enc) #endif FRAME_END ret -ENDPROC(aesni_enc) +SYM_FUNC_END(aesni_enc) /* * _aesni_enc1: internal ABI @@ -1981,8 +1977,7 @@ ENDPROC(aesni_enc) * KEY * TKEYP (T1) */ -.align 4 -_aesni_enc1: +SYM_FUNC_START_LOCAL(_aesni_enc1) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE # round 0 @@ -2025,7 +2020,7 @@ _aesni_enc1: movaps 0x70(TKEYP), KEY AESENCLAST KEY STATE ret -ENDPROC(_aesni_enc1) +SYM_FUNC_END(_aesni_enc1) /* * _aesni_enc4: internal ABI @@ -2045,8 +2040,7 @@ ENDPROC(_aesni_enc1) * KEY * TKEYP (T1) */ -.align 4 -_aesni_enc4: +SYM_FUNC_START_LOCAL(_aesni_enc4) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE1 # round 0 @@ -2134,12 +2128,12 @@ _aesni_enc4: AESENCLAST KEY STATE3 AESENCLAST KEY STATE4 ret -ENDPROC(_aesni_enc4) +SYM_FUNC_END(_aesni_enc4) /* * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ -ENTRY(aesni_dec) +SYM_FUNC_START(aesni_dec) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP @@ -2159,7 +2153,7 @@ ENTRY(aesni_dec) #endif FRAME_END ret -ENDPROC(aesni_dec) +SYM_FUNC_END(aesni_dec) /* * _aesni_dec1: internal ABI @@ -2173,8 +2167,7 @@ ENDPROC(aesni_dec) * KEY * TKEYP (T1) */ -.align 4 -_aesni_dec1: +SYM_FUNC_START_LOCAL(_aesni_dec1) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE # round 0 @@ -2217,7 +2210,7 @@ _aesni_dec1: movaps 0x70(TKEYP), KEY AESDECLAST KEY STATE ret -ENDPROC(_aesni_dec1) +SYM_FUNC_END(_aesni_dec1) /* * _aesni_dec4: internal ABI @@ -2237,8 +2230,7 @@ ENDPROC(_aesni_dec1) * KEY * TKEYP (T1) */ -.align 4 -_aesni_dec4: +SYM_FUNC_START_LOCAL(_aesni_dec4) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE1 # round 0 @@ -2326,13 +2318,13 @@ _aesni_dec4: AESDECLAST KEY STATE3 AESDECLAST KEY STATE4 ret -ENDPROC(_aesni_dec4) +SYM_FUNC_END(_aesni_dec4) /* * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len) */ -ENTRY(aesni_ecb_enc) +SYM_FUNC_START(aesni_ecb_enc) FRAME_BEGIN #ifndef __x86_64__ pushl LEN @@ -2386,13 +2378,13 @@ ENTRY(aesni_ecb_enc) #endif FRAME_END ret -ENDPROC(aesni_ecb_enc) +SYM_FUNC_END(aesni_ecb_enc) /* * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len); */ -ENTRY(aesni_ecb_dec) +SYM_FUNC_START(aesni_ecb_dec) FRAME_BEGIN #ifndef __x86_64__ pushl LEN @@ -2447,13 +2439,13 @@ ENTRY(aesni_ecb_dec) #endif FRAME_END ret -ENDPROC(aesni_ecb_dec) +SYM_FUNC_END(aesni_ecb_dec) /* * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ -ENTRY(aesni_cbc_enc) +SYM_FUNC_START(aesni_cbc_enc) FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2491,13 +2483,13 @@ ENTRY(aesni_cbc_enc) #endif FRAME_END ret -ENDPROC(aesni_cbc_enc) +SYM_FUNC_END(aesni_cbc_enc) /* * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ -ENTRY(aesni_cbc_dec) +SYM_FUNC_START(aesni_cbc_dec) FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2584,7 +2576,7 @@ ENTRY(aesni_cbc_dec) #endif FRAME_END ret -ENDPROC(aesni_cbc_dec) +SYM_FUNC_END(aesni_cbc_dec) #ifdef __x86_64__ .pushsection .rodata @@ -2604,8 +2596,7 @@ ENDPROC(aesni_cbc_dec) * INC: == 1, in little endian * BSWAP_MASK == endian swapping mask */ -.align 4 -_aesni_inc_init: +SYM_FUNC_START_LOCAL(_aesni_inc_init) movaps .Lbswap_mask, BSWAP_MASK movaps IV, CTR PSHUFB_XMM BSWAP_MASK CTR @@ -2613,7 +2604,7 @@ _aesni_inc_init: MOVQ_R64_XMM TCTR_LOW INC MOVQ_R64_XMM CTR TCTR_LOW ret -ENDPROC(_aesni_inc_init) +SYM_FUNC_END(_aesni_inc_init) /* * _aesni_inc: internal ABI @@ -2630,8 +2621,7 @@ ENDPROC(_aesni_inc_init) * CTR: == output IV, in little endian * TCTR_LOW: == lower qword of CTR */ -.align 4 -_aesni_inc: +SYM_FUNC_START_LOCAL(_aesni_inc) paddq INC, CTR add $1, TCTR_LOW jnc .Linc_low @@ -2642,13 +2632,13 @@ _aesni_inc: movaps CTR, IV PSHUFB_XMM BSWAP_MASK IV ret -ENDPROC(_aesni_inc) +SYM_FUNC_END(_aesni_inc) /* * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ -ENTRY(aesni_ctr_enc) +SYM_FUNC_START(aesni_ctr_enc) FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret @@ -2705,7 +2695,7 @@ ENTRY(aesni_ctr_enc) .Lctr_enc_just_ret: FRAME_END ret -ENDPROC(aesni_ctr_enc) +SYM_FUNC_END(aesni_ctr_enc) /* * _aesni_gf128mul_x_ble: internal ABI @@ -2729,7 +2719,7 @@ ENDPROC(aesni_ctr_enc) * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * bool enc, u8 *iv) */ -ENTRY(aesni_xts_crypt8) +SYM_FUNC_START(aesni_xts_crypt8) FRAME_BEGIN cmpb $0, %cl movl $0, %ecx @@ -2833,6 +2823,6 @@ ENTRY(aesni_xts_crypt8) FRAME_END ret -ENDPROC(aesni_xts_crypt8) +SYM_FUNC_END(aesni_xts_crypt8) #endif diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 91c039ab5699..bfa1c0b3e5b4 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -1775,12 +1775,12 @@ _initial_blocks_done\@: # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_init_avx_gen2) +SYM_FUNC_START(aesni_gcm_init_avx_gen2) FUNC_SAVE INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE ret -ENDPROC(aesni_gcm_init_avx_gen2) +SYM_FUNC_END(aesni_gcm_init_avx_gen2) ############################################################################### #void aesni_gcm_enc_update_avx_gen2( @@ -1790,7 +1790,7 @@ ENDPROC(aesni_gcm_init_avx_gen2) # const u8 *in, /* Plaintext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_enc_update_avx_gen2) +SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) FUNC_SAVE mov keysize, %eax cmp $32, %eax @@ -1809,7 +1809,7 @@ key_256_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_update_avx_gen2) +SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) ############################################################################### #void aesni_gcm_dec_update_avx_gen2( @@ -1819,7 +1819,7 @@ ENDPROC(aesni_gcm_enc_update_avx_gen2) # const u8 *in, /* Ciphertext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_update_avx_gen2) +SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -1838,7 +1838,7 @@ key_256_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_update_avx_gen2) +SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) ############################################################################### #void aesni_gcm_finalize_avx_gen2( @@ -1848,7 +1848,7 @@ ENDPROC(aesni_gcm_dec_update_avx_gen2) # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### -ENTRY(aesni_gcm_finalize_avx_gen2) +SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -1867,7 +1867,7 @@ key_256_finalize: GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 FUNC_RESTORE ret -ENDPROC(aesni_gcm_finalize_avx_gen2) +SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -2746,12 +2746,12 @@ _initial_blocks_done\@: # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_init_avx_gen4) +SYM_FUNC_START(aesni_gcm_init_avx_gen4) FUNC_SAVE INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE ret -ENDPROC(aesni_gcm_init_avx_gen4) +SYM_FUNC_END(aesni_gcm_init_avx_gen4) ############################################################################### #void aesni_gcm_enc_avx_gen4( @@ -2761,7 +2761,7 @@ ENDPROC(aesni_gcm_init_avx_gen4) # const u8 *in, /* Plaintext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_enc_update_avx_gen4) +SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -2780,7 +2780,7 @@ key_256_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_update_avx_gen4) +SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) ############################################################################### #void aesni_gcm_dec_update_avx_gen4( @@ -2790,7 +2790,7 @@ ENDPROC(aesni_gcm_enc_update_avx_gen4) # const u8 *in, /* Ciphertext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_update_avx_gen4) +SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -2809,7 +2809,7 @@ key_256_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_update_avx_gen4) +SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) ############################################################################### #void aesni_gcm_finalize_avx_gen4( @@ -2819,7 +2819,7 @@ ENDPROC(aesni_gcm_dec_update_avx_gen4) # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### -ENTRY(aesni_gcm_finalize_avx_gen4) +SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -2838,6 +2838,6 @@ key_256_finalize4: GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 FUNC_RESTORE ret -ENDPROC(aesni_gcm_finalize_avx_gen4) +SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) #endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S new file mode 100644 index 000000000000..24910b766bdd --- /dev/null +++ b/arch/x86/crypto/blake2s-core.S @@ -0,0 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.align 32 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 +.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.align 16 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.align 64 +SIGMA: +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 +#ifdef CONFIG_AS_AVX512 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.align 64 +SIGMA2: +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +#endif /* CONFIG_AS_AVX512 */ + +.text +#ifdef CONFIG_AS_SSSE3 +SYM_FUNC_START(blake2s_compress_ssse3) + testq %rdx,%rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + leaq SIGMA+0xa0(%rip),%r8 + jmp .Lbeginofloop + .align 32 +.Lbeginofloop: + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + leaq SIGMA(%rip),%rcx +.Lroundloop: + movzbl (%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0x1(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x2(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x3(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + punpckldq %xmm5,%xmm4 + punpckldq %xmm7,%xmm6 + punpcklqdq %xmm6,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0x4(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x5(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x6(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0x7(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + punpckldq %xmm6,%xmm5 + punpckldq %xmm4,%xmm7 + punpcklqdq %xmm7,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movzbl 0x8(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x9(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xa(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xb(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + punpckldq %xmm7,%xmm6 + punpckldq %xmm5,%xmm4 + punpcklqdq %xmm4,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0xc(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xd(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xe(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0xf(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + punpckldq %xmm4,%xmm7 + punpckldq %xmm6,%xmm5 + punpcklqdq %xmm5,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + addq $0x10,%rcx + cmpq %r8,%rcx + jnz .Lroundloop + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) +.Lendofloop: + ret +SYM_FUNC_END(blake2s_compress_ssse3) +#endif /* CONFIG_AS_SSSE3 */ + +#ifdef CONFIG_AS_AVX512 +SYM_FUNC_START(blake2s_compress_avx512) + vmovdqu (%rdi),%xmm0 + vmovdqu 0x10(%rdi),%xmm1 + vmovdqu 0x20(%rdi),%xmm4 + vmovq %rcx,%xmm5 + vmovdqa IV(%rip),%xmm14 + vmovdqa IV+16(%rip),%xmm15 + jmp .Lblake2s_compress_avx512_mainloop +.align 32 +.Lblake2s_compress_avx512_mainloop: + vmovdqa %xmm0,%xmm10 + vmovdqa %xmm1,%xmm11 + vpaddq %xmm5,%xmm4,%xmm4 + vmovdqa %xmm14,%xmm2 + vpxor %xmm15,%xmm4,%xmm3 + vmovdqu (%rsi),%ymm6 + vmovdqu 0x20(%rsi),%ymm7 + addq $0x40,%rsi + leaq SIGMA2(%rip),%rax + movb $0xa,%cl +.Lblake2s_compress_avx512_roundloop: + addq $0x40,%rax + vmovdqa -0x40(%rax),%ymm8 + vmovdqa -0x20(%rax),%ymm9 + vpermi2d %ymm7,%ymm6,%ymm8 + vpermi2d %ymm7,%ymm6,%ymm9 + vmovdqa %ymm8,%ymm6 + vmovdqa %ymm9,%ymm7 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm8,%xmm8 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm9,%xmm9 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 + decb %cl + jne .Lblake2s_compress_avx512_roundloop + vpxor %xmm10,%xmm0,%xmm0 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm2,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + decq %rdx + jne .Lblake2s_compress_avx512_mainloop + vmovdqu %xmm0,(%rdi) + vmovdqu %xmm1,0x10(%rdi) + vmovdqu %xmm4,0x20(%rdi) + vzeroupper + retq +SYM_FUNC_END(blake2s_compress_avx512) +#endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c new file mode 100644 index 000000000000..4a37ba7cdbe5 --- /dev/null +++ b/arch/x86/crypto/blake2s-glue.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <crypto/internal/blake2s.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/hash.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/fpu/api.h> +#include <asm/processor.h> +#include <asm/simd.h> + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +void blake2s_compress_arch(struct blake2s_state *state, + const u8 *block, size_t nblocks, + const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + for (;;) { + const size_t blocks = min_t(size_t, nblocks, + PAGE_SIZE / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + if (!nblocks) + break; + block += blocks * BLAKE2S_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(blake2s_compress_arch); + +static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); + + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(tctx->key, key, keylen); + tctx->keylen = keylen; + + return 0; +} + +static int crypto_blake2s_init(struct shash_desc *desc) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct blake2s_state *state = shash_desc_ctx(desc); + const int outlen = crypto_shash_digestsize(desc->tfm); + + if (tctx->keylen) + blake2s_init_key(state, outlen, tctx->key, tctx->keylen); + else + blake2s_init(state, outlen); + + return 0; +} + +static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, + unsigned int inlen) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; + + if (unlikely(!inlen)) + return 0; + if (inlen > fill) { + memcpy(state->buf + state->buflen, in, fill); + blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + state->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2S_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); + /* Hash one less (full) block than strictly possible */ + blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); + } + memcpy(state->buf + state->buflen, in, inlen); + state->buflen += inlen; + + return 0; +} + +static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + + blake2s_set_lastblock(state); + memset(state->buf + state->buflen, 0, + BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ + blake2s_compress_arch(state, state->buf, 1, state->buflen); + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + memcpy(out, state->h, state->outlen); + memzero_explicit(state, sizeof(*state)); + + return 0; +} + +static struct shash_alg blake2s_algs[] = {{ + .base.cra_name = "blake2s-128", + .base.cra_driver_name = "blake2s-128-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_128_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-160", + .base.cra_driver_name = "blake2s-160-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_160_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-224", + .base.cra_driver_name = "blake2s-224-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_224_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-256", + .base.cra_driver_name = "blake2s-256-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_256_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}}; + +static int __init blake2s_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&blake2s_use_ssse3); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); + + return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +static void __exit blake2s_mod_exit(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +module_init(blake2s_mod_init); +module_exit(blake2s_mod_exit); + +MODULE_ALIAS_CRYPTO("blake2s-128"); +MODULE_ALIAS_CRYPTO("blake2s-128-x86"); +MODULE_ALIAS_CRYPTO("blake2s-160"); +MODULE_ALIAS_CRYPTO("blake2s-160-x86"); +MODULE_ALIAS_CRYPTO("blake2s-224"); +MODULE_ALIAS_CRYPTO("blake2s-224-x86"); +MODULE_ALIAS_CRYPTO("blake2s-256"); +MODULE_ALIAS_CRYPTO("blake2s-256-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 330db7a48af8..4222ac6d6584 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -103,7 +103,7 @@ bswapq RX0; \ xorq RX0, (RIO); -ENTRY(__blowfish_enc_blk) +SYM_FUNC_START(__blowfish_enc_blk) /* input: * %rdi: ctx * %rsi: dst @@ -139,9 +139,9 @@ ENTRY(__blowfish_enc_blk) .L__enc_xor: xor_block(); ret; -ENDPROC(__blowfish_enc_blk) +SYM_FUNC_END(__blowfish_enc_blk) -ENTRY(blowfish_dec_blk) +SYM_FUNC_START(blowfish_dec_blk) /* input: * %rdi: ctx * %rsi: dst @@ -171,7 +171,7 @@ ENTRY(blowfish_dec_blk) movq %r11, %r12; ret; -ENDPROC(blowfish_dec_blk) +SYM_FUNC_END(blowfish_dec_blk) /********************************************************************** 4-way blowfish, four blocks parallel @@ -283,7 +283,7 @@ ENDPROC(blowfish_dec_blk) bswapq RX3; \ xorq RX3, 24(RIO); -ENTRY(__blowfish_enc_blk_4way) +SYM_FUNC_START(__blowfish_enc_blk_4way) /* input: * %rdi: ctx * %rsi: dst @@ -330,9 +330,9 @@ ENTRY(__blowfish_enc_blk_4way) popq %rbx; popq %r12; ret; -ENDPROC(__blowfish_enc_blk_4way) +SYM_FUNC_END(__blowfish_enc_blk_4way) -ENTRY(blowfish_dec_blk_4way) +SYM_FUNC_START(blowfish_dec_blk_4way) /* input: * %rdi: ctx * %rsi: dst @@ -365,4 +365,4 @@ ENTRY(blowfish_dec_blk_4way) popq %r12; ret; -ENDPROC(blowfish_dec_blk_4way) +SYM_FUNC_END(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index a14af6eb09cb..d01ddd73de65 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -189,20 +189,20 @@ * larger and would only be 0.5% faster (on sandy-bridge). */ .align 8 -roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: +SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rcx, (%r9)); ret; -ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) +SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 -roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: +SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, %rax, (%r9)); ret; -ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) +SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* * IN/OUT: @@ -722,7 +722,7 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text .align 8 -__camellia_enc_blk16: +SYM_FUNC_START_LOCAL(__camellia_enc_blk16) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 256 bytes @@ -806,10 +806,10 @@ __camellia_enc_blk16: %xmm15, %rax, %rcx, 24); jmp .Lenc_done; -ENDPROC(__camellia_enc_blk16) +SYM_FUNC_END(__camellia_enc_blk16) .align 8 -__camellia_dec_blk16: +SYM_FUNC_START_LOCAL(__camellia_dec_blk16) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 256 bytes @@ -891,9 +891,9 @@ __camellia_dec_blk16: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; -ENDPROC(__camellia_dec_blk16) +SYM_FUNC_END(__camellia_dec_blk16) -ENTRY(camellia_ecb_enc_16way) +SYM_FUNC_START(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -916,9 +916,9 @@ ENTRY(camellia_ecb_enc_16way) FRAME_END ret; -ENDPROC(camellia_ecb_enc_16way) +SYM_FUNC_END(camellia_ecb_enc_16way) -ENTRY(camellia_ecb_dec_16way) +SYM_FUNC_START(camellia_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -946,9 +946,9 @@ ENTRY(camellia_ecb_dec_16way) FRAME_END ret; -ENDPROC(camellia_ecb_dec_16way) +SYM_FUNC_END(camellia_ecb_dec_16way) -ENTRY(camellia_cbc_dec_16way) +SYM_FUNC_START(camellia_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -997,7 +997,7 @@ ENTRY(camellia_cbc_dec_16way) FRAME_END ret; -ENDPROC(camellia_cbc_dec_16way) +SYM_FUNC_END(camellia_cbc_dec_16way) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ @@ -1005,7 +1005,7 @@ ENDPROC(camellia_cbc_dec_16way) vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -ENTRY(camellia_ctr_16way) +SYM_FUNC_START(camellia_ctr_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1110,7 +1110,7 @@ ENTRY(camellia_ctr_16way) FRAME_END ret; -ENDPROC(camellia_ctr_16way) +SYM_FUNC_END(camellia_ctr_16way) #define gf128mul_x_ble(iv, mask, tmp) \ vpsrad $31, iv, tmp; \ @@ -1120,7 +1120,7 @@ ENDPROC(camellia_ctr_16way) vpxor tmp, iv, iv; .align 8 -camellia_xts_crypt_16way: +SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1254,9 +1254,9 @@ camellia_xts_crypt_16way: FRAME_END ret; -ENDPROC(camellia_xts_crypt_16way) +SYM_FUNC_END(camellia_xts_crypt_16way) -ENTRY(camellia_xts_enc_16way) +SYM_FUNC_START(camellia_xts_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1268,9 +1268,9 @@ ENTRY(camellia_xts_enc_16way) leaq __camellia_enc_blk16, %r9; jmp camellia_xts_crypt_16way; -ENDPROC(camellia_xts_enc_16way) +SYM_FUNC_END(camellia_xts_enc_16way) -ENTRY(camellia_xts_dec_16way) +SYM_FUNC_START(camellia_xts_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1286,4 +1286,4 @@ ENTRY(camellia_xts_dec_16way) leaq __camellia_dec_blk16, %r9; jmp camellia_xts_crypt_16way; -ENDPROC(camellia_xts_dec_16way) +SYM_FUNC_END(camellia_xts_dec_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 4be4c7c3ba27..563ef6e83cdd 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -223,20 +223,20 @@ * larger and would only marginally faster. */ .align 8 -roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: +SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rcx, (%r9)); ret; -ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) +SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 -roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: +SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, %rax, (%r9)); ret; -ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) +SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* * IN/OUT: @@ -760,7 +760,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text .align 8 -__camellia_enc_blk32: +SYM_FUNC_START_LOCAL(__camellia_enc_blk32) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes @@ -844,10 +844,10 @@ __camellia_enc_blk32: %ymm15, %rax, %rcx, 24); jmp .Lenc_done; -ENDPROC(__camellia_enc_blk32) +SYM_FUNC_END(__camellia_enc_blk32) .align 8 -__camellia_dec_blk32: +SYM_FUNC_START_LOCAL(__camellia_dec_blk32) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes @@ -929,9 +929,9 @@ __camellia_dec_blk32: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; -ENDPROC(__camellia_dec_blk32) +SYM_FUNC_END(__camellia_dec_blk32) -ENTRY(camellia_ecb_enc_32way) +SYM_FUNC_START(camellia_ecb_enc_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -958,9 +958,9 @@ ENTRY(camellia_ecb_enc_32way) FRAME_END ret; -ENDPROC(camellia_ecb_enc_32way) +SYM_FUNC_END(camellia_ecb_enc_32way) -ENTRY(camellia_ecb_dec_32way) +SYM_FUNC_START(camellia_ecb_dec_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -992,9 +992,9 @@ ENTRY(camellia_ecb_dec_32way) FRAME_END ret; -ENDPROC(camellia_ecb_dec_32way) +SYM_FUNC_END(camellia_ecb_dec_32way) -ENTRY(camellia_cbc_dec_32way) +SYM_FUNC_START(camellia_cbc_dec_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1060,7 +1060,7 @@ ENTRY(camellia_cbc_dec_32way) FRAME_END ret; -ENDPROC(camellia_cbc_dec_32way) +SYM_FUNC_END(camellia_cbc_dec_32way) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ @@ -1076,7 +1076,7 @@ ENDPROC(camellia_cbc_dec_32way) vpslldq $8, tmp1, tmp1; \ vpsubq tmp1, x, x; -ENTRY(camellia_ctr_32way) +SYM_FUNC_START(camellia_ctr_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1200,7 +1200,7 @@ ENTRY(camellia_ctr_32way) FRAME_END ret; -ENDPROC(camellia_ctr_32way) +SYM_FUNC_END(camellia_ctr_32way) #define gf128mul_x_ble(iv, mask, tmp) \ vpsrad $31, iv, tmp; \ @@ -1222,7 +1222,7 @@ ENDPROC(camellia_ctr_32way) vpxor tmp1, iv, iv; .align 8 -camellia_xts_crypt_32way: +SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1367,9 +1367,9 @@ camellia_xts_crypt_32way: FRAME_END ret; -ENDPROC(camellia_xts_crypt_32way) +SYM_FUNC_END(camellia_xts_crypt_32way) -ENTRY(camellia_xts_enc_32way) +SYM_FUNC_START(camellia_xts_enc_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1382,9 +1382,9 @@ ENTRY(camellia_xts_enc_32way) leaq __camellia_enc_blk32, %r9; jmp camellia_xts_crypt_32way; -ENDPROC(camellia_xts_enc_32way) +SYM_FUNC_END(camellia_xts_enc_32way) -ENTRY(camellia_xts_dec_32way) +SYM_FUNC_START(camellia_xts_dec_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1400,4 +1400,4 @@ ENTRY(camellia_xts_dec_32way) leaq __camellia_dec_blk32, %r9; jmp camellia_xts_crypt_32way; -ENDPROC(camellia_xts_dec_32way) +SYM_FUNC_END(camellia_xts_dec_32way) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 23528bc18fc6..1372e6408850 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -175,7 +175,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -ENTRY(__camellia_enc_blk) +SYM_FUNC_START(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -220,9 +220,9 @@ ENTRY(__camellia_enc_blk) movq RR12, %r12; ret; -ENDPROC(__camellia_enc_blk) +SYM_FUNC_END(__camellia_enc_blk) -ENTRY(camellia_dec_blk) +SYM_FUNC_START(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -258,7 +258,7 @@ ENTRY(camellia_dec_blk) movq RR12, %r12; ret; -ENDPROC(camellia_dec_blk) +SYM_FUNC_END(camellia_dec_blk) /********************************************************************** 2-way camellia @@ -409,7 +409,7 @@ ENDPROC(camellia_dec_blk) bswapq RAB1; \ movq RAB1, 12*2(RIO); -ENTRY(__camellia_enc_blk_2way) +SYM_FUNC_START(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -456,9 +456,9 @@ ENTRY(__camellia_enc_blk_2way) movq RR12, %r12; popq %rbx; ret; -ENDPROC(__camellia_enc_blk_2way) +SYM_FUNC_END(__camellia_enc_blk_2way) -ENTRY(camellia_dec_blk_2way) +SYM_FUNC_START(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -496,4 +496,4 @@ ENTRY(camellia_dec_blk_2way) movq RR12, %r12; movq RXOR, %rbx; ret; -ENDPROC(camellia_dec_blk_2way) +SYM_FUNC_END(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index dc55c3332fcc..8a6181b08b59 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -209,7 +209,7 @@ .text .align 16 -__cast5_enc_blk16: +SYM_FUNC_START_LOCAL(__cast5_enc_blk16) /* input: * %rdi: ctx * RL1: blocks 1 and 2 @@ -280,10 +280,10 @@ __cast5_enc_blk16: outunpack_blocks(RR4, RL4, RTMP, RX, RKM); ret; -ENDPROC(__cast5_enc_blk16) +SYM_FUNC_END(__cast5_enc_blk16) .align 16 -__cast5_dec_blk16: +SYM_FUNC_START_LOCAL(__cast5_dec_blk16) /* input: * %rdi: ctx * RL1: encrypted blocks 1 and 2 @@ -357,9 +357,9 @@ __cast5_dec_blk16: .L__skip_dec: vpsrldq $4, RKR, RKR; jmp .L__dec_tail; -ENDPROC(__cast5_dec_blk16) +SYM_FUNC_END(__cast5_dec_blk16) -ENTRY(cast5_ecb_enc_16way) +SYM_FUNC_START(cast5_ecb_enc_16way) /* input: * %rdi: ctx * %rsi: dst @@ -394,9 +394,9 @@ ENTRY(cast5_ecb_enc_16way) popq %r15; FRAME_END ret; -ENDPROC(cast5_ecb_enc_16way) +SYM_FUNC_END(cast5_ecb_enc_16way) -ENTRY(cast5_ecb_dec_16way) +SYM_FUNC_START(cast5_ecb_dec_16way) /* input: * %rdi: ctx * %rsi: dst @@ -432,9 +432,9 @@ ENTRY(cast5_ecb_dec_16way) popq %r15; FRAME_END ret; -ENDPROC(cast5_ecb_dec_16way) +SYM_FUNC_END(cast5_ecb_dec_16way) -ENTRY(cast5_cbc_dec_16way) +SYM_FUNC_START(cast5_cbc_dec_16way) /* input: * %rdi: ctx * %rsi: dst @@ -484,9 +484,9 @@ ENTRY(cast5_cbc_dec_16way) popq %r12; FRAME_END ret; -ENDPROC(cast5_cbc_dec_16way) +SYM_FUNC_END(cast5_cbc_dec_16way) -ENTRY(cast5_ctr_16way) +SYM_FUNC_START(cast5_ctr_16way) /* input: * %rdi: ctx * %rsi: dst @@ -560,4 +560,4 @@ ENTRY(cast5_ctr_16way) popq %r12; FRAME_END ret; -ENDPROC(cast5_ctr_16way) +SYM_FUNC_END(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 4f0a7cdb94d9..932a3ce32a88 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -247,7 +247,7 @@ .text .align 8 -__cast6_enc_blk8: +SYM_FUNC_START_LOCAL(__cast6_enc_blk8) /* input: * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks @@ -292,10 +292,10 @@ __cast6_enc_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; -ENDPROC(__cast6_enc_blk8) +SYM_FUNC_END(__cast6_enc_blk8) .align 8 -__cast6_dec_blk8: +SYM_FUNC_START_LOCAL(__cast6_dec_blk8) /* input: * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks @@ -339,9 +339,9 @@ __cast6_dec_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; -ENDPROC(__cast6_dec_blk8) +SYM_FUNC_END(__cast6_dec_blk8) -ENTRY(cast6_ecb_enc_8way) +SYM_FUNC_START(cast6_ecb_enc_8way) /* input: * %rdi: ctx * %rsi: dst @@ -362,9 +362,9 @@ ENTRY(cast6_ecb_enc_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_ecb_enc_8way) +SYM_FUNC_END(cast6_ecb_enc_8way) -ENTRY(cast6_ecb_dec_8way) +SYM_FUNC_START(cast6_ecb_dec_8way) /* input: * %rdi: ctx * %rsi: dst @@ -385,9 +385,9 @@ ENTRY(cast6_ecb_dec_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_ecb_dec_8way) +SYM_FUNC_END(cast6_ecb_dec_8way) -ENTRY(cast6_cbc_dec_8way) +SYM_FUNC_START(cast6_cbc_dec_8way) /* input: * %rdi: ctx * %rsi: dst @@ -411,9 +411,9 @@ ENTRY(cast6_cbc_dec_8way) popq %r12; FRAME_END ret; -ENDPROC(cast6_cbc_dec_8way) +SYM_FUNC_END(cast6_cbc_dec_8way) -ENTRY(cast6_ctr_8way) +SYM_FUNC_START(cast6_ctr_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -439,9 +439,9 @@ ENTRY(cast6_ctr_8way) popq %r12; FRAME_END ret; -ENDPROC(cast6_ctr_8way) +SYM_FUNC_END(cast6_ctr_8way) -ENTRY(cast6_xts_enc_8way) +SYM_FUNC_START(cast6_xts_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -466,9 +466,9 @@ ENTRY(cast6_xts_enc_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_xts_enc_8way) +SYM_FUNC_END(cast6_xts_enc_8way) -ENTRY(cast6_xts_dec_8way) +SYM_FUNC_START(cast6_xts_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -493,4 +493,4 @@ ENTRY(cast6_xts_dec_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_xts_dec_8way) +SYM_FUNC_END(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S index 831e4434fc20..ee9a40ab4109 100644 --- a/arch/x86/crypto/chacha-avx2-x86_64.S +++ b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -34,7 +34,7 @@ CTR4BL: .octa 0x00000000000000000000000000000002 .text -ENTRY(chacha_2block_xor_avx2) +SYM_FUNC_START(chacha_2block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 2 data blocks output, o # %rdx: up to 2 data blocks input, i @@ -224,9 +224,9 @@ ENTRY(chacha_2block_xor_avx2) lea -8(%r10),%rsp jmp .Ldone2 -ENDPROC(chacha_2block_xor_avx2) +SYM_FUNC_END(chacha_2block_xor_avx2) -ENTRY(chacha_4block_xor_avx2) +SYM_FUNC_START(chacha_4block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o # %rdx: up to 4 data blocks input, i @@ -529,9 +529,9 @@ ENTRY(chacha_4block_xor_avx2) lea -8(%r10),%rsp jmp .Ldone4 -ENDPROC(chacha_4block_xor_avx2) +SYM_FUNC_END(chacha_4block_xor_avx2) -ENTRY(chacha_8block_xor_avx2) +SYM_FUNC_START(chacha_8block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o # %rdx: up to 8 data blocks input, i @@ -1018,4 +1018,4 @@ ENTRY(chacha_8block_xor_avx2) jmp .Ldone8 -ENDPROC(chacha_8block_xor_avx2) +SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S index 848f9c75fd4f..bb193fde123a 100644 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -24,7 +24,7 @@ CTR8BL: .octa 0x00000003000000020000000100000000 .text -ENTRY(chacha_2block_xor_avx512vl) +SYM_FUNC_START(chacha_2block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 2 data blocks output, o # %rdx: up to 2 data blocks input, i @@ -187,9 +187,9 @@ ENTRY(chacha_2block_xor_avx512vl) jmp .Ldone2 -ENDPROC(chacha_2block_xor_avx512vl) +SYM_FUNC_END(chacha_2block_xor_avx512vl) -ENTRY(chacha_4block_xor_avx512vl) +SYM_FUNC_START(chacha_4block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o # %rdx: up to 4 data blocks input, i @@ -453,9 +453,9 @@ ENTRY(chacha_4block_xor_avx512vl) jmp .Ldone4 -ENDPROC(chacha_4block_xor_avx512vl) +SYM_FUNC_END(chacha_4block_xor_avx512vl) -ENTRY(chacha_8block_xor_avx512vl) +SYM_FUNC_START(chacha_8block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o # %rdx: up to 8 data blocks input, i @@ -833,4 +833,4 @@ ENTRY(chacha_8block_xor_avx512vl) jmp .Ldone8 -ENDPROC(chacha_8block_xor_avx512vl) +SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S index 2d86c7d6dc88..a38ab2512a6f 100644 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -33,7 +33,7 @@ CTRINC: .octa 0x00000003000000020000000100000000 * * Clobbers: %r8d, %xmm4-%xmm7 */ -chacha_permute: +SYM_FUNC_START_LOCAL(chacha_permute) movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 @@ -109,9 +109,9 @@ chacha_permute: jnz .Ldoubleround ret -ENDPROC(chacha_permute) +SYM_FUNC_END(chacha_permute) -ENTRY(chacha_block_xor_ssse3) +SYM_FUNC_START(chacha_block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 1 data block output, o # %rdx: up to 1 data block input, i @@ -197,9 +197,9 @@ ENTRY(chacha_block_xor_ssse3) lea -8(%r10),%rsp jmp .Ldone -ENDPROC(chacha_block_xor_ssse3) +SYM_FUNC_END(chacha_block_xor_ssse3) -ENTRY(hchacha_block_ssse3) +SYM_FUNC_START(hchacha_block_ssse3) # %rdi: Input state matrix, s # %rsi: output (8 32-bit words) # %edx: nrounds @@ -218,9 +218,9 @@ ENTRY(hchacha_block_ssse3) FRAME_END ret -ENDPROC(hchacha_block_ssse3) +SYM_FUNC_END(hchacha_block_ssse3) -ENTRY(chacha_4block_xor_ssse3) +SYM_FUNC_START(chacha_4block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o # %rdx: up to 4 data blocks input, i @@ -788,4 +788,4 @@ ENTRY(chacha_4block_xor_ssse3) jmp .Ldone4 -ENDPROC(chacha_4block_xor_ssse3) +SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 388f95a4ec24..a94e30b6f941 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -7,7 +7,7 @@ */ #include <crypto/algapi.h> -#include <crypto/chacha.h> +#include <crypto/internal/chacha.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> @@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); -#ifdef CONFIG_AS_AVX2 + asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx2; -#ifdef CONFIG_AS_AVX512 + asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx512vl; -#endif -#endif + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) { @@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { -#ifdef CONFIG_AS_AVX2 -#ifdef CONFIG_AS_AVX512 - if (chacha_use_avx512vl) { + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&chacha_use_avx512vl)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); @@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif - if (chacha_use_avx2) { + + if (IS_ENABLED(CONFIG_AS_AVX2) && + static_branch_likely(&chacha_use_avx2)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; @@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif + while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 4; @@ -123,37 +123,76 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha_simd_stream_xor(struct skcipher_walk *walk, +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { + hchacha_block_generic(state, stream, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, stream, nrounds); + kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, bytes, nrounds); + kernel_fpu_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); - int next_yield = 4096; /* bytes until next FPU yield */ - int err = 0; + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, iv); - - while (walk->nbytes > 0) { - unsigned int nbytes = walk->nbytes; + chacha_init_generic(state, ctx->key, iv); - if (nbytes < walk->total) { - nbytes = round_down(nbytes, walk->stride); - next_yield -= nbytes; - } + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; - chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr, - nbytes, ctx->nrounds); + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); - if (next_yield <= 0) { - /* temporarily allow preemption */ - kernel_fpu_end(); + if (!static_branch_likely(&chacha_use_simd) || + !crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { kernel_fpu_begin(); - next_yield = 4096; + chacha_dosimd(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + kernel_fpu_end(); } - - err = skcipher_walk_done(walk, walk->nbytes - nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; @@ -163,55 +202,34 @@ static int chacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; - - kernel_fpu_begin(); - err = chacha_simd_stream_xor(&walk, ctx, req->iv); - kernel_fpu_end(); - return err; + return chacha_simd_stream_xor(req, ctx, req->iv); } static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - struct chacha_ctx subctx; u32 *state, state_buf[16 + 2] __aligned(8); + struct chacha_ctx subctx; u8 real_iv[16]; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_xchacha_crypt(req); - - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, req->iv); - - kernel_fpu_begin(); - - hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + chacha_init_generic(state, ctx->key, req->iv); + + if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { + kernel_fpu_begin(); + hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + kernel_fpu_end(); + } else { + hchacha_block_generic(state, subctx.key, ctx->nrounds); + } subctx.nrounds = ctx->nrounds; memcpy(&real_iv[0], req->iv + 24, 8); memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha_simd_stream_xor(&walk, &subctx, real_iv); - - kernel_fpu_end(); - - return err; + return chacha_simd_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { @@ -227,7 +245,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = chacha_simd, .decrypt = chacha_simd, }, { @@ -242,7 +260,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, { @@ -257,7 +275,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha12_setkey, + .setkey = chacha12_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, @@ -266,24 +284,28 @@ static struct skcipher_alg algs[] = { static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); -#ifdef CONFIG_AS_AVX512 - chacha_use_avx512vl = chacha_use_avx2 && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ -#endif -#endif + return 0; + + static_branch_enable(&chacha_use_simd); + + if (IS_ENABLED(CONFIG_AS_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha_simd_mod_fini(void) { - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha_simd_mod_init); diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index 1c099dc08cc3..9fd28ff65bc2 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -103,7 +103,7 @@ * size_t len, uint crc32) */ -ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ +SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ movdqa (BUF), %xmm1 movdqa 0x10(BUF), %xmm2 movdqa 0x20(BUF), %xmm3 @@ -238,4 +238,4 @@ fold_64: PEXTRD 0x01, %xmm1, %eax ret -ENDPROC(crc32_pclmul_le_16) +SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index d9b734d0c8cc..0e6690e3618c 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -74,7 +74,7 @@ # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); .text -ENTRY(crc_pcl) +SYM_FUNC_START(crc_pcl) #define bufp %rdi #define bufp_dw %edi #define bufp_w %di @@ -311,7 +311,7 @@ do_return: popq %rdi popq %rbx ret -ENDPROC(crc_pcl) +SYM_FUNC_END(crc_pcl) .section .rodata, "a", @progbits ################################################################ diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index 3d873e67749d..b2533d63030e 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -95,7 +95,7 @@ # Assumes len >= 16. # .align 16 -ENTRY(crc_t10dif_pcl) +SYM_FUNC_START(crc_t10dif_pcl) movdqa .Lbswap_mask(%rip), BSWAP_MASK @@ -280,7 +280,7 @@ ENTRY(crc_t10dif_pcl) jge .Lfold_16_bytes_loop # 32 <= len <= 255 add $16, len jmp .Lhandle_partial_segment # 17 <= len <= 31 -ENDPROC(crc_t10dif_pcl) +SYM_FUNC_END(crc_t10dif_pcl) .section .rodata, "a", @progbits .align 16 diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c new file mode 100644 index 000000000000..a52a3fb15727 --- /dev/null +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -0,0 +1,2475 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved. + * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <crypto/curve25519.h> +#include <crypto/internal/kpp.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx); + +enum { NUM_WORDS_ELTFP25519 = 4 }; +typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519]; +typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519]; + +#define mul_eltfp25519_1w_adx(c, a, b) do { \ + mul_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_1w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_1w_bmi2(c, a, b) do { \ + mul_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_1w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_adx(a) do { \ + sqr_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_1w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_bmi2(a) do { \ + sqr_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_1w_bmi2(a, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_adx(c, a, b) do { \ + mul2_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_2w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_bmi2(c, a, b) do { \ + mul2_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_2w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_adx(a) do { \ + sqr2_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_2w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_bmi2(a) do { \ + sqr2_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_2w_bmi2(a, m.buffer); \ +} while (0) + +#define sqrn_eltfp25519_1w_adx(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_adx(a); \ +} while (0) + +#define sqrn_eltfp25519_1w_bmi2(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_bmi2(a); \ +} while (0) + +#define copy_eltfp25519_1w(C, A) do { \ + (C)[0] = (A)[0]; \ + (C)[1] = (A)[1]; \ + (C)[2] = (A)[2]; \ + (C)[3] = (A)[3]; \ +} while (0) + +#define setzero_eltfp25519_1w(C) do { \ + (C)[0] = 0; \ + (C)[1] = 0; \ + (C)[2] = 0; \ + (C)[3] = 0; \ +} while (0) + +__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = { + /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL, + 0xffffffffffffffffUL, 0x5fffffffffffffffUL, + /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL, + 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL, + /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL, + 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL, + /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL, + 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL, + /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL, + 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL, + /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL, + 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL, + /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL, + 0xc1c20d06231f7614UL, 0x2938218da274f972UL, + /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL, + 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL, + /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL, + 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL, + /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL, + 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL, + /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL, + 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL, + /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL, + 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL, + /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL, + 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL, + /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL, + 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL, + /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL, + 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL, + /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL, + 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL, + /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL, + 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL, + /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL, + 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL, + /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL, + 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL, + /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL, + 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL, + /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL, + 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL, + /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL, + 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL, + /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL, + 0x23758739f630a257UL, 0x295a407a01a78580UL, + /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL, + 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL, + /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL, + 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL, + /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL, + 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL, + /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL, + 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL, + /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL, + 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL, + /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL, + 0x74b4c4ceab102f64UL, 0x183abadd10139845UL, + /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL, + 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL, + /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL, + 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL, + /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL, + 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL, + /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL, + 0xd88768e4904032d8UL, 0x131384427b3aaeecUL, + /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL, + 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL, + /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL, + 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL, + /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL, + 0xa401760b882c797aUL, 0x1fc223e28dc88730UL, + /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL, + 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL, + /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL, + 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL, + /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL, + 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL, + /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL, + 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL, + /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL, + 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL, + /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL, + 0x5c217736fa279374UL, 0x7dde05734afeb1faUL, + /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL, + 0xe6053bf89595bf7aUL, 0x394faf38da245530UL, + /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL, + 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL, + /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL, + 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL, + /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL, + 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL, + /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL, + 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL, + /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL, + 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL, + /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL, + 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL, + /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL, + 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL, + /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL, + 0xc189218075e91436UL, 0x6d9284169b3b8484UL, + /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL, + 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL, + /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL, + 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL, + /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL, + 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL, + /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL, + 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL, + /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL, + 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL, + /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL, + 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL, + /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL, + 0xf826842130f5ad28UL, 0x3ea988f75301a441UL, + /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL, + 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL, + /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL, + 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL, + /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL, + 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL, + /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL, + 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL, + /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL, + 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL, + /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL, + 0x25232973322dbef4UL, 0x445dc4758c17f770UL, + /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL, + 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL, + /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL, + 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL, + /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL, + 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL, + /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL, + 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL, + /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL, + 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL, + /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL, + 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL, + /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL, + 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL, + /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL, + 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL, + /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL, + 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL, + /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL, + 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL, + /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL, + 0x674f1288f8e11217UL, 0x5682250f329f93d0UL, + /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL, + 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL, + /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL, + 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL, + /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL, + 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL, + /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL, + 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL, + /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL, + 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL, + /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL, + 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL, + /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL, + 0x894d1d855ae52359UL, 0x68e122157b743d69UL, + /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL, + 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL, + /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL, + 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL, + /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL, + 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL, + /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL, + 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL, + /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL, + 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL, + /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL, + 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL, + /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL, + 0x45adb16e76cefcf2UL, 0x01f768aead232999UL, + /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL, + 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL, + /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL, + 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL, + /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL, + 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL, + /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL, + 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL, + /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL, + 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL, + /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL, + 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL, + /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL, + 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL, + /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL, + 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL, + /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL, + 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL, + /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL, + 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL, + /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL, + 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL, + /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL, + 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL, + /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL, + 0x4a497962066e6043UL, 0x705b3aab41355b44UL, + /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL, + 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL, + /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL, + 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL, + /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL, + 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL, + /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL, + 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL, + /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL, + 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL, + /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL, + 0x2088ce1570033c68UL, 0x7fba1f495c837987UL, + /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL, + 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL, + /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL, + 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL, + /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL, + 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL, + /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL, + 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL, + /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL, + 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL, + /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL, + 0x00f52e3f67280294UL, 0x566d4fc14730c509UL, + /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL, + 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL, + /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL, + 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL, + /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL, + 0x508e862f121692fcUL, 0x3a81907fa093c291UL, + /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL, + 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL, + /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL, + 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL, + /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL, + 0xe488de11d761e352UL, 0x0e878a01a085545cUL, + /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL, + 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL, + /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL, + 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL, + /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL, + 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL, + /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL, + 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL, + /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL, + 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL, + /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL, + 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL, + /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL, + 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL, + /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL, + 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL, + /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL, + 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL, + /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL, + 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL, + /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL, + 0x904659bb686e3772UL, 0x7215c371746ba8c8UL, + /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL, + 0x266fd5809208f294UL, 0x5c847085619a26b9UL, + /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL, + 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL, + /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL, + 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL, + /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL, + 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL, + /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL, + 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL, + /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL, + 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL, + /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL, + 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL, + /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL, + 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL, + /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL, + 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL, + /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL, + 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL, + /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL, + 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL, + /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL, + 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL, + /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL, + 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL, + /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL, + 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL, + /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL, + 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL, + /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL, + 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL, + /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL, + 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL, + /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL, + 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL, + /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL, + 0x52d17436309d4253UL, 0x356f97e13efae576UL, + /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL, + 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL, + /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL, + 0x66124c6f97bda770UL, 0x0f81a0290654124aUL, + /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL, + 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL, + /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL, + 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL, + /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL, + 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL, + /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL, + 0x5da643cb4bf30035UL, 0x77db28d63940f721UL, + /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL, + 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL, + /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL, + 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL, + /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL, + 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL, + /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL, + 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL, + /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL, + 0x497d723f802e88e1UL, 0x30684dea602f408dUL, + /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL, + 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL, + /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL, + 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL, + /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL, + 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL, + /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL, + 0x026df551dbb85c20UL, 0x74fcd91047e21901UL, + /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL, + 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL, + /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL, + 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL, + /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL, + 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL, + /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL, + 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL, + /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL, + 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL, + /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL, + 0x13033ac001f66697UL, 0x273b24fe3b367d75UL, + /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL, + 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL, + /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL, + 0xacc63ca34b8ec145UL, 0x74621888fee66574UL, + /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL, + 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL, + /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL, + 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL, + /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL, + 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL, + /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL, + 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL, + /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL, + 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL, + /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL, + 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL, + /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL, + 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL, + /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL, + 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL, + /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL, + 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL, + /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL, + 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL, + /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL, + 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL, + /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL, + 0x81004b71e33cc191UL, 0x44e6be345122803cUL, + /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL, + 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL, + /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL, + 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL, + /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL, + 0x12bc8d6915783712UL, 0x498194c0fc620abbUL, + /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL, + 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL, + /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL, + 0x1e60c24598c71fffUL, 0x59f2f014979983efUL, + /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL, + 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL, + /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL, + 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL, + /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL, + 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL, + /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL, + 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL, + /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL, + 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL, + /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL, + 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL, + /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL, + 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL, + /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL, + 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL, + /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL, + 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL, + /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL, + 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL, + /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL, + 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL, + /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL, + 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL, + /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL, + 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL, + /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL, + 0x33979624f0e917beUL, 0x2c018dc527356b30UL, + /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL, + 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL, + /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL, + 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL, + /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL, + 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL, + /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL, + 0x345ead5e972d091eUL, 0x18c8df11a83103baUL, + /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL, + 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL, + /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL, + 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL, + /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL, + 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL, + /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL, + 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL, + /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL, + 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL, + /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL, + 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL, + /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL, + 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL, + /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL, + 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL, + /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL, + 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL, + /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL, + 0x1df4c0af01314a60UL, 0x09a62dab89289527UL, + /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL, + 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL, + /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL, + 0x49b96853d7a7084aUL, 0x4980a319601420a8UL, + /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL, + 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL, + /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL, + 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL, + /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL, + 0xddeb34a061615d99UL, 0x5129cecceb64b773UL, + /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL, + 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL, + /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL, + 0x680bd77c73edad2eUL, 0x487c02354edd9041UL, + /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL, + 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL, + /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL, + 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL, + /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL, + 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL, + /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL, + 0xe9834262d13921edUL, 0x27fedafaa54bb592UL, + /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL, + 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL, + /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL, + 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL, + /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL, + 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL, + /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL, + 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL, + /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL, + 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL, + /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL, + 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL, + /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL, + 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL, + /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL, + 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL, + /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL, + 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL, + /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL, + 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL, + /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL, + 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL, + /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL, + 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL, + /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL, + 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL, + /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL, + 0x2c43ecea0107c1ddUL, 0x526028809372de35UL, + /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL, + 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL, + /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL, + 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL, + /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL, + 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL, + /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL, + 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL, + /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL, + 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL, + /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL, + 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL, + /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL, + 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL, + /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL, + 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL, + /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL, + 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL +}; + +/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7] + * a is two 256-bit integers: a0[0:3] and a1[4:7] + * b is two 256-bit integers: b0[0:3] and b1[4:7] + */ +static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "xorl %%r14d, %%r14d ;" + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "adox %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adox %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, 64(%0);" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "adox %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adox %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 72(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 80(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 88(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "movq %%r8, 64(%0) ;" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "addq %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adcq %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 72(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 80(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 88(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ + "xorl %%r15d, %%r15d;" + "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ + "adcx %%r14, %%r9 ;" + "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ + "adcx %%rax, %%r10 ;" + "movq 56(%1), %%rdx ;" /* B[3] */ + "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ + "adcx %%rax, %%rbx ;" + "movq 40(%1), %%rdx ;" /* B[1] */ + "adcx %%r15, %%r13 ;" + "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq 32(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + "movq 40(%1), %%rdx ;" /* B[1] */ + "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */ + "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */ + "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */ + + "movq 48(%1), %%rdx ;" /* B[2] */ + "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */ + "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox 64(%1), %%r8 ;" + "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 72(%1), %%r9 ;" + "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 80(%1), %%r10 ;" + "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 88(%1), %%r11 ;" + /****************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /****************************************/ + "adcq $0, %%rcx ;" + "addq 64(%1), %%r8 ;" + "adcq 72(%1), %%r9 ;" + "adcq 80(%1), %%r10 ;" + "adcq 88(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void mul_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ + "adox %%r9, %%r10 ;" + "movq %%r10, 8(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */ + "adox %%r11, %%r15 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ + "adox %%r13, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 8(%0), %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 16(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 16(%0), %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 24(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 24(%0), %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 32(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq %%r14, 48(%0) ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + "movq %%rax, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15"); +} + +static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "xorl %%ecx, %%ecx ;" + "movq (%2), %%r8 ;" + "adcx (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcx 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcx 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcx 24(%1), %%r11 ;" + "cmovc %%eax, %%ecx ;" + "xorl %%eax, %%eax ;" + "adcx %%rcx, %%r8 ;" + "adcx %%rax, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rax, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rax, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $38, %%ecx ;" + "cmovc %%ecx, %%eax ;" + "addq %%rax, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%2), %%r8 ;" + "addq (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcq 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcq 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcq 24(%1), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%1), %%r8 ;" + "subq (%2), %%r8 ;" + "movq 8(%1), %%r9 ;" + "sbbq 8(%2), %%r9 ;" + "movq 16(%1), %%r10 ;" + "sbbq 16(%2), %%r10 ;" + "movq 24(%1), %%r11 ;" + "sbbq 24(%2), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "sbbq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "sbbq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "sbbq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */ +static __always_inline void +mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a) +{ + const u64 a24 = 121666; + asm volatile( + "movq %2, %%rdx ;" + "mulx (%1), %%r8, %%r10 ;" + "mulx 8(%1), %%r9, %%r11 ;" + "addq %%r10, %%r9 ;" + "mulx 16(%1), %%r10, %%rax ;" + "adcq %%r11, %%r10 ;" + "mulx 24(%1), %%r11, %%rcx ;" + "adcq %%rax, %%r11 ;" + /**************************/ + "adcq $0, %%rcx ;" + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/ + "imul %%rdx, %%rcx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(a24) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[4]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_adx(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 2); + mul_eltfp25519_1w_adx(T[0], a, T[2]); + mul_eltfp25519_1w_adx(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 1); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 10); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 20); + mul_eltfp25519_1w_adx(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 10); + mul_eltfp25519_1w_adx(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_adx(T[0], 50); + mul_eltfp25519_1w_adx(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 100); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 50); + mul_eltfp25519_1w_adx(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[5]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_bmi2(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 2); + mul_eltfp25519_1w_bmi2(T[0], a, T[2]); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 1); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 10); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 20); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 10); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_bmi2(T[0], 50); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 100); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 50); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +/* Given c, a 256-bit number, fred_eltfp25519_1w updates c + * with a number such that 0 <= C < 2**255-19. + */ +static __always_inline void fred_eltfp25519_1w(u64 *const c) +{ + u64 tmp0 = 38, tmp1 = 19; + asm volatile( + "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */ + "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */ + + /* Add either 19 or 38 to c */ + "addq %4, %0 ;" + "adcq $0, %1 ;" + "adcq $0, %2 ;" + "adcq $0, %3 ;" + + /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */ + "movl $0, %k4 ;" + "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */ + "btrq $63, %3 ;" /* Clear bit 255 */ + + /* Subtract 19 if necessary */ + "subq %4, %0 ;" + "sbbq $0, %1 ;" + "sbbq $0, %2 ;" + "sbbq $0, %3 ;" + + : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0), + "+r"(tmp1) + : + : "memory", "cc"); +} + +static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py) +{ + u64 temp; + asm volatile( + "test %9, %9 ;" + "movq %0, %8 ;" + "cmovnzq %4, %0 ;" + "cmovnzq %8, %4 ;" + "movq %1, %8 ;" + "cmovnzq %5, %1 ;" + "cmovnzq %8, %5 ;" + "movq %2, %8 ;" + "cmovnzq %6, %2 ;" + "cmovnzq %8, %6 ;" + "movq %3, %8 ;" + "cmovnzq %7, %3 ;" + "cmovnzq %8, %7 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]), + "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]), + "=r"(temp) + : "r"(bit) + : "cc" + ); +} + +static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py) +{ + asm volatile( + "test %4, %4 ;" + "cmovnzq %5, %0 ;" + "cmovnzq %6, %1 ;" + "cmovnzq %7, %2 ;" + "cmovnzq %8, %3 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]) + : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3]) + : "cc" + ); +} + +static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_adx(A, Qz); + mul_eltfp25519_1w_adx((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_adx(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_adx(A, Zr1); + mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_bmi2(A, Qz); + mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_bmi2(A, Zr1); + mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx(mypublic, secret, basepoint); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); +} +EXPORT_SYMBOL(curve25519_arch); + +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx_base(pub, secret); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2_base(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); +} +EXPORT_SYMBOL(curve25519_base_arch); + +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) +{ + u8 *secret = kpp_tfm_ctx(tfm); + + if (!len) + curve25519_generate_secret(secret); + else if (len == CURVE25519_KEY_SIZE && + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) + memcpy(secret, buf, CURVE25519_KEY_SIZE); + else + return -EINVAL; + return 0; +} + +static int curve25519_generate_public_key(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (req->src) + return -EINVAL; + + curve25519_base_arch(buf, secret); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static int curve25519_compute_shared_secret(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 public_key[CURVE25519_KEY_SIZE]; + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (!req->src) + return -EINVAL; + + copied = sg_copy_to_buffer(req->src, + sg_nents_for_len(req->src, + CURVE25519_KEY_SIZE), + public_key, CURVE25519_KEY_SIZE); + if (copied != CURVE25519_KEY_SIZE) + return -EINVAL; + + curve25519_arch(buf, secret, public_key); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) +{ + return CURVE25519_KEY_SIZE; +} + +static struct kpp_alg curve25519_alg = { + .base.cra_name = "curve25519", + .base.cra_driver_name = "curve25519-x86", + .base.cra_priority = 200, + .base.cra_module = THIS_MODULE, + .base.cra_ctxsize = CURVE25519_KEY_SIZE, + + .set_secret = curve25519_set_secret, + .generate_public_key = curve25519_generate_public_key, + .compute_shared_secret = curve25519_compute_shared_secret, + .max_size = curve25519_max_size, +}; + +static int __init curve25519_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2)) + static_branch_enable(&curve25519_use_bmi2); + else if (boot_cpu_has(X86_FEATURE_ADX)) + static_branch_enable(&curve25519_use_adx); + else + return 0; + return crypto_register_kpp(&curve25519_alg); +} + +static void __exit curve25519_mod_exit(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2) || + boot_cpu_has(X86_FEATURE_ADX)) + crypto_unregister_kpp(&curve25519_alg); +} + +module_init(curve25519_mod_init); +module_exit(curve25519_mod_exit); + +MODULE_ALIAS_CRYPTO("curve25519"); +MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index 7fca43099a5f..fac0fdc3f25d 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -162,7 +162,7 @@ movl left##d, (io); \ movl right##d, 4(io); -ENTRY(des3_ede_x86_64_crypt_blk) +SYM_FUNC_START(des3_ede_x86_64_crypt_blk) /* input: * %rdi: round keys, CTX * %rsi: dst @@ -244,7 +244,7 @@ ENTRY(des3_ede_x86_64_crypt_blk) popq %rbx; ret; -ENDPROC(des3_ede_x86_64_crypt_blk) +SYM_FUNC_END(des3_ede_x86_64_crypt_blk) /*********************************************************************** * 3-way 3DES @@ -418,7 +418,7 @@ ENDPROC(des3_ede_x86_64_crypt_blk) #define __movq(src, dst) \ movq src, dst; -ENTRY(des3_ede_x86_64_crypt_blk_3way) +SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way) /* input: * %rdi: ctx, round keys * %rsi: dst (3 blocks) @@ -529,7 +529,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) popq %rbx; ret; -ENDPROC(des3_ede_x86_64_crypt_blk_3way) +SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way) .section .rodata, "a", @progbits .align 16 diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 5d53effe8abe..bb9735fbb865 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -44,7 +44,7 @@ * T2 * T3 */ -__clmul_gf128mul_ble: +SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) movaps DATA, T1 pshufd $0b01001110, DATA, T2 pshufd $0b01001110, SHASH, T3 @@ -87,10 +87,10 @@ __clmul_gf128mul_ble: pxor T2, T1 pxor T1, DATA ret -ENDPROC(__clmul_gf128mul_ble) +SYM_FUNC_END(__clmul_gf128mul_ble) /* void clmul_ghash_mul(char *dst, const u128 *shash) */ -ENTRY(clmul_ghash_mul) +SYM_FUNC_START(clmul_ghash_mul) FRAME_BEGIN movups (%rdi), DATA movups (%rsi), SHASH @@ -101,13 +101,13 @@ ENTRY(clmul_ghash_mul) movups DATA, (%rdi) FRAME_END ret -ENDPROC(clmul_ghash_mul) +SYM_FUNC_END(clmul_ghash_mul) /* * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, * const u128 *shash); */ -ENTRY(clmul_ghash_update) +SYM_FUNC_START(clmul_ghash_update) FRAME_BEGIN cmp $16, %rdx jb .Lupdate_just_ret # check length @@ -130,4 +130,4 @@ ENTRY(clmul_ghash_update) .Lupdate_just_ret: FRAME_END ret -ENDPROC(clmul_ghash_update) +SYM_FUNC_END(clmul_ghash_update) diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index f7946ea1b704..b22c7b936272 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -69,7 +69,7 @@ * * It's guaranteed that message_len % 16 == 0. */ -ENTRY(nh_avx2) +SYM_FUNC_START(nh_avx2) vmovdqu 0x00(KEY), K0 vmovdqu 0x10(KEY), K1 @@ -154,4 +154,4 @@ ENTRY(nh_avx2) vpaddq T4, T0, T0 vmovdqu T0, (HASH) ret -ENDPROC(nh_avx2) +SYM_FUNC_END(nh_avx2) diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S index 51f52d4ab4bb..d7ae22dd6683 100644 --- a/arch/x86/crypto/nh-sse2-x86_64.S +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -71,7 +71,7 @@ * * It's guaranteed that message_len % 16 == 0. */ -ENTRY(nh_sse2) +SYM_FUNC_START(nh_sse2) movdqu 0x00(KEY), K0 movdqu 0x10(KEY), K1 @@ -120,4 +120,4 @@ ENTRY(nh_sse2) movdqu T0, 0x00(HASH) movdqu T1, 0x10(HASH) ret -ENDPROC(nh_sse2) +SYM_FUNC_END(nh_sse2) diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index 8b341bc29d41..d6063feda9da 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -79,7 +79,7 @@ ORMASK: .octa 0x00000000010000000000000001000000 #define d3 %r12 #define d4 %r13 -ENTRY(poly1305_4block_avx2) +SYM_FUNC_START(poly1305_4block_avx2) # %rdi: Accumulator h[5] # %rsi: 64 byte input block m # %rdx: Poly1305 key r[5] @@ -387,4 +387,4 @@ ENTRY(poly1305_4block_avx2) pop %r12 pop %rbx ret -ENDPROC(poly1305_4block_avx2) +SYM_FUNC_END(poly1305_4block_avx2) diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index 5578f846e622..d8ea29b96640 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -46,7 +46,7 @@ ORMASK: .octa 0x00000000010000000000000001000000 #define d3 %r11 #define d4 %r12 -ENTRY(poly1305_block_sse2) +SYM_FUNC_START(poly1305_block_sse2) # %rdi: Accumulator h[5] # %rsi: 16 byte input block m # %rdx: Poly1305 key r[5] @@ -276,7 +276,7 @@ ENTRY(poly1305_block_sse2) pop %r12 pop %rbx ret -ENDPROC(poly1305_block_sse2) +SYM_FUNC_END(poly1305_block_sse2) #define u0 0x00(%r8) @@ -301,7 +301,7 @@ ENDPROC(poly1305_block_sse2) #undef d0 #define d0 %r13 -ENTRY(poly1305_2block_sse2) +SYM_FUNC_START(poly1305_2block_sse2) # %rdi: Accumulator h[5] # %rsi: 16 byte input block m # %rdx: Poly1305 key r[5] @@ -587,4 +587,4 @@ ENTRY(poly1305_2block_sse2) pop %r12 pop %rbx ret -ENDPROC(poly1305_2block_sse2) +SYM_FUNC_END(poly1305_2block_sse2) diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4a1c05dce950..370cd88068ec 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -7,47 +7,23 @@ #include <crypto/algapi.h> #include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> #include <crypto/internal/simd.h> -#include <crypto/poly1305.h> #include <linux/crypto.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <asm/simd.h> -struct poly1305_simd_desc_ctx { - struct poly1305_desc_ctx base; - /* derived key u set? */ - bool uset; -#ifdef CONFIG_AS_AVX2 - /* derived keys r^3, r^4 set? */ - bool wset; -#endif - /* derived Poly1305 key r^2 */ - u32 u[5]; - /* ... silently appended r^3 and r^4 when using AVX2 */ -}; - asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks); asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); -#ifdef CONFIG_AS_AVX2 asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); -static bool poly1305_use_avx2; -#endif - -static int poly1305_simd_init(struct shash_desc *desc) -{ - struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); - sctx->uset = false; -#ifdef CONFIG_AS_AVX2 - sctx->wset = false; -#endif - - return crypto_poly1305_init(desc); -} +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); static void poly1305_simd_mult(u32 *a, const u32 *b) { @@ -60,72 +36,85 @@ static void poly1305_simd_mult(u32 *a, const u32 *b) poly1305_block_sse2(a, m, b, 1); } +static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + unsigned int datalen; + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + poly1305_core_blocks(&dctx->h, dctx->r, src, + srclen / POLY1305_BLOCK_SIZE, 1); + srclen %= POLY1305_BLOCK_SIZE; + } + return srclen; +} + static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { - struct poly1305_simd_desc_ctx *sctx; unsigned int blocks, datalen; - BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); - sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); - if (unlikely(!dctx->sset)) { datalen = crypto_poly1305_setdesckey(dctx, src, srclen); src += srclen - datalen; srclen = datalen; } -#ifdef CONFIG_AS_AVX2 - if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(!sctx->wset)) { - if (!sctx->uset) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; + if (IS_ENABLED(CONFIG_AS_AVX2) && + static_branch_likely(&poly1305_use_avx2) && + srclen >= POLY1305_BLOCK_SIZE * 4) { + if (unlikely(dctx->rset < 4)) { + if (dctx->rset < 2) { + dctx->r[1] = dctx->r[0]; + poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); } - memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 5, dctx->r.r); - memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 10, dctx->r.r); - sctx->wset = true; + dctx->r[2] = dctx->r[1]; + poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); + dctx->r[3] = dctx->r[2]; + poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); + dctx->rset = 4; } blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); + poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, + dctx->r[1].r); src += POLY1305_BLOCK_SIZE * 4 * blocks; srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; } -#endif + if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(!sctx->uset)) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; + if (unlikely(dctx->rset < 2)) { + dctx->r[1] = dctx->r[0]; + poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); + dctx->rset = 2; } blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); + poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, + blocks, dctx->r[1].r); src += POLY1305_BLOCK_SIZE * 2 * blocks; srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; } if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1); + poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); srclen -= POLY1305_BLOCK_SIZE; } return srclen; } -static int poly1305_simd_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) { - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - unsigned int bytes; - - /* kernel_fpu_begin/end is costly, use fallback for small updates */ - if (srclen <= 288 || !crypto_simd_usable()) - return crypto_poly1305_update(desc, src, srclen); + poly1305_init_generic(desc, key); +} +EXPORT_SYMBOL(poly1305_init_arch); - kernel_fpu_begin(); +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int srclen) +{ + unsigned int bytes; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); @@ -135,34 +124,84 @@ static int poly1305_simd_update(struct shash_desc *desc, dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { - poly1305_simd_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); + if (static_branch_likely(&poly1305_use_simd) && + likely(crypto_simd_usable())) { + kernel_fpu_begin(); + poly1305_simd_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + kernel_fpu_end(); + } else { + poly1305_scalar_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + } dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = poly1305_simd_blocks(dctx, src, srclen); + if (static_branch_likely(&poly1305_use_simd) && + likely(crypto_simd_usable())) { + kernel_fpu_begin(); + bytes = poly1305_simd_blocks(dctx, src, srclen); + kernel_fpu_end(); + } else { + bytes = poly1305_scalar_blocks(dctx, src, srclen); + } src += srclen - bytes; srclen = bytes; } - kernel_fpu_end(); - if (unlikely(srclen)) { dctx->buflen = srclen; memcpy(dctx->buf, src, srclen); } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest) +{ + poly1305_final_generic(desc, digest); +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int crypto_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + poly1305_core_init(&dctx->h); + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_generic(dctx, dst); + return 0; +} + +static int poly1305_simd_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + poly1305_update_arch(dctx, src, srclen); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, - .init = poly1305_simd_init, + .init = crypto_poly1305_init, .update = poly1305_simd_update, .final = crypto_poly1305_final, - .descsize = sizeof(struct poly1305_simd_desc_ctx), + .descsize = sizeof(struct poly1305_desc_ctx), .base = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", @@ -175,16 +214,16 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); - alg.descsize = sizeof(struct poly1305_simd_desc_ctx); - if (poly1305_use_avx2) - alg.descsize += 10 * sizeof(u32); -#endif + return 0; + + static_branch_enable(&poly1305_use_simd); + + if (IS_ENABLED(CONFIG_AS_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + return crypto_register_shash(&alg); } diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index ddc51dbba3af..ba9e4c1e7f5c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -555,7 +555,7 @@ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) .align 8 -__serpent_enc_blk8_avx: +SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks @@ -606,10 +606,10 @@ __serpent_enc_blk8_avx: write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_enc_blk8_avx) +SYM_FUNC_END(__serpent_enc_blk8_avx) .align 8 -__serpent_dec_blk8_avx: +SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks @@ -660,9 +660,9 @@ __serpent_dec_blk8_avx: write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_dec_blk8_avx) +SYM_FUNC_END(__serpent_dec_blk8_avx) -ENTRY(serpent_ecb_enc_8way_avx) +SYM_FUNC_START(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -678,9 +678,9 @@ ENTRY(serpent_ecb_enc_8way_avx) FRAME_END ret; -ENDPROC(serpent_ecb_enc_8way_avx) +SYM_FUNC_END(serpent_ecb_enc_8way_avx) -ENTRY(serpent_ecb_dec_8way_avx) +SYM_FUNC_START(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -696,9 +696,9 @@ ENTRY(serpent_ecb_dec_8way_avx) FRAME_END ret; -ENDPROC(serpent_ecb_dec_8way_avx) +SYM_FUNC_END(serpent_ecb_dec_8way_avx) -ENTRY(serpent_cbc_dec_8way_avx) +SYM_FUNC_START(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -714,9 +714,9 @@ ENTRY(serpent_cbc_dec_8way_avx) FRAME_END ret; -ENDPROC(serpent_cbc_dec_8way_avx) +SYM_FUNC_END(serpent_cbc_dec_8way_avx) -ENTRY(serpent_ctr_8way_avx) +SYM_FUNC_START(serpent_ctr_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -734,9 +734,9 @@ ENTRY(serpent_ctr_8way_avx) FRAME_END ret; -ENDPROC(serpent_ctr_8way_avx) +SYM_FUNC_END(serpent_ctr_8way_avx) -ENTRY(serpent_xts_enc_8way_avx) +SYM_FUNC_START(serpent_xts_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -756,9 +756,9 @@ ENTRY(serpent_xts_enc_8way_avx) FRAME_END ret; -ENDPROC(serpent_xts_enc_8way_avx) +SYM_FUNC_END(serpent_xts_enc_8way_avx) -ENTRY(serpent_xts_dec_8way_avx) +SYM_FUNC_START(serpent_xts_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -778,4 +778,4 @@ ENTRY(serpent_xts_dec_8way_avx) FRAME_END ret; -ENDPROC(serpent_xts_dec_8way_avx) +SYM_FUNC_END(serpent_xts_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 37bc1d48106c..c9648aeae705 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -561,7 +561,7 @@ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) .align 8 -__serpent_enc_blk16: +SYM_FUNC_START_LOCAL(__serpent_enc_blk16) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext @@ -612,10 +612,10 @@ __serpent_enc_blk16: write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_enc_blk16) +SYM_FUNC_END(__serpent_enc_blk16) .align 8 -__serpent_dec_blk16: +SYM_FUNC_START_LOCAL(__serpent_dec_blk16) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext @@ -666,9 +666,9 @@ __serpent_dec_blk16: write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_dec_blk16) +SYM_FUNC_END(__serpent_dec_blk16) -ENTRY(serpent_ecb_enc_16way) +SYM_FUNC_START(serpent_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -688,9 +688,9 @@ ENTRY(serpent_ecb_enc_16way) FRAME_END ret; -ENDPROC(serpent_ecb_enc_16way) +SYM_FUNC_END(serpent_ecb_enc_16way) -ENTRY(serpent_ecb_dec_16way) +SYM_FUNC_START(serpent_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -710,9 +710,9 @@ ENTRY(serpent_ecb_dec_16way) FRAME_END ret; -ENDPROC(serpent_ecb_dec_16way) +SYM_FUNC_END(serpent_ecb_dec_16way) -ENTRY(serpent_cbc_dec_16way) +SYM_FUNC_START(serpent_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -733,9 +733,9 @@ ENTRY(serpent_cbc_dec_16way) FRAME_END ret; -ENDPROC(serpent_cbc_dec_16way) +SYM_FUNC_END(serpent_cbc_dec_16way) -ENTRY(serpent_ctr_16way) +SYM_FUNC_START(serpent_ctr_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -758,9 +758,9 @@ ENTRY(serpent_ctr_16way) FRAME_END ret; -ENDPROC(serpent_ctr_16way) +SYM_FUNC_END(serpent_ctr_16way) -ENTRY(serpent_xts_enc_16way) +SYM_FUNC_START(serpent_xts_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -784,9 +784,9 @@ ENTRY(serpent_xts_enc_16way) FRAME_END ret; -ENDPROC(serpent_xts_enc_16way) +SYM_FUNC_END(serpent_xts_enc_16way) -ENTRY(serpent_xts_dec_16way) +SYM_FUNC_START(serpent_xts_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -810,4 +810,4 @@ ENTRY(serpent_xts_dec_16way) FRAME_END ret; -ENDPROC(serpent_xts_dec_16way) +SYM_FUNC_END(serpent_xts_dec_16way) diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S index e5c4a4690ca9..6379b99cb722 100644 --- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -497,7 +497,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -ENTRY(__serpent_enc_blk_4way) +SYM_FUNC_START(__serpent_enc_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -559,9 +559,9 @@ ENTRY(__serpent_enc_blk_4way) xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); ret; -ENDPROC(__serpent_enc_blk_4way) +SYM_FUNC_END(__serpent_enc_blk_4way) -ENTRY(serpent_dec_blk_4way) +SYM_FUNC_START(serpent_dec_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -613,4 +613,4 @@ ENTRY(serpent_dec_blk_4way) write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); ret; -ENDPROC(serpent_dec_blk_4way) +SYM_FUNC_END(serpent_dec_blk_4way) diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index 5e0b3a3e97af..efb6dc17dc90 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -619,7 +619,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -ENTRY(__serpent_enc_blk_8way) +SYM_FUNC_START(__serpent_enc_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -682,9 +682,9 @@ ENTRY(__serpent_enc_blk_8way) xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_enc_blk_8way) +SYM_FUNC_END(__serpent_enc_blk_8way) -ENTRY(serpent_dec_blk_8way) +SYM_FUNC_START(serpent_dec_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -736,4 +736,4 @@ ENTRY(serpent_dec_blk_8way) write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; -ENDPROC(serpent_dec_blk_8way) +SYM_FUNC_END(serpent_dec_blk_8way) diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 9f712a7dfd79..6decc85ef7b7 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -634,7 +634,7 @@ _loop3: * param: function's name */ .macro SHA1_VECTOR_ASM name - ENTRY(\name) + SYM_FUNC_START(\name) push %rbx push %r12 @@ -676,7 +676,7 @@ _loop3: ret - ENDPROC(\name) + SYM_FUNC_END(\name) .endm .section .rodata diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index ebbdba72ae07..11efe3a45a1f 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -95,7 +95,7 @@ */ .text .align 32 -ENTRY(sha1_ni_transform) +SYM_FUNC_START(sha1_ni_transform) mov %rsp, RSPSAVE sub $FRAME_SIZE, %rsp and $~0xF, %rsp @@ -291,7 +291,7 @@ ENTRY(sha1_ni_transform) mov RSPSAVE, %rsp ret -ENDPROC(sha1_ni_transform) +SYM_FUNC_END(sha1_ni_transform) .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 .align 16 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 99c5b8c4dc38..5d03c1173690 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -67,7 +67,7 @@ * param: function's name */ .macro SHA1_VECTOR_ASM name - ENTRY(\name) + SYM_FUNC_START(\name) push %rbx push %r12 @@ -101,7 +101,7 @@ pop %rbx ret - ENDPROC(\name) + SYM_FUNC_END(\name) .endm /* diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 001bbcf93c79..22e14c8dd2e4 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -347,7 +347,7 @@ a = TMP_ ## arg 3 : Num blocks ######################################################################## .text -ENTRY(sha256_transform_avx) +SYM_FUNC_START(sha256_transform_avx) .align 32 pushq %rbx pushq %r12 @@ -460,7 +460,7 @@ done_hash: popq %r12 popq %rbx ret -ENDPROC(sha256_transform_avx) +SYM_FUNC_END(sha256_transform_avx) .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 1420db15dcdd..519b551ad576 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -526,7 +526,7 @@ STACK_SIZE = _RSP + _RSP_SIZE ## arg 3 : Num blocks ######################################################################## .text -ENTRY(sha256_transform_rorx) +SYM_FUNC_START(sha256_transform_rorx) .align 32 pushq %rbx pushq %r12 @@ -713,7 +713,7 @@ done_hash: popq %r12 popq %rbx ret -ENDPROC(sha256_transform_rorx) +SYM_FUNC_END(sha256_transform_rorx) .section .rodata.cst512.K256, "aM", @progbits, 512 .align 64 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index c6c05ed2c16a..69cc2f91dc4c 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -353,7 +353,7 @@ a = TMP_ ## arg 3 : Num blocks ######################################################################## .text -ENTRY(sha256_transform_ssse3) +SYM_FUNC_START(sha256_transform_ssse3) .align 32 pushq %rbx pushq %r12 @@ -471,7 +471,7 @@ done_hash: popq %rbx ret -ENDPROC(sha256_transform_ssse3) +SYM_FUNC_END(sha256_transform_ssse3) .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index fb58f58ecfbc..7abade04a3a3 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -97,7 +97,7 @@ .text .align 32 -ENTRY(sha256_ni_transform) +SYM_FUNC_START(sha256_ni_transform) shl $6, NUM_BLKS /* convert to bytes */ jz .Ldone_hash @@ -327,7 +327,7 @@ ENTRY(sha256_ni_transform) .Ldone_hash: ret -ENDPROC(sha256_ni_transform) +SYM_FUNC_END(sha256_ni_transform) .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 39235fefe6f7..3704ddd7e5d5 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -277,7 +277,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # message blocks. # L is the message length in SHA512 blocks ######################################################################## -ENTRY(sha512_transform_avx) +SYM_FUNC_START(sha512_transform_avx) cmp $0, msglen je nowork @@ -365,7 +365,7 @@ updateblock: nowork: ret -ENDPROC(sha512_transform_avx) +SYM_FUNC_END(sha512_transform_avx) ######################################################################## ### Binary Data diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index b16d56005162..80d830e7ee09 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -569,7 +569,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # message blocks. # L is the message length in SHA512 blocks ######################################################################## -ENTRY(sha512_transform_rorx) +SYM_FUNC_START(sha512_transform_rorx) # Allocate Stack Space mov %rsp, %rax sub $frame_size, %rsp @@ -682,7 +682,7 @@ done_hash: # Restore Stack Pointer mov frame_RSPSAVE(%rsp), %rsp ret -ENDPROC(sha512_transform_rorx) +SYM_FUNC_END(sha512_transform_rorx) ######################################################################## ### Binary Data diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 66bbd9058a90..838f984e95d9 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -275,7 +275,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # message blocks. # L is the message length in SHA512 blocks. ######################################################################## -ENTRY(sha512_transform_ssse3) +SYM_FUNC_START(sha512_transform_ssse3) cmp $0, msglen je nowork @@ -364,7 +364,7 @@ updateblock: nowork: ret -ENDPROC(sha512_transform_ssse3) +SYM_FUNC_END(sha512_transform_ssse3) ######################################################################## ### Binary Data diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 698b8f2a56e2..a5151393bb2f 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -234,7 +234,7 @@ vpxor x3, wkey, x3; .align 8 -__twofish_enc_blk8: +SYM_FUNC_START_LOCAL(__twofish_enc_blk8) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks @@ -273,10 +273,10 @@ __twofish_enc_blk8: outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); ret; -ENDPROC(__twofish_enc_blk8) +SYM_FUNC_END(__twofish_enc_blk8) .align 8 -__twofish_dec_blk8: +SYM_FUNC_START_LOCAL(__twofish_dec_blk8) /* input: * %rdi: ctx, CTX * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks @@ -313,9 +313,9 @@ __twofish_dec_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); ret; -ENDPROC(__twofish_dec_blk8) +SYM_FUNC_END(__twofish_dec_blk8) -ENTRY(twofish_ecb_enc_8way) +SYM_FUNC_START(twofish_ecb_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -333,9 +333,9 @@ ENTRY(twofish_ecb_enc_8way) FRAME_END ret; -ENDPROC(twofish_ecb_enc_8way) +SYM_FUNC_END(twofish_ecb_enc_8way) -ENTRY(twofish_ecb_dec_8way) +SYM_FUNC_START(twofish_ecb_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -353,9 +353,9 @@ ENTRY(twofish_ecb_dec_8way) FRAME_END ret; -ENDPROC(twofish_ecb_dec_8way) +SYM_FUNC_END(twofish_ecb_dec_8way) -ENTRY(twofish_cbc_dec_8way) +SYM_FUNC_START(twofish_cbc_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -378,9 +378,9 @@ ENTRY(twofish_cbc_dec_8way) FRAME_END ret; -ENDPROC(twofish_cbc_dec_8way) +SYM_FUNC_END(twofish_cbc_dec_8way) -ENTRY(twofish_ctr_8way) +SYM_FUNC_START(twofish_ctr_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -405,9 +405,9 @@ ENTRY(twofish_ctr_8way) FRAME_END ret; -ENDPROC(twofish_ctr_8way) +SYM_FUNC_END(twofish_ctr_8way) -ENTRY(twofish_xts_enc_8way) +SYM_FUNC_START(twofish_xts_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -429,9 +429,9 @@ ENTRY(twofish_xts_enc_8way) FRAME_END ret; -ENDPROC(twofish_xts_enc_8way) +SYM_FUNC_END(twofish_xts_enc_8way) -ENTRY(twofish_xts_dec_8way) +SYM_FUNC_START(twofish_xts_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -453,4 +453,4 @@ ENTRY(twofish_xts_dec_8way) FRAME_END ret; -ENDPROC(twofish_xts_dec_8way) +SYM_FUNC_END(twofish_xts_dec_8way) diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 290cc4e9a6fe..a6f09e4f2e46 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -207,7 +207,7 @@ xor %esi, d ## D;\ ror $1, d ## D; -ENTRY(twofish_enc_blk) +SYM_FUNC_START(twofish_enc_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -261,9 +261,9 @@ ENTRY(twofish_enc_blk) pop %ebp mov $1, %eax ret -ENDPROC(twofish_enc_blk) +SYM_FUNC_END(twofish_enc_blk) -ENTRY(twofish_dec_blk) +SYM_FUNC_START(twofish_dec_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -318,4 +318,4 @@ ENTRY(twofish_dec_blk) pop %ebp mov $1, %eax ret -ENDPROC(twofish_dec_blk) +SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index e495e07c7f1b..fc23552afe37 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -220,7 +220,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -ENTRY(__twofish_enc_blk_3way) +SYM_FUNC_START(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -267,9 +267,9 @@ ENTRY(__twofish_enc_blk_3way) popq %r12; popq %r13; ret; -ENDPROC(__twofish_enc_blk_3way) +SYM_FUNC_END(__twofish_enc_blk_3way) -ENTRY(twofish_dec_blk_3way) +SYM_FUNC_START(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -302,4 +302,4 @@ ENTRY(twofish_dec_blk_3way) popq %r12; popq %r13; ret; -ENDPROC(twofish_dec_blk_3way) +SYM_FUNC_END(twofish_dec_blk_3way) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index ecef2cb9f43f..d2e56232494a 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -202,7 +202,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -ENTRY(twofish_enc_blk) +SYM_FUNC_START(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -253,9 +253,9 @@ ENTRY(twofish_enc_blk) popq R1 movl $1,%eax ret -ENDPROC(twofish_enc_blk) +SYM_FUNC_END(twofish_enc_blk) -ENTRY(twofish_dec_blk) +SYM_FUNC_START(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ @@ -305,4 +305,4 @@ ENTRY(twofish_dec_blk) popq R1 movl $1,%eax ret -ENDPROC(twofish_dec_blk) +SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 515c0ceeb4a3..0789e13ece90 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -354,7 +354,7 @@ For 32-bit we have the following conventions - kernel is built with .macro CALL_enter_from_user_mode #ifdef CONFIG_CONTEXT_TRACKING #ifdef CONFIG_JUMP_LABEL - STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 + STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0 #endif call enter_from_user_mode .Lafter_call_\@: diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 3f8e22615812..9747876980b5 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -33,6 +33,7 @@ #include <asm/cpufeature.h> #include <asm/fpu/api.h> #include <asm/nospec-branch.h> +#include <asm/io_bitmap.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> @@ -196,6 +197,9 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) /* Reload ti->flags; we may have rescheduled above. */ cached_flags = READ_ONCE(ti->flags); + if (unlikely(cached_flags & _TIF_IO_BITMAP)) + tss_update_io_bitmap(); + fpregs_assert_state_consistent(); if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) switch_fpu_return(); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index f83ca5aa8b77..5832b11f01bb 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -172,7 +172,7 @@ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI .if \no_user_check == 0 /* coming from usermode? */ - testl $SEGMENT_RPL_MASK, PT_CS(%esp) + testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp) jz .Lend_\@ .endif /* On user-cr3? */ @@ -205,64 +205,76 @@ #define CS_FROM_ENTRY_STACK (1 << 31) #define CS_FROM_USER_CR3 (1 << 30) #define CS_FROM_KERNEL (1 << 29) +#define CS_FROM_ESPFIX (1 << 28) .macro FIXUP_FRAME /* * The high bits of the CS dword (__csh) are used for CS_FROM_*. * Clear them in case hardware didn't do this for us. */ - andl $0x0000ffff, 3*4(%esp) + andl $0x0000ffff, 4*4(%esp) #ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, 4*4(%esp) + testl $X86_EFLAGS_VM, 5*4(%esp) jnz .Lfrom_usermode_no_fixup_\@ #endif - testl $SEGMENT_RPL_MASK, 3*4(%esp) + testl $USER_SEGMENT_RPL_MASK, 4*4(%esp) jnz .Lfrom_usermode_no_fixup_\@ - orl $CS_FROM_KERNEL, 3*4(%esp) + orl $CS_FROM_KERNEL, 4*4(%esp) /* * When we're here from kernel mode; the (exception) stack looks like: * - * 5*4(%esp) - <previous context> - * 4*4(%esp) - flags - * 3*4(%esp) - cs - * 2*4(%esp) - ip - * 1*4(%esp) - orig_eax - * 0*4(%esp) - gs / function + * 6*4(%esp) - <previous context> + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs * * Lets build a 5 entry IRET frame after that, such that struct pt_regs * is complete and in particular regs->sp is correct. This gives us - * the original 5 enties as gap: + * the original 6 enties as gap: * - * 12*4(%esp) - <previous context> - * 11*4(%esp) - gap / flags - * 10*4(%esp) - gap / cs - * 9*4(%esp) - gap / ip - * 8*4(%esp) - gap / orig_eax - * 7*4(%esp) - gap / gs / function - * 6*4(%esp) - ss - * 5*4(%esp) - sp - * 4*4(%esp) - flags - * 3*4(%esp) - cs - * 2*4(%esp) - ip - * 1*4(%esp) - orig_eax - * 0*4(%esp) - gs / function + * 14*4(%esp) - <previous context> + * 13*4(%esp) - gap / flags + * 12*4(%esp) - gap / cs + * 11*4(%esp) - gap / ip + * 10*4(%esp) - gap / orig_eax + * 9*4(%esp) - gap / gs / function + * 8*4(%esp) - gap / fs + * 7*4(%esp) - ss + * 6*4(%esp) - sp + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs */ pushl %ss # ss pushl %esp # sp (points at ss) - addl $6*4, (%esp) # point sp back at the previous context - pushl 6*4(%esp) # flags - pushl 6*4(%esp) # cs - pushl 6*4(%esp) # ip - pushl 6*4(%esp) # orig_eax - pushl 6*4(%esp) # gs / function + addl $7*4, (%esp) # point sp back at the previous context + pushl 7*4(%esp) # flags + pushl 7*4(%esp) # cs + pushl 7*4(%esp) # ip + pushl 7*4(%esp) # orig_eax + pushl 7*4(%esp) # gs / function + pushl 7*4(%esp) # fs .Lfrom_usermode_no_fixup_\@: .endm .macro IRET_FRAME + /* + * We're called with %ds, %es, %fs, and %gs from the interrupted + * frame, so we shouldn't use them. Also, we may be in ESPFIX + * mode and therefore have a nonzero SS base and an offset ESP, + * so any attempt to access the stack needs to use SS. (except for + * accesses through %esp, which automatically use SS.) + */ testl $CS_FROM_KERNEL, 1*4(%esp) jz .Lfinished_frame_\@ @@ -276,31 +288,40 @@ movl 5*4(%esp), %eax # (modified) regs->sp movl 4*4(%esp), %ecx # flags - movl %ecx, -4(%eax) + movl %ecx, %ss:-1*4(%eax) movl 3*4(%esp), %ecx # cs andl $0x0000ffff, %ecx - movl %ecx, -8(%eax) + movl %ecx, %ss:-2*4(%eax) movl 2*4(%esp), %ecx # ip - movl %ecx, -12(%eax) + movl %ecx, %ss:-3*4(%eax) movl 1*4(%esp), %ecx # eax - movl %ecx, -16(%eax) + movl %ecx, %ss:-4*4(%eax) popl %ecx - lea -16(%eax), %esp + lea -4*4(%eax), %esp popl %eax .Lfinished_frame_\@: .endm -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0 cld .if \skip_gs == 0 PUSH_GS .endif - FIXUP_FRAME pushl %fs + + pushl %eax + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs +.if \unwind_espfix > 0 + UNWIND_ESPFIX_STACK +.endif + popl %eax + + FIXUP_FRAME pushl %es pushl %ds pushl \pt_regs_ax @@ -313,8 +334,6 @@ movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs .if \skip_gs == 0 SET_KERNEL_GS %edx .endif @@ -324,8 +343,8 @@ .endif .endm -.macro SAVE_ALL_NMI cr3_reg:req - SAVE_ALL +.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0 + SAVE_ALL unwind_espfix=\unwind_espfix BUG_IF_WRONG_CR3 @@ -357,6 +376,7 @@ 2: popl %es 3: popl %fs POP_GS \pop + IRET_FRAME .pushsection .fixup, "ax" 4: movl $0, (%esp) jmp 1b @@ -395,7 +415,8 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -709,7 +730,7 @@ * %eax: prev task * %edx: next task */ -ENTRY(__switch_to_asm) +SYM_CODE_START(__switch_to_asm) /* * Save callee-saved registers * This must match the order in struct inactive_task_frame @@ -718,6 +739,11 @@ ENTRY(__switch_to_asm) pushl %ebx pushl %edi pushl %esi + /* + * Flags are saved to prevent AC leakage. This could go + * away if objtool would have 32bit support to verify + * the STAC/CLAC correctness. + */ pushfl /* switch stack */ @@ -740,15 +766,16 @@ ENTRY(__switch_to_asm) FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif - /* restore callee-saved registers */ + /* Restore flags or the incoming task to restore AC state. */ popfl + /* restore callee-saved registers */ popl %esi popl %edi popl %ebx popl %ebp jmp __switch_to -END(__switch_to_asm) +SYM_CODE_END(__switch_to_asm) /* * The unwinder expects the last frame on the stack to always be at the same @@ -757,7 +784,7 @@ END(__switch_to_asm) * asmlinkage function so its argument has to be pushed on the stack. This * wrapper creates a proper "end of stack" frame header before the call. */ -ENTRY(schedule_tail_wrapper) +SYM_FUNC_START(schedule_tail_wrapper) FRAME_BEGIN pushl %eax @@ -766,7 +793,7 @@ ENTRY(schedule_tail_wrapper) FRAME_END ret -ENDPROC(schedule_tail_wrapper) +SYM_FUNC_END(schedule_tail_wrapper) /* * A newly forked process directly context switches into this address. * @@ -774,7 +801,7 @@ ENDPROC(schedule_tail_wrapper) * ebx: kernel thread func (NULL for user thread) * edi: kernel thread arg */ -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) call schedule_tail_wrapper testl %ebx, %ebx @@ -797,7 +824,7 @@ ENTRY(ret_from_fork) */ movl $0, PT_EAX(%esp) jmp 2b -END(ret_from_fork) +SYM_CODE_END(ret_from_fork) /* * Return to user mode is not as complex as all this looks, @@ -807,8 +834,7 @@ END(ret_from_fork) */ # userspace resumption stub bypassing syscall exit tracing - ALIGN -ret_from_exception: +SYM_CODE_START_LOCAL(ret_from_exception) preempt_stop(CLBR_ANY) ret_from_intr: #ifdef CONFIG_VM86 @@ -825,15 +851,14 @@ ret_from_intr: cmpl $USER_RPL, %eax jb restore_all_kernel # not returning to v8086 or userspace -ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl %esp, %eax call prepare_exit_to_usermode jmp restore_all -END(ret_from_exception) +SYM_CODE_END(ret_from_exception) -GLOBAL(__begin_SYSENTER_singlestep_region) +SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) /* * All code from here through __end_SYSENTER_singlestep_region is subject * to being single-stepped if a user program sets TF and executes SYSENTER. @@ -848,9 +873,10 @@ GLOBAL(__begin_SYSENTER_singlestep_region) * Xen doesn't set %esp to be precisely what the normal SYSENTER * entry point expects, so fix it up before using the normal path. */ -ENTRY(xen_sysenter_target) +SYM_CODE_START(xen_sysenter_target) addl $5*4, %esp /* remove xen-provided frame */ jmp .Lsysenter_past_esp +SYM_CODE_END(xen_sysenter_target) #endif /* @@ -885,7 +911,7 @@ ENTRY(xen_sysenter_target) * ebp user stack * 0(%ebp) arg6 */ -ENTRY(entry_SYSENTER_32) +SYM_FUNC_START(entry_SYSENTER_32) /* * On entry-stack with all userspace-regs live - save and * restore eflags and %eax to use it as scratch-reg for the cr3 @@ -1012,8 +1038,8 @@ ENTRY(entry_SYSENTER_32) pushl $X86_EFLAGS_FIXED popfl jmp .Lsysenter_flags_fixed -GLOBAL(__end_SYSENTER_singlestep_region) -ENDPROC(entry_SYSENTER_32) +SYM_ENTRY(__end_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) +SYM_FUNC_END(entry_SYSENTER_32) /* * 32-bit legacy system call entry. @@ -1043,7 +1069,7 @@ ENDPROC(entry_SYSENTER_32) * edi arg5 * ebp arg6 */ -ENTRY(entry_INT80_32) +SYM_FUNC_START(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ @@ -1075,7 +1101,6 @@ restore_all: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: - IRET_FRAME /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization * when returning from IPI handler and when returning from @@ -1100,7 +1125,7 @@ restore_all_kernel: jmp .Lirq_return .section .fixup, "ax" -ENTRY(iret_exc ) +SYM_CODE_START(iret_exc) pushl $0 # no error code pushl $do_iret_error @@ -1117,9 +1142,10 @@ ENTRY(iret_exc ) #endif jmp common_exception +SYM_CODE_END(iret_exc) .previous _ASM_EXTABLE(.Lirq_return, iret_exc) -ENDPROC(entry_INT80_32) +SYM_FUNC_END(entry_INT80_32) .macro FIXUP_ESPFIX_STACK /* @@ -1128,30 +1154,43 @@ ENDPROC(entry_INT80_32) * We can't call C functions using the ESPFIX stack. This code reads * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. + * + * We might be on user CR3 here, so percpu data is not mapped and we can't + * access the GDT through the percpu segment. Instead, use SGDT to find + * the cpu_entry_area alias of the GDT. */ #ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + pushl %ecx + subl $2*4, %esp + sgdt (%esp) + movl 2(%esp), %ecx /* GDT address */ + /* + * Careful: ECX is a linear pointer, so we need to force base + * zero. %cs is the only known-linear segment we have right now. + */ + mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */ + mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */ shl $16, %eax + addl $2*4, %esp + popl %ecx addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS pushl %eax lss (%esp), %esp /* switch to the normal stack segment */ #endif .endm + .macro UNWIND_ESPFIX_STACK + /* It's safe to clobber %eax, all other regs need to be preserved */ #ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es + jne .Lno_fixup_\@ /* switch to normal stack */ FIXUP_ESPFIX_STACK -27: +.Lno_fixup_\@: #endif .endm @@ -1160,7 +1199,7 @@ ENDPROC(entry_INT80_32) * We pack 1 stub into every 8-byte block. */ .align 8 -ENTRY(irq_entries_start) +SYM_CODE_START(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) pushl $(~vector+0x80) /* Note: always in signed byte range */ @@ -1168,11 +1207,11 @@ ENTRY(irq_entries_start) jmp common_interrupt .align 8 .endr -END(irq_entries_start) +SYM_CODE_END(irq_entries_start) #ifdef CONFIG_X86_LOCAL_APIC .align 8 -ENTRY(spurious_entries_start) +SYM_CODE_START(spurious_entries_start) vector=FIRST_SYSTEM_VECTOR .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) pushl $(~vector+0x80) /* Note: always in signed byte range */ @@ -1180,9 +1219,9 @@ ENTRY(spurious_entries_start) jmp common_spurious .align 8 .endr -END(spurious_entries_start) +SYM_CODE_END(spurious_entries_start) -common_spurious: +SYM_CODE_START_LOCAL(common_spurious) ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL switch_stacks=1 @@ -1191,7 +1230,7 @@ common_spurious: movl %esp, %eax call smp_spurious_interrupt jmp ret_from_intr -ENDPROC(common_spurious) +SYM_CODE_END(common_spurious) #endif /* @@ -1199,7 +1238,7 @@ ENDPROC(common_spurious) * so IRQ-flags tracing has to follow that: */ .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: +SYM_CODE_START_LOCAL(common_interrupt) ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ @@ -1209,10 +1248,10 @@ common_interrupt: movl %esp, %eax call do_IRQ jmp ret_from_intr -ENDPROC(common_interrupt) +SYM_CODE_END(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ +SYM_FUNC_START(name) \ ASM_CLAC; \ pushl $~(nr); \ SAVE_ALL switch_stacks=1; \ @@ -1221,7 +1260,7 @@ ENTRY(name) \ movl %esp, %eax; \ call fn; \ jmp ret_from_intr; \ -ENDPROC(name) +SYM_FUNC_END(name) #define BUILD_INTERRUPT(name, nr) \ BUILD_INTERRUPT3(name, nr, smp_##name); \ @@ -1229,14 +1268,14 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include <asm/entry_arch.h> -ENTRY(coprocessor_error) +SYM_CODE_START(coprocessor_error) ASM_CLAC pushl $0 pushl $do_coprocessor_error jmp common_exception -END(coprocessor_error) +SYM_CODE_END(coprocessor_error) -ENTRY(simd_coprocessor_error) +SYM_CODE_START(simd_coprocessor_error) ASM_CLAC pushl $0 #ifdef CONFIG_X86_INVD_BUG @@ -1248,104 +1287,99 @@ ENTRY(simd_coprocessor_error) pushl $do_simd_coprocessor_error #endif jmp common_exception -END(simd_coprocessor_error) +SYM_CODE_END(simd_coprocessor_error) -ENTRY(device_not_available) +SYM_CODE_START(device_not_available) ASM_CLAC pushl $-1 # mark this as an int pushl $do_device_not_available jmp common_exception -END(device_not_available) +SYM_CODE_END(device_not_available) #ifdef CONFIG_PARAVIRT -ENTRY(native_iret) +SYM_CODE_START(native_iret) iret _ASM_EXTABLE(native_iret, iret_exc) -END(native_iret) +SYM_CODE_END(native_iret) #endif -ENTRY(overflow) +SYM_CODE_START(overflow) ASM_CLAC pushl $0 pushl $do_overflow jmp common_exception -END(overflow) +SYM_CODE_END(overflow) -ENTRY(bounds) +SYM_CODE_START(bounds) ASM_CLAC pushl $0 pushl $do_bounds jmp common_exception -END(bounds) +SYM_CODE_END(bounds) -ENTRY(invalid_op) +SYM_CODE_START(invalid_op) ASM_CLAC pushl $0 pushl $do_invalid_op jmp common_exception -END(invalid_op) +SYM_CODE_END(invalid_op) -ENTRY(coprocessor_segment_overrun) +SYM_CODE_START(coprocessor_segment_overrun) ASM_CLAC pushl $0 pushl $do_coprocessor_segment_overrun jmp common_exception -END(coprocessor_segment_overrun) +SYM_CODE_END(coprocessor_segment_overrun) -ENTRY(invalid_TSS) +SYM_CODE_START(invalid_TSS) ASM_CLAC pushl $do_invalid_TSS jmp common_exception -END(invalid_TSS) +SYM_CODE_END(invalid_TSS) -ENTRY(segment_not_present) +SYM_CODE_START(segment_not_present) ASM_CLAC pushl $do_segment_not_present jmp common_exception -END(segment_not_present) +SYM_CODE_END(segment_not_present) -ENTRY(stack_segment) +SYM_CODE_START(stack_segment) ASM_CLAC pushl $do_stack_segment jmp common_exception -END(stack_segment) +SYM_CODE_END(stack_segment) -ENTRY(alignment_check) +SYM_CODE_START(alignment_check) ASM_CLAC pushl $do_alignment_check jmp common_exception -END(alignment_check) +SYM_CODE_END(alignment_check) -ENTRY(divide_error) +SYM_CODE_START(divide_error) ASM_CLAC pushl $0 # no error code pushl $do_divide_error jmp common_exception -END(divide_error) +SYM_CODE_END(divide_error) #ifdef CONFIG_X86_MCE -ENTRY(machine_check) +SYM_CODE_START(machine_check) ASM_CLAC pushl $0 pushl machine_check_vector jmp common_exception -END(machine_check) +SYM_CODE_END(machine_check) #endif -ENTRY(spurious_interrupt_bug) +SYM_CODE_START(spurious_interrupt_bug) ASM_CLAC pushl $0 pushl $do_spurious_interrupt_bug jmp common_exception -END(spurious_interrupt_bug) +SYM_CODE_END(spurious_interrupt_bug) #ifdef CONFIG_XEN_PV -ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - +SYM_FUNC_START(xen_hypervisor_callback) /* * Check to see if we got the event in the critical * region in xen_iret_direct, after we've reenabled @@ -1353,22 +1387,23 @@ ENTRY(xen_hypervisor_callback) * iret instruction's behaviour where it delivers a * pending interrupt when enabling interrupts: */ - movl PT_EIP(%esp), %eax - cmpl $xen_iret_start_crit, %eax + cmpl $xen_iret_start_crit, (%esp) jb 1f - cmpl $xen_iret_end_crit, %eax + cmpl $xen_iret_end_crit, (%esp) jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax + call xen_iret_crit_fixup +1: + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + mov %esp, %eax call xen_evtchn_do_upcall #ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall #endif jmp ret_from_intr -ENDPROC(xen_hypervisor_callback) +SYM_FUNC_END(xen_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1382,7 +1417,7 @@ ENDPROC(xen_hypervisor_callback) * to pop the stack frame we end up in an infinite loop of failsafe callbacks. * We distinguish between categories by maintaining a status value in EAX. */ -ENTRY(xen_failsafe_callback) +SYM_FUNC_START(xen_failsafe_callback) pushl %eax movl $1, %eax 1: mov 4(%esp), %ds @@ -1419,7 +1454,7 @@ ENTRY(xen_failsafe_callback) _ASM_EXTABLE(2b, 7b) _ASM_EXTABLE(3b, 8b) _ASM_EXTABLE(4b, 9b) -ENDPROC(xen_failsafe_callback) +SYM_FUNC_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ #ifdef CONFIG_XEN_PVHVM @@ -1441,18 +1476,17 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, #endif /* CONFIG_HYPERV */ -ENTRY(page_fault) +SYM_CODE_START(page_fault) ASM_CLAC pushl $do_page_fault jmp common_exception_read_cr2 -END(page_fault) +SYM_CODE_END(page_fault) -common_exception_read_cr2: +SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER - UNWIND_ESPFIX_STACK /* fixup %gs */ GS_TO_REG %ecx @@ -1470,13 +1504,12 @@ common_exception_read_cr2: movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi jmp ret_from_exception -END(common_exception_read_cr2) +SYM_CODE_END(common_exception_read_cr2) -common_exception: +SYM_CODE_START_LOCAL_NOALIGN(common_exception) /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER - UNWIND_ESPFIX_STACK /* fixup %gs */ GS_TO_REG %ecx @@ -1492,9 +1525,9 @@ common_exception: movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi jmp ret_from_exception -END(common_exception) +SYM_CODE_END(common_exception) -ENTRY(debug) +SYM_CODE_START(debug) /* * Entry from sysenter is now handled in common_exception */ @@ -1502,7 +1535,7 @@ ENTRY(debug) pushl $-1 # mark this as an int pushl $do_debug jmp common_exception -END(debug) +SYM_CODE_END(debug) /* * NMI is doubly nasty. It can happen on the first instruction of @@ -1511,10 +1544,14 @@ END(debug) * switched stacks. We handle both conditions by simply checking whether we * interrupted kernel code running on the SYSENTER stack. */ -ENTRY(nmi) +SYM_CODE_START(nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 + /* + * ESPFIX_SS is only ever set on the return to user path + * after we've switched to the entry stack. + */ pushl %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax @@ -1550,6 +1587,11 @@ ENTRY(nmi) movl %ebx, %esp .Lnmi_return: +#ifdef CONFIG_X86_ESPFIX32 + testl $CS_FROM_ESPFIX, PT_CS(%esp) + jnz .Lnmi_from_espfix +#endif + CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 jmp .Lirq_return @@ -1557,28 +1599,47 @@ ENTRY(nmi) #ifdef CONFIG_X86_ESPFIX32 .Lnmi_espfix_stack: /* - * create the pointer to lss back + * Create the pointer to LSS back */ pushl %ss pushl %esp addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL_NMI cr3_reg=%edi + + /* Copy the (short) IRET frame */ + pushl 4*4(%esp) # flags + pushl 4*4(%esp) # cs + pushl 4*4(%esp) # ip + + pushl %eax # orig_ax + + SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1 ENCODE_FRAME_POINTER - FIXUP_ESPFIX_STACK # %eax == %esp + + /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */ + xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp) + xorl %edx, %edx # zero error code - call do_nmi + movl %esp, %eax # pt_regs pointer + jmp .Lnmi_from_sysenter_stack + +.Lnmi_from_espfix: RESTORE_ALL_NMI cr3_reg=%edi - lss 12+4(%esp), %esp # back to espfix stack + /* + * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to + * fix up the gap and long frame: + * + * 3 - original frame (exception) + * 2 - ESPFIX block (above) + * 6 - gap (FIXUP_FRAME) + * 5 - long frame (FIXUP_FRAME) + * 1 - orig_ax + */ + lss (1+5+6)*4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif -END(nmi) +SYM_CODE_END(nmi) -ENTRY(int3) +SYM_CODE_START(int3) ASM_CLAC pushl $-1 # mark this as an int @@ -1589,22 +1650,22 @@ ENTRY(int3) movl %esp, %eax # pt_regs pointer call do_int3 jmp ret_from_exception -END(int3) +SYM_CODE_END(int3) -ENTRY(general_protection) +SYM_CODE_START(general_protection) pushl $do_general_protection jmp common_exception -END(general_protection) +SYM_CODE_END(general_protection) #ifdef CONFIG_KVM_GUEST -ENTRY(async_page_fault) +SYM_CODE_START(async_page_fault) ASM_CLAC pushl $do_async_page_fault jmp common_exception_read_cr2 -END(async_page_fault) +SYM_CODE_END(async_page_fault) #endif -ENTRY(rewind_stack_do_exit) +SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp @@ -1613,4 +1674,4 @@ ENTRY(rewind_stack_do_exit) call do_exit 1: jmp 1b -END(rewind_stack_do_exit) +SYM_CODE_END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b7c3ea4cb19d..76942cbd95a1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -15,7 +15,7 @@ * at the top of the kernel process stack. * * Some macro usage: - * - ENTRY/END: Define functions in the symbol table. + * - SYM_FUNC_START/END:Define functions in the symbol table. * - TRACE_IRQ_*: Trace hardirq state for lock debugging. * - idtentry: Define exception entry points. */ @@ -46,11 +46,11 @@ .section .entry.text, "ax" #ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret64) +SYM_CODE_START(native_usergs_sysret64) UNWIND_HINT_EMPTY swapgs sysretq -END(native_usergs_sysret64) +SYM_CODE_END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_FLAGS flags:req @@ -142,7 +142,7 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ -ENTRY(entry_SYSCALL_64) +SYM_CODE_START(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* * Interrupts are off on entry. @@ -162,7 +162,7 @@ ENTRY(entry_SYSCALL_64) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ -GLOBAL(entry_SYSCALL_64_after_hwframe) +SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS @@ -273,13 +273,13 @@ syscall_return_via_sysret: popq %rdi popq %rsp USERGS_SYSRET64 -END(entry_SYSCALL_64) +SYM_CODE_END(entry_SYSCALL_64) /* * %rdi: prev task * %rsi: next task */ -ENTRY(__switch_to_asm) +SYM_CODE_START(__switch_to_asm) UNWIND_HINT_FUNC /* * Save callee-saved registers @@ -321,7 +321,7 @@ ENTRY(__switch_to_asm) popq %rbp jmp __switch_to -END(__switch_to_asm) +SYM_CODE_END(__switch_to_asm) /* * A newly forked process directly context switches into this address. @@ -330,7 +330,7 @@ END(__switch_to_asm) * rbx: kernel thread func (NULL for user thread) * r12: kernel thread arg */ -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -357,14 +357,14 @@ ENTRY(ret_from_fork) */ movq $0, RAX(%rsp) jmp 2b -END(ret_from_fork) +SYM_CODE_END(ret_from_fork) /* * Build the entry stubs with some assembler magic. * We pack 1 stub into every 8-byte block. */ .align 8 -ENTRY(irq_entries_start) +SYM_CODE_START(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) UNWIND_HINT_IRET_REGS @@ -373,10 +373,10 @@ ENTRY(irq_entries_start) .align 8 vector=vector+1 .endr -END(irq_entries_start) +SYM_CODE_END(irq_entries_start) .align 8 -ENTRY(spurious_entries_start) +SYM_CODE_START(spurious_entries_start) vector=FIRST_SYSTEM_VECTOR .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) UNWIND_HINT_IRET_REGS @@ -385,7 +385,7 @@ ENTRY(spurious_entries_start) .align 8 vector=vector+1 .endr -END(spurious_entries_start) +SYM_CODE_END(spurious_entries_start) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY @@ -511,7 +511,7 @@ END(spurious_entries_start) * | return address | * +----------------------------------------------------+ */ -ENTRY(interrupt_entry) +SYM_CODE_START(interrupt_entry) UNWIND_HINT_FUNC ASM_CLAC cld @@ -579,7 +579,7 @@ ENTRY(interrupt_entry) TRACE_IRQS_OFF ret -END(interrupt_entry) +SYM_CODE_END(interrupt_entry) _ASM_NOKPROBE(interrupt_entry) @@ -589,18 +589,18 @@ _ASM_NOKPROBE(interrupt_entry) * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_spurious/interrupt. */ -common_spurious: +SYM_CODE_START_LOCAL(common_spurious) addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ call interrupt_entry UNWIND_HINT_REGS indirect=1 call smp_spurious_interrupt /* rdi points to pt_regs */ jmp ret_from_intr -END(common_spurious) +SYM_CODE_END(common_spurious) _ASM_NOKPROBE(common_spurious) /* common_interrupt is a hotpath. Align it */ .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: +SYM_CODE_START_LOCAL(common_interrupt) addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ call interrupt_entry UNWIND_HINT_REGS indirect=1 @@ -616,12 +616,12 @@ ret_from_intr: jz retint_kernel /* Interrupt came from user space */ -GLOBAL(retint_user) +.Lretint_user: mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ -GLOBAL(swapgs_restore_regs_and_return_to_usermode) +SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ testb $3, CS(%rsp) @@ -679,7 +679,7 @@ retint_kernel: */ TRACE_IRQS_IRETQ -GLOBAL(restore_regs_and_return_to_kernel) +SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates kernel mode. */ testb $3, CS(%rsp) @@ -695,7 +695,7 @@ GLOBAL(restore_regs_and_return_to_kernel) */ INTERRUPT_RETURN -ENTRY(native_iret) +SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL) UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in @@ -706,8 +706,7 @@ ENTRY(native_iret) jnz native_irq_return_ldt #endif -.global native_irq_return_iret -native_irq_return_iret: +SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) /* * This may fault. Non-paranoid faults on return to userspace are * handled by fixup_bad_iret. These include #SS, #GP, and #NP. @@ -789,14 +788,14 @@ native_irq_return_ldt: */ jmp native_irq_return_iret #endif -END(common_interrupt) +SYM_CODE_END(common_interrupt) _ASM_NOKPROBE(common_interrupt) /* * APIC interrupts. */ .macro apicinterrupt3 num sym do_sym -ENTRY(\sym) +SYM_CODE_START(\sym) UNWIND_HINT_IRET_REGS pushq $~(\num) .Lcommon_\sym: @@ -804,7 +803,7 @@ ENTRY(\sym) UNWIND_HINT_REGS indirect=1 call \do_sym /* rdi points to pt_regs */ jmp ret_from_intr -END(\sym) +SYM_CODE_END(\sym) _ASM_NOKPROBE(\sym) .endm @@ -969,7 +968,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0 -ENTRY(\sym) +SYM_CODE_START(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 /* Sanity check */ @@ -1019,7 +1018,7 @@ ENTRY(\sym) .endif _ASM_NOKPROBE(\sym) -END(\sym) +SYM_CODE_END(\sym) .endm idtentry divide_error do_divide_error has_error_code=0 @@ -1041,7 +1040,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * Reload gs selector with exception handling * edi: new selector */ -ENTRY(native_load_gs_index) +SYM_FUNC_START(native_load_gs_index) FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) @@ -1055,13 +1054,13 @@ ENTRY(native_load_gs_index) popfq FRAME_END ret -ENDPROC(native_load_gs_index) +SYM_FUNC_END(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, .Lbad_gs) .section .fixup, "ax" /* running with kernelgs */ -.Lbad_gs: +SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) SWAPGS /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ @@ -1072,10 +1071,11 @@ EXPORT_SYMBOL(native_load_gs_index) xorl %eax, %eax movl %eax, %gs jmp 2b +SYM_CODE_END(.Lbad_gs) .previous /* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(do_softirq_own_stack) +SYM_FUNC_START(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp ENTER_IRQ_STACK regs=0 old_rsp=%r11 @@ -1083,7 +1083,7 @@ ENTRY(do_softirq_own_stack) LEAVE_IRQ_STACK regs=0 leaveq ret -ENDPROC(do_softirq_own_stack) +SYM_FUNC_END(do_softirq_own_stack) #ifdef CONFIG_XEN_PV idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 @@ -1101,7 +1101,8 @@ idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. */ -ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ +/* do_hypervisor_callback(struct *pt_regs) */ +SYM_CODE_START_LOCAL(xen_do_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will @@ -1119,7 +1120,7 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ call xen_maybe_preempt_hcall #endif jmp error_exit -END(xen_do_hypervisor_callback) +SYM_CODE_END(xen_do_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1134,7 +1135,7 @@ END(xen_do_hypervisor_callback) * We distinguish between categories by comparing each saved segment register * with its current contents: any discrepancy means we in category 1. */ -ENTRY(xen_failsafe_callback) +SYM_CODE_START(xen_failsafe_callback) UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) @@ -1164,7 +1165,7 @@ ENTRY(xen_failsafe_callback) PUSH_AND_CLEAR_REGS ENCODE_FRAME_POINTER jmp error_exit -END(xen_failsafe_callback) +SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ #ifdef CONFIG_XEN_PVHVM @@ -1214,7 +1215,7 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 * Use slow, but surefire "are we in kernel?" check. * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ -ENTRY(paranoid_entry) +SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 @@ -1248,7 +1249,7 @@ ENTRY(paranoid_entry) FENCE_SWAPGS_KERNEL_ENTRY ret -END(paranoid_entry) +SYM_CODE_END(paranoid_entry) /* * "Paranoid" exit path from exception stack. This is invoked @@ -1262,7 +1263,7 @@ END(paranoid_entry) * * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -ENTRY(paranoid_exit) +SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG @@ -1272,19 +1273,18 @@ ENTRY(paranoid_exit) /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK - jmp .Lparanoid_exit_restore + jmp restore_regs_and_return_to_kernel .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 -.Lparanoid_exit_restore: jmp restore_regs_and_return_to_kernel -END(paranoid_exit) +SYM_CODE_END(paranoid_exit) /* * Save all registers in pt_regs, and switch GS if needed. */ -ENTRY(error_entry) +SYM_CODE_START_LOCAL(error_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 @@ -1364,16 +1364,16 @@ ENTRY(error_entry) call fixup_bad_iret mov %rax, %rsp jmp .Lerror_entry_from_usermode_after_swapgs -END(error_entry) +SYM_CODE_END(error_entry) -ENTRY(error_exit) +SYM_CODE_START_LOCAL(error_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testb $3, CS(%rsp) jz retint_kernel - jmp retint_user -END(error_exit) + jmp .Lretint_user +SYM_CODE_END(error_exit) /* * Runs on exception stack. Xen PV does not go through this path at all, @@ -1383,7 +1383,7 @@ END(error_exit) * %r14: Used to save/restore the CR3 of the interrupted context * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ -ENTRY(nmi) +SYM_CODE_START(nmi) UNWIND_HINT_IRET_REGS /* @@ -1718,21 +1718,21 @@ nmi_restore: * about espfix64 on the way back to kernel mode. */ iretq -END(nmi) +SYM_CODE_END(nmi) #ifndef CONFIG_IA32_EMULATION /* * This handles SYSCALL from 32-bit code. There is no way to program * MSRs to fully disable 32-bit SYSCALL. */ -ENTRY(ignore_sysret) +SYM_CODE_START(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret -END(ignore_sysret) +SYM_CODE_END(ignore_sysret) #endif -ENTRY(rewind_stack_do_exit) +SYM_CODE_START(rewind_stack_do_exit) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp @@ -1742,4 +1742,4 @@ ENTRY(rewind_stack_do_exit) UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -END(rewind_stack_do_exit) +SYM_CODE_END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 39913770a44d..f1d3ccae5dd5 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -46,7 +46,7 @@ * ebp user stack * 0(%ebp) arg6 */ -ENTRY(entry_SYSENTER_compat) +SYM_FUNC_START(entry_SYSENTER_compat) /* Interrupts are off on entry. */ SWAPGS @@ -146,8 +146,8 @@ ENTRY(entry_SYSENTER_compat) pushq $X86_EFLAGS_FIXED popfq jmp .Lsysenter_flags_fixed -GLOBAL(__end_entry_SYSENTER_compat) -ENDPROC(entry_SYSENTER_compat) +SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) +SYM_FUNC_END(entry_SYSENTER_compat) /* * 32-bit SYSCALL entry. @@ -196,7 +196,7 @@ ENDPROC(entry_SYSENTER_compat) * esp user stack * 0(%esp) arg6 */ -ENTRY(entry_SYSCALL_compat) +SYM_CODE_START(entry_SYSCALL_compat) /* Interrupts are off on entry. */ swapgs @@ -215,7 +215,7 @@ ENTRY(entry_SYSCALL_compat) pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ -GLOBAL(entry_SYSCALL_compat_after_hwframe) +SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ @@ -311,7 +311,7 @@ sysret32_from_system_call: xorl %r10d, %r10d swapgs sysretl -END(entry_SYSCALL_compat) +SYM_CODE_END(entry_SYSCALL_compat) /* * 32-bit legacy system call entry. @@ -339,7 +339,7 @@ END(entry_SYSCALL_compat) * edi arg5 * ebp arg6 */ -ENTRY(entry_INT80_compat) +SYM_CODE_START(entry_INT80_compat) /* * Interrupts are off on entry. */ @@ -416,4 +416,4 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON jmp swapgs_restore_regs_and_return_to_usermode -END(entry_INT80_compat) +SYM_CODE_END(entry_INT80_compat) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index aa3336a7cb15..7d17b3addbbb 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,13 +10,11 @@ #ifdef CONFIG_IA32_EMULATION /* On X86_64, we use struct pt_regs * to pass parameters to syscalls */ #define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); - -/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ -extern asmlinkage long sys_ni_syscall(const struct pt_regs *); - +#define __sys_ni_syscall __ia32_sys_ni_syscall #else /* CONFIG_IA32_EMULATION */ #define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#define __sys_ni_syscall sys_ni_syscall #endif /* CONFIG_IA32_EMULATION */ #include <asm/syscalls_32.h> @@ -29,6 +27,6 @@ __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, + [0 ... __NR_syscall_compat_max] = &__sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index b1bf31713374..adf619a856e8 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -4,11 +4,17 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> +#include <linux/syscalls.h> #include <asm/asm-offsets.h> #include <asm/syscall.h> -/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ -extern asmlinkage long sys_ni_syscall(const struct pt_regs *); +extern asmlinkage long sys_ni_syscall(void); + +SYSCALL_DEFINE0(ni_syscall) +{ + return sys_ni_syscall(); +} + #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); #define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) #include <asm/syscalls_64.h> @@ -23,7 +29,7 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, + [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall, #include <asm/syscalls_64.h> }; @@ -40,7 +46,7 @@ asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_x32_max] = &sys_ni_syscall, + [0 ... __NR_syscall_x32_max] = &__x64_sys_ni_syscall, #include <asm/syscalls_64.h> }; diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 3fe02546aed3..15908eb9b17e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -124,13 +124,13 @@ 110 i386 iopl sys_iopl __ia32_sys_iopl 111 i386 vhangup sys_vhangup __ia32_sys_vhangup 112 i386 idle -113 i386 vm86old sys_vm86old sys_ni_syscall +113 i386 vm86old sys_vm86old __ia32_sys_ni_syscall 114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4 115 i386 swapoff sys_swapoff __ia32_sys_swapoff 116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo 117 i386 ipc sys_ipc __ia32_compat_sys_ipc 118 i386 fsync sys_fsync __ia32_sys_fsync -119 i386 sigreturn sys_sigreturn sys32_sigreturn +119 i386 sigreturn sys_sigreturn __ia32_compat_sys_sigreturn 120 i386 clone sys_clone __ia32_compat_sys_x86_clone 121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname 122 i386 uname sys_newuname __ia32_sys_newuname @@ -177,14 +177,14 @@ 163 i386 mremap sys_mremap __ia32_sys_mremap 164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 -166 i386 vm86 sys_vm86 sys_ni_syscall +166 i386 vm86 sys_vm86 __ia32_sys_ni_syscall 167 i386 query_module 168 i386 poll sys_poll __ia32_sys_poll 169 i386 nfsservctl 170 i386 setresgid sys_setresgid16 __ia32_sys_setresgid16 171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16 172 i386 prctl sys_prctl __ia32_sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn +173 i386 rt_sigreturn sys_rt_sigreturn __ia32_compat_sys_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index 2713490611a3..e010d4ae11f1 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -10,8 +10,7 @@ /* put return address in eax (arg1) */ .macro THUNK name, func, put_ret_addr_in_eax=0 - .globl \name -\name: +SYM_CODE_START_NOALIGN(\name) pushl %eax pushl %ecx pushl %edx @@ -27,6 +26,7 @@ popl %eax ret _ASM_NOKPROBE(\name) +SYM_CODE_END(\name) .endm #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index ea5c4167086c..c5c3b6e86e62 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -12,7 +12,7 @@ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func, put_ret_addr_in_rdi=0 - ENTRY(\name) +SYM_FUNC_START_NOALIGN(\name) pushq %rbp movq %rsp, %rbp @@ -33,7 +33,7 @@ call \func jmp .L_restore - ENDPROC(\name) +SYM_FUNC_END(\name) _ASM_NOKPROBE(\name) .endm @@ -56,7 +56,7 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPTION) -.L_restore: +SYM_CODE_START_LOCAL_NOALIGN(.L_restore) popq %r11 popq %r10 popq %r9 @@ -69,4 +69,5 @@ popq %rbp ret _ASM_NOKPROBE(.L_restore) +SYM_CODE_END(.L_restore) #endif diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 0f2154106d01..2b75e80f6b41 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -87,11 +87,9 @@ $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. # -CFLAGS_REMOVE_vdso-note.o = -pg CFLAGS_REMOVE_vclock_gettime.o = -pg CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg -CFLAGS_REMOVE_vvar.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 263d7433dea8..de1fff7188aa 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -62,7 +62,7 @@ __kernel_vsyscall: /* Enter using int $0x80 */ int $0x80 -GLOBAL(int80_landing_pad) +SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL) /* * Restore EDX and ECX in case they were clobbered. EBP is not diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 64c3e70b0556..a7752cd78b89 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -652,15 +652,7 @@ static void amd_pmu_disable_event(struct perf_event *event) */ static int amd_pmu_handle_irq(struct pt_regs *regs) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int active, handled; - - /* - * Obtain the active count before calling x86_pmu_handle_irq() since - * it is possible that x86_pmu_handle_irq() may make a counter - * inactive (through x86_pmu_stop). - */ - active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); + int handled; /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); @@ -670,8 +662,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) * NMIs will be claimed if arriving within that window. */ if (handled) { - this_cpu_write(perf_nmi_tstamp, - jiffies + perf_nmi_window); + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); return handled; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7b21455d7504..6e3f0c18908e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2243,6 +2243,13 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) x86_pmu.sched_task(ctx, sched_in); } +static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + if (x86_pmu.swap_task_ctx) + x86_pmu.swap_task_ctx(prev, next); +} + void perf_check_microcode(void) { if (x86_pmu.check_microcode) @@ -2297,6 +2304,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .task_ctx_size = sizeof(struct x86_perf_task_context), + .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 5ee3fed881d3..38de4a7f6752 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event) * Note that the default paranoia setting permits unprivileged * users to profile the kernel. */ - if (event->attr.exclude_kernel && perf_paranoid_kernel() && - !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (event->attr.exclude_kernel) { + ret = perf_allow_kernel(&event->attr); + if (ret) + return ret; + } if (x86_add_exclusive(x86_lbr_exclusive_bts)) return -EBUSY; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 937363b803c1..3be51aa06e67 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event) if (x86_pmu.version < 3) return -EINVAL; - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + ret = perf_allow_cpu(&event->attr); + if (ret) + return ret; event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; @@ -3830,6 +3831,12 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx, intel_pmu_lbr_sched_task(ctx, sched_in); } +static void intel_pmu_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + intel_pmu_lbr_swap_task_ctx(prev, next); +} + static int intel_pmu_check_period(struct perf_event *event, u64 value) { return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0; @@ -3965,6 +3972,7 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, .sched_task = intel_pmu_sched_task, + .swap_task_ctx = intel_pmu_swap_task_ctx, .check_period = intel_pmu_check_period, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index ea54634eabf3..534c76606049 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -417,6 +417,29 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) cpuc->last_log_id = ++task_ctx->log_id; } +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + + swap(prev->task_ctx_data, next->task_ctx_data); + + /* + * Architecture specific synchronization makes sense in + * case both prev->task_ctx_data and next->task_ctx_data + * pointers are allocated. + */ + + prev_ctx_data = next->task_ctx_data; + next_ctx_data = prev->task_ctx_data; + + if (!prev_ctx_data || !next_ctx_data) + return; + + swap(prev_ctx_data->lbr_callstack_users, + next_ctx_data->lbr_callstack_users); +} + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index dee579efb2b2..a4cc66005ce8 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -776,8 +776,9 @@ static int p4_validate_raw_event(struct perf_event *event) * the user needs special permissions to be able to use it */ if (p4_ht_active() && p4_event_bind_map[v].shared) { - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + v = perf_allow_cpu(&event->attr); + if (v) + return v; } /* ESCR EventMask bits may be invalid */ diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 05e43d0f430b..1db7a51d9792 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -397,6 +397,20 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ +static void pt_config_start(struct perf_event *event) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + u64 ctl = event->hw.config; + + ctl |= RTIT_CTL_TRACEEN; + if (READ_ONCE(pt->vmx_on)) + perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); + else + wrmsrl(MSR_IA32_RTIT_CTL, ctl); + + WRITE_ONCE(event->hw.config, ctl); +} + /* Address ranges and their corresponding msr configuration registers */ static const struct pt_address_range { unsigned long msr_a; @@ -469,6 +483,7 @@ static u64 pt_config_filters(struct perf_event *event) static void pt_config(struct perf_event *event) { struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); u64 reg; /* First round: clear STATUS, in particular the PSB byte counter. */ @@ -478,7 +493,9 @@ static void pt_config(struct perf_event *event) } reg = pt_config_filters(event); - reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN; + reg |= RTIT_CTL_TRACEEN; + if (!buf->single) + reg |= RTIT_CTL_TOPA; /* * Previously, we had BRANCH_EN on by default, but now that PT has @@ -501,10 +518,7 @@ static void pt_config(struct perf_event *event) reg |= (event->attr.config & PT_CONFIG_MASK); event->hw.config = reg; - if (READ_ONCE(pt->vmx_on)) - perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); - else - wrmsrl(MSR_IA32_RTIT_CTL, reg); + pt_config_start(event); } static void pt_config_stop(struct perf_event *event) @@ -533,18 +547,6 @@ static void pt_config_stop(struct perf_event *event) wmb(); } -static void pt_config_buffer(void *buf, unsigned int topa_idx, - unsigned int output_off) -{ - u64 reg; - - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); - - reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); - - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); -} - /** * struct topa - ToPA metadata * @list: linkage to struct pt_buffer's list of tables @@ -602,6 +604,33 @@ static inline phys_addr_t topa_pfn(struct topa *topa) #define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size)) #define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size) +static void pt_config_buffer(struct pt_buffer *buf) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + u64 reg, mask; + void *base; + + if (buf->single) { + base = buf->data_pages[0]; + mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7; + } else { + base = topa_to_page(buf->cur)->table; + mask = (u64)buf->cur_idx; + } + + reg = virt_to_phys(base); + if (pt->output_base != reg) { + pt->output_base = reg; + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg); + } + + reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32); + if (pt->output_mask != reg) { + pt->output_mask = reg; + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); + } +} + /** * topa_alloc() - allocate page-sized ToPA table * @cpu: CPU on which to allocate. @@ -802,6 +831,11 @@ static void pt_update_head(struct pt *pt) struct pt_buffer *buf = perf_get_aux(&pt->handle); u64 topa_idx, base, old; + if (buf->single) { + local_set(&buf->data_size, buf->output_off); + return; + } + /* offset of the first region in this table from the beginning of buf */ base = buf->cur->offset + buf->output_off; @@ -903,18 +937,21 @@ static void pt_handle_status(struct pt *pt) */ static void pt_read_offset(struct pt_buffer *buf) { - u64 offset, base_topa; + struct pt *pt = this_cpu_ptr(&pt_ctx); struct topa_page *tp; - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); - tp = phys_to_virt(base_topa); - buf->cur = &tp->topa; + if (!buf->single) { + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base); + tp = phys_to_virt(pt->output_base); + buf->cur = &tp->topa; + } - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask); /* offset within current output region */ - buf->output_off = offset >> 32; + buf->output_off = pt->output_mask >> 32; /* index of current output region within this table */ - buf->cur_idx = (offset & 0xffffff80) >> 7; + if (!buf->single) + buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7; } static struct topa_entry * @@ -1030,6 +1067,9 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, unsigned long head = local64_read(&buf->head); unsigned long idx, npages, wakeup; + if (buf->single) + return 0; + /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) { perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); @@ -1111,13 +1151,17 @@ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) if (buf->snapshot) head &= (buf->nr_pages << PAGE_SHIFT) - 1; - pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); - te = pt_topa_entry_for_page(buf, pg); + if (!buf->single) { + pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); + te = pt_topa_entry_for_page(buf, pg); - cur_tp = topa_entry_to_page(te); - buf->cur = &cur_tp->topa; - buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0); - buf->output_off = head & (pt_buffer_region_size(buf) - 1); + cur_tp = topa_entry_to_page(te); + buf->cur = &cur_tp->topa; + buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0); + buf->output_off = head & (pt_buffer_region_size(buf) - 1); + } else { + buf->output_off = head; + } local64_set(&buf->head, head); local_set(&buf->data_size, 0); @@ -1131,6 +1175,9 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf) { struct topa *topa, *iter; + if (buf->single) + return; + list_for_each_entry_safe(topa, iter, &buf->tables, list) { /* * right now, this is in free_aux() path only, so @@ -1176,6 +1223,36 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu, return 0; } +static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages) +{ + struct page *p = virt_to_page(buf->data_pages[0]); + int ret = -ENOTSUPP, order = 0; + + /* + * We can use single range output mode + * + in snapshot mode, where we don't need interrupts; + * + if the hardware supports it; + * + if the entire buffer is one contiguous allocation. + */ + if (!buf->snapshot) + goto out; + + if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output)) + goto out; + + if (PagePrivate(p)) + order = page_private(p); + + if (1 << order != nr_pages) + goto out; + + buf->single = true; + buf->nr_pages = nr_pages; + ret = 0; +out: + return ret; +} + /** * pt_buffer_setup_aux() - set up topa tables for a PT buffer * @cpu: Cpu on which to allocate, -1 means current. @@ -1198,6 +1275,13 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages, if (!nr_pages) return NULL; + /* + * Only support AUX sampling in snapshot mode, where we don't + * generate NMIs. + */ + if (event->attr.aux_sample_size && !snapshot) + return NULL; + if (cpu == -1) cpu = raw_smp_processor_id(); node = cpu_to_node(cpu); @@ -1213,6 +1297,10 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages, INIT_LIST_HEAD(&buf->tables); + ret = pt_buffer_try_single(buf, nr_pages); + if (!ret) + return buf; + ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL); if (ret) { kfree(buf); @@ -1379,9 +1467,8 @@ void intel_pt_interrupt(void) return; } - pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, - buf->output_off); - pt_config(event); + pt_config_buffer(buf); + pt_config_start(event); } } @@ -1444,8 +1531,7 @@ static void pt_event_start(struct perf_event *event, int mode) WRITE_ONCE(pt->handle_nmi, 1); hwc->state = 0; - pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, - buf->output_off); + pt_config_buffer(buf); pt_config(event); return; @@ -1496,6 +1582,52 @@ static void pt_event_stop(struct perf_event *event, int mode) } } +static long pt_event_snapshot_aux(struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + unsigned long from = 0, to; + long ret; + + if (WARN_ON_ONCE(!buf)) + return 0; + + /* + * Sampling is only allowed on snapshot events; + * see pt_buffer_setup_aux(). + */ + if (WARN_ON_ONCE(!buf->snapshot)) + return 0; + + /* + * Here, handle_nmi tells us if the tracing is on + */ + if (READ_ONCE(pt->handle_nmi)) + pt_config_stop(event); + + pt_read_offset(buf); + pt_update_head(pt); + + to = local_read(&buf->data_size); + if (to < size) + from = buf->nr_pages << PAGE_SHIFT; + from += to - size; + + ret = perf_output_copy_aux(&pt->handle, handle, from, to); + + /* + * If the tracing was on when we turned up, restart it. + * Compiler barrier not needed as we couldn't have been + * preempted by anything that touches pt->handle_nmi. + */ + if (pt->handle_nmi) + pt_config_start(event); + + return ret; +} + static void pt_event_del(struct perf_event *event, int mode) { pt_event_stop(event, PERF_EF_UPDATE); @@ -1615,6 +1747,7 @@ static __init int pt_init(void) pt_pmu.pmu.del = pt_event_del; pt_pmu.pmu.start = pt_event_start; pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.snapshot_aux = pt_event_snapshot_aux; pt_pmu.pmu.read = pt_event_read; pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; pt_pmu.pmu.free_aux = pt_buffer_free_aux; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 1d2bb7572374..96906a62aacd 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -64,6 +64,7 @@ struct pt_pmu { * @lost: if data was lost/truncated * @head: logical write offset inside the buffer * @snapshot: if this is for a snapshot/overwrite counter + * @single: use Single Range Output instead of ToPA * @stop_pos: STOP topa entry index * @intr_pos: INT topa entry index * @stop_te: STOP topa entry pointer @@ -80,6 +81,7 @@ struct pt_buffer { local_t data_size; local64_t head; bool snapshot; + bool single; long stop_pos, intr_pos; struct topa_entry *stop_te, *intr_te; void **data_pages; @@ -111,16 +113,20 @@ struct pt_filters { /** * struct pt - per-cpu pt context - * @handle: perf output handle + * @handle: perf output handle * @filters: last configured filters - * @handle_nmi: do handle PT PMI on this cpu, there's an active event - * @vmx_on: 1 if VMX is ON on this cpu + * @handle_nmi: do handle PT PMI on this cpu, there's an active event + * @vmx_on: 1 if VMX is ON on this cpu + * @output_base: cached RTIT_OUTPUT_BASE MSR value + * @output_mask: cached RTIT_OUTPUT_MASK MSR value */ struct pt { struct perf_output_handle handle; struct pt_filters filters; int handle_nmi; int vmx_on; + u64 output_base; + u64 output_mask; }; #endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ecacfbf4ebc1..930611db8f9a 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -683,6 +683,14 @@ struct x86_pmu { atomic_t lbr_exclusive[x86_lbr_exclusive_max]; /* + * perf task context (i.e. struct perf_event_context::task_ctx_data) + * switch helper to bridge calls from perf/core to perf/x86. + * See struct pmu::swap_task_ctx() usage for examples; + */ + void (*swap_task_ctx)(struct perf_event_context *prev, + struct perf_event_context *next); + + /* * AMD bits */ unsigned int amd_nb_constraints : 1; @@ -1016,6 +1024,9 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); void intel_ds_init(void); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next); + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index e01078e93dd3..40e0e322161d 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -194,10 +194,20 @@ do_ex_hypercall: static bool __send_ipi_one(int cpu, int vector) { - struct cpumask mask = CPU_MASK_NONE; + int vp = hv_cpu_number_to_vp_number(cpu); - cpumask_set_cpu(cpu, &mask); - return __send_ipi_mask(&mask, vector); + trace_hyperv_send_ipi_one(cpu, vector); + + if (!hv_hypercall_pg || (vp == VP_INVAL)) + return false; + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return false; + + if (vp >= 64) + return __send_ipi_mask_ex(cpumask_of(cpu), vector); + + return !hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); } static void hv_send_ipi(int cpu, int vector) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 2db3972c0e0f..50ff030d9224 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -311,6 +311,12 @@ void __init hyperv_init(void) hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + /* + * Ignore any errors in setting up stimer clockevents + * as we can run with the LAPIC timer as a fallback. + */ + (void)hv_stimer_alloc(); + hv_apic_init(); x86_init.pci.arch_init = hv_pci_init; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 1cee10091b9f..30416d7f19d4 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -21,6 +21,7 @@ #include <linux/personality.h> #include <linux/compat.h> #include <linux/binfmts.h> +#include <linux/syscalls.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/fpu/internal.h> @@ -118,7 +119,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, return err; } -asmlinkage long sys32_sigreturn(void) +COMPAT_SYSCALL_DEFINE0(sigreturn) { struct pt_regs *regs = current_pt_regs(); struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); @@ -144,7 +145,7 @@ badframe: return 0; } -asmlinkage long sys32_rt_sigreturn(void) +COMPAT_SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_ia32 __user *frame; diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 3ff577c0b102..cd339b88d5d4 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -7,9 +7,11 @@ # define __ASM_FORM_RAW(x) x # define __ASM_FORM_COMMA(x) x, #else -# define __ASM_FORM(x) " " #x " " -# define __ASM_FORM_RAW(x) #x -# define __ASM_FORM_COMMA(x) " " #x "," +#include <linux/stringify.h> + +# define __ASM_FORM(x) " " __stringify(x) " " +# define __ASM_FORM_RAW(x) __stringify(x) +# define __ASM_FORM_COMMA(x) " " __stringify(x) "," #endif #ifndef __x86_64__ @@ -139,9 +141,6 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -170,9 +169,6 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h deleted file mode 100644 index facd374a1bf7..000000000000 --- a/arch/x86/include/asm/calgary.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Derived from include/asm-powerpc/iommu.h - * - * Copyright IBM Corporation, 2006-2007 - * - * Author: Jon Mason <jdmason@us.ibm.com> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - */ - -#ifndef _ASM_X86_CALGARY_H -#define _ASM_X86_CALGARY_H - -#include <linux/spinlock.h> -#include <linux/device.h> -#include <linux/dma-mapping.h> -#include <linux/timer.h> -#include <asm/types.h> - -struct iommu_table { - const struct cal_chipset_ops *chip_ops; /* chipset specific funcs */ - unsigned long it_base; /* mapped address of tce table */ - unsigned long it_hint; /* Hint for next alloc */ - unsigned long *it_map; /* A simple allocation bitmap for now */ - void __iomem *bbar; /* Bridge BAR */ - u64 tar_val; /* Table Address Register */ - struct timer_list watchdog_timer; - spinlock_t it_lock; /* Protects it_map */ - unsigned int it_size; /* Size of iommu table in entries */ - unsigned char it_busno; /* Bus number this table belongs to */ -}; - -struct cal_chipset_ops { - void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev); - void (*tce_cache_blast)(struct iommu_table *tbl); - void (*dump_error_regs)(struct iommu_table *tbl); -}; - -#define TCE_TABLE_SIZE_UNSPECIFIED ~0 -#define TCE_TABLE_SIZE_64K 0 -#define TCE_TABLE_SIZE_128K 1 -#define TCE_TABLE_SIZE_256K 2 -#define TCE_TABLE_SIZE_512K 3 -#define TCE_TABLE_SIZE_1M 4 -#define TCE_TABLE_SIZE_2M 5 -#define TCE_TABLE_SIZE_4M 6 -#define TCE_TABLE_SIZE_8M 7 - -extern int use_calgary; - -#ifdef CONFIG_CALGARY_IOMMU -extern int detect_calgary(void); -#else -static inline int detect_calgary(void) { return -ENODEV; } -#endif - -#endif /* _ASM_X86_CALGARY_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 8348f7d69fd5..ea866c7bf31d 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -78,8 +78,12 @@ struct cpu_entry_area { /* * The GDT is just below entry_stack and thus serves (on x86_64) as - * a a read-only guard page. + * a read-only guard page. On 32-bit the GDT must be writeable, so + * it needs an extra guard page. */ +#ifdef CONFIG_X86_32 + char guard_entry_stack[PAGE_SIZE]; +#endif struct entry_stack_page entry_stack_page; /* @@ -94,7 +98,6 @@ struct cpu_entry_area { */ struct cea_exception_stacks estacks; #endif -#ifdef CONFIG_CPU_SUP_INTEL /* * Per CPU debug store for Intel performance monitoring. Wastes a * full page at the moment. @@ -105,11 +108,13 @@ struct cpu_entry_area { * Reserve enough fixmap PTEs. */ struct debug_store_buffers cpu_debug_buffers; -#endif }; -#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) +#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) + +/* Total size includes the readonly IDT mapping page as well: */ +#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); @@ -117,13 +122,14 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); +/* Single page reserved for the readonly IDT mapping: */ #define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE #define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) #define CPU_ENTRY_AREA_MAP_SIZE \ - (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE) extern struct cpu_entry_area *get_cpu_entry_area(int cpu); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c4fbe379cc0b..e9b62498fe75 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -292,6 +292,7 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */ #define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h index 0acf5ee45a21..f58de66091e5 100644 --- a/arch/x86/include/asm/crash.h +++ b/arch/x86/include/asm/crash.h @@ -2,10 +2,17 @@ #ifndef _ASM_X86_CRASH_H #define _ASM_X86_CRASH_H +struct kimage; + int crash_load_segments(struct kimage *image); -int crash_copy_backup_region(struct kimage *image); int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params); void crash_smp_send_stop(void); +#ifdef CONFIG_KEXEC_CORE +void __init crash_reserve_low_1M(void); +#else +static inline void __init crash_reserve_low_1M(void) { } +#endif + #endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index a5ea841cc6d2..8e1d0bb46361 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -22,7 +22,7 @@ # define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) #endif -#ifdef CONFIG_X86_INTEL_UMIP +#ifdef CONFIG_X86_UMIP # define DISABLE_UMIP 0 #else # define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index c3aa4b5e49e2..314f75d886d0 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -29,6 +29,14 @@ enum e820_type { E820_TYPE_PRAM = 12, /* + * Special-purpose memory is indicated to the system via the + * EFI_MEMORY_SP attribute. Define an e820 translation of this + * memory type for the purpose of reserving this range and + * marking it with the IORES_DESC_SOFT_RESERVED designation. + */ + E820_TYPE_SOFT_RESERVED = 0xefffffff, + + /* * Reserved RAM used by the kernel itself if * CONFIG_INTEL_TXT=y is enabled, memory of this type * will be included in the S3 integrity calculation diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 43a82e59c59d..d028e9acdf1c 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -140,7 +140,6 @@ extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); extern void efi_recover_from_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -extern void efi_reserve_boot_services(void); struct efi_setup_data { u64 fw_vendor; @@ -244,6 +243,8 @@ static inline bool efi_is_64bit(void) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); +extern void efi_find_mirror(void); +extern void efi_reserve_boot_services(void); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} static inline bool efi_reboot_required(void) @@ -254,6 +255,20 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } +static inline void efi_find_mirror(void) +{ +} +static inline void efi_reserve_boot_services(void) +{ +} #endif /* CONFIG_EFI */ +#ifdef CONFIG_EFI_FAKE_MEMMAP +extern void __init efi_fake_memmap_early(void); +#else +static inline void efi_fake_memmap_early(void) +{ +} +#endif + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/emulate_prefix.h b/arch/x86/include/asm/emulate_prefix.h new file mode 100644 index 000000000000..70f5b98a5286 --- /dev/null +++ b/arch/x86/include/asm/emulate_prefix.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_EMULATE_PREFIX_H +#define _ASM_X86_EMULATE_PREFIX_H + +/* + * Virt escape sequences to trigger instruction emulation; + * ideally these would decode to 'whole' instruction and not destroy + * the instruction stream; sadly this is not true for the 'kvm' one :/ + */ + +#define __XEN_EMULATE_PREFIX 0x0f,0x0b,0x78,0x65,0x6e /* ud2 ; .ascii "xen" */ +#define __KVM_EMULATE_PREFIX 0x0f,0x0b,0x6b,0x76,0x6d /* ud2 ; .ascii "kvm" */ + +#endif diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 0c47aa82e2e2..28183ee3cc42 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -156,7 +156,7 @@ extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); -void native_set_fixmap(enum fixed_addresses idx, +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags); #ifndef CONFIG_PARAVIRT_XXL diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index c38a66661576..c2a7458f912c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,6 +28,19 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +/* + * When a ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, we + * place the direct caller in the ORIG_AX part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + /* Emulate a call */ + regs->orig_ax = addr; +} + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 7741e211f7f5..5f10f7f2098d 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -86,6 +86,8 @@ #define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) /* AccessReenlightenmentControls privilege */ #define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) +/* AccessTscInvariantControls privilege */ +#define HV_X64_ACCESS_TSC_INVARIANT BIT(15) /* * Feature identification: indicates which flags were specified at partition @@ -278,6 +280,9 @@ #define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 #define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +/* TSC invariant control */ +#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 + /* * Declare the MSR used to setup pages used to communicate with the hypervisor. */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 154f27be8bfc..5c1ae3eff9d4 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -45,6 +45,7 @@ struct insn { struct insn_field immediate2; /* for 64bit imm or seg16 */ }; + int emulate_prefix_size; insn_attr_t attr; unsigned char opnd_bytes; unsigned char addr_bytes; @@ -128,6 +129,11 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +static inline int insn_has_emulate_prefix(struct insn *insn) +{ + return !!insn->emulate_prefix_size; +} + /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h new file mode 100644 index 000000000000..02c6ef8f7667 --- /dev/null +++ b/arch/x86/include/asm/io_bitmap.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IOBITMAP_H +#define _ASM_X86_IOBITMAP_H + +#include <linux/refcount.h> +#include <asm/processor.h> + +struct io_bitmap { + u64 sequence; + refcount_t refcnt; + /* The maximum number of bytes to copy so all zero bits are covered */ + unsigned int max; + unsigned long bitmap[IO_BITMAP_LONGS]; +}; + +struct task_struct; + +#ifdef CONFIG_X86_IOPL_IOPERM +void io_bitmap_share(struct task_struct *tsk); +void io_bitmap_exit(void); + +void tss_update_io_bitmap(void); +#else +static inline void io_bitmap_share(struct task_struct *tsk) { } +static inline void io_bitmap_exit(void) { } +static inline void tss_update_io_bitmap(void) { } +#endif + +#endif diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5e7d6b46de97..6802c59e8252 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -66,10 +66,6 @@ struct kimage; # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif -/* Memory to backup during crash kdump */ -#define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ - /* * This function is responsible for capturing register states if coming * via panic otherwise just fix up the ss and sp if coming via kernel @@ -154,12 +150,6 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; - /* Details of backup region */ - unsigned long backup_src_start; - unsigned long backup_src_sz; - - /* Physical address of backup segment */ - unsigned long backup_load_addr; /* Core ELF header buffer */ void *elf_headers; diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 14caa9d9fb7f..365111789cc6 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -13,10 +13,6 @@ #ifdef __ASSEMBLY__ -#define GLOBAL(name) \ - .globl name; \ - name: - #if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) #define __ALIGN .p2align 4, 0x90 #define __ALIGN_STR __stringify(__ALIGN) diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 7948a17febb4..c215d2762488 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -15,6 +15,8 @@ struct mod_arch_specific { #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ +#elif defined CONFIG_M486SX +#define MODULE_PROC_FAMILY "486SX " #elif defined CONFIG_M486 #define MODULE_PROC_FAMILY "486 " #elif defined CONFIG_M586 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 69089d46f128..86e7317eb31f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -294,10 +294,6 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } -static inline void set_iopl_mask(unsigned mask) -{ - PVOP_VCALL1(cpu.set_iopl_mask, mask); -} static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 70b654f3ffe5..84812964d3dd 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -140,8 +140,6 @@ struct pv_cpu_ops { void (*load_sp0)(unsigned long sp0); - void (*set_iopl_mask)(unsigned mask); - void (*wbinvd)(void); /* cpuid emulation, mostly so that caps bits can be disabled */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index e662f987dfa2..90d0731fdcb6 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -12,8 +12,6 @@ #include <asm/pat.h> #include <asm/x86_init.h> -#ifdef __KERNEL__ - struct pci_sysdata { int domain; /* PCI domain */ int node; /* NUMA node */ @@ -118,11 +116,6 @@ void native_restore_msi_irqs(struct pci_dev *dev); #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif -#endif /* __KERNEL__ */ - -#ifdef CONFIG_X86_64 -#include <asm/pci_64.h> -#endif /* generic pci stuff */ #include <asm-generic/pci.h> diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h deleted file mode 100644 index f5411de0ae11..000000000000 --- a/arch/x86/include/asm/pci_64.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_PCI_64_H -#define _ASM_X86_PCI_64_H - -#ifdef __KERNEL__ - -#ifdef CONFIG_CALGARY_IOMMU -static inline void *pci_iommu(struct pci_bus *bus) -{ - struct pci_sysdata *sd = bus->sysdata; - return sd->iommu; -} - -static inline void set_pci_iommu(struct pci_bus *bus, void *val) -{ - struct pci_sysdata *sd = bus->sysdata; - sd->iommu = val; -} -#endif /* CONFIG_CALGARY_IOMMU */ - -extern int (*pci_config_read)(int seg, int bus, int dev, int fn, - int reg, int len, u32 *value); -extern int (*pci_config_write)(int seg, int bus, int dev, int fn, - int reg, int len, u32 value); - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_PCI_64_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index e3633795fb22..5afb5e0fe903 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -36,39 +36,41 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) #define pmd_read_atomic pmd_read_atomic /* - * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with - * a "*pmdp" dereference done by gcc. Problem is, in certain places - * where pte_offset_map_lock is called, concurrent page faults are + * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with + * a "*pmdp" dereference done by GCC. Problem is, in certain places + * where pte_offset_map_lock() is called, concurrent page faults are * allowed, if the mmap_sem is hold for reading. An example is mincore * vs page faults vs MADV_DONTNEED. On the page fault side - * pmd_populate rightfully does a set_64bit, but if we're reading the + * pmd_populate() rightfully does a set_64bit(), but if we're reading the * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen - * because gcc will not read the 64bit of the pmd atomically. To fix - * this all places running pmd_offset_map_lock() while holding the + * because GCC will not read the 64-bit value of the pmd atomically. + * + * To fix this all places running pte_offset_map_lock() while holding the * mmap_sem in read mode, shall read the pmdp pointer using this - * function to know if the pmd is null nor not, and in turn to know if - * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd + * function to know if the pmd is null or not, and in turn to know if + * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd * operations. * - * Without THP if the mmap_sem is hold for reading, the pmd can only - * transition from null to not null while pmd_read_atomic runs. So + * Without THP if the mmap_sem is held for reading, the pmd can only + * transition from null to not null while pmd_read_atomic() runs. So * we can always return atomic pmd values with this function. * - * With THP if the mmap_sem is hold for reading, the pmd can become + * With THP if the mmap_sem is held for reading, the pmd can become * trans_huge or none or point to a pte (and in turn become "stable") - * at any time under pmd_read_atomic. We could read it really - * atomically here with a atomic64_read for the THP enabled case (and + * at any time under pmd_read_atomic(). We could read it truly + * atomically here with an atomic64_read() for the THP enabled case (and * it would be a whole lot simpler), but to avoid using cmpxchg8b we * only return an atomic pmdval if the low part of the pmdval is later - * found stable (i.e. pointing to a pte). And we're returning a none - * pmdval if the low part of the pmd is none. In some cases the high - * and low part of the pmdval returned may not be consistent if THP is - * enabled (the low part may point to previously mapped hugepage, - * while the high part may point to a more recently mapped hugepage), - * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part - * of the pmd to be read atomically to decide if the pmd is unstable - * or not, with the only exception of when the low part of the pmd is - * zero in which case we return a none pmd. + * found to be stable (i.e. pointing to a pte). We are also returning a + * 'none' (zero) pmdval if the low part of the pmd is zero. + * + * In some cases the high and low part of the pmdval returned may not be + * consistent if THP is enabled (the low part may point to previously + * mapped hugepage, while the high part may point to a more recently + * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only + * needs the low part of the pmd to be read atomically to decide if the + * pmd is unstable or not, with the only exception when the low part + * of the pmd is zero, in which case we return a 'none' pmd. */ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index b0bc0fff5f1f..19f5807260c3 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -44,11 +44,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c * to avoid include recursion hell */ -#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 41) -#define CPU_ENTRY_AREA_BASE \ - ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ - & PMD_MASK) +/* The +1 is for the readonly IDT page: */ +#define CPU_ENTRY_AREA_BASE \ + ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK) #define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 54f5d54280f6..e51afbb0cbfb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -7,6 +7,7 @@ /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; +struct io_bitmap; struct vm86; #include <asm/math_emu.h> @@ -93,7 +94,15 @@ struct cpuinfo_x86 { __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; - __u32 x86_capability[NCAPINTS + NBUGINTS]; + /* + * Align to size of unsigned long because the x86_capability array + * is passed to bitops which require the alignment. Use unnamed + * union to enforce the array is aligned to size of unsigned long. + */ + union { + __u32 x86_capability[NCAPINTS + NBUGINTS]; + unsigned long x86_capability_alignment; + }; char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ @@ -328,10 +337,32 @@ struct x86_hw_tss { * IO-bitmap sizes: */ #define IO_BITMAP_BITS 65536 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) -#define INVALID_IO_BITMAP_OFFSET 0x8000 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS / BITS_PER_BYTE) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES / sizeof(long)) + +#define IO_BITMAP_OFFSET_VALID_MAP \ + (offsetof(struct tss_struct, io_bitmap.bitmap) - \ + offsetof(struct tss_struct, x86_tss)) + +#define IO_BITMAP_OFFSET_VALID_ALL \ + (offsetof(struct tss_struct, io_bitmap.mapall) - \ + offsetof(struct tss_struct, x86_tss)) + +#ifdef CONFIG_X86_IOPL_IOPERM +/* + * sizeof(unsigned long) coming from an extra "long" at the end of the + * iobitmap. The limit is inclusive, i.e. the last valid byte. + */ +# define __KERNEL_TSS_LIMIT \ + (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \ + sizeof(unsigned long) - 1) +#else +# define __KERNEL_TSS_LIMIT \ + (offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1) +#endif + +/* Base offset outside of TSS_LIMIT so unpriviledged IO causes #GP */ +#define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1) struct entry_stack { unsigned long words[64]; @@ -341,13 +372,21 @@ struct entry_stack_page { struct entry_stack stack; } __aligned(PAGE_SIZE); -struct tss_struct { +/* + * All IO bitmap related data stored in the TSS: + */ +struct x86_io_bitmap { + /* The sequence number of the last active bitmap. */ + u64 prev_sequence; + /* - * The fixed hardware portion. This must not cross a page boundary - * at risk of violating the SDM's advice and potentially triggering - * errata. + * Store the dirty size of the last io bitmap offender. The next + * one will have to do the cleanup as the switch out to a non io + * bitmap user will just set x86_tss.io_bitmap_base to a value + * outside of the TSS limit. So for sane tasks there is no need to + * actually touch the io_bitmap at all. */ - struct x86_hw_tss x86_tss; + unsigned int prev_max; /* * The extra 1 is there because the CPU will access an @@ -355,21 +394,28 @@ struct tss_struct { * bitmap. The extra byte must be all 1 bits, and must * be within the limit. */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + unsigned long bitmap[IO_BITMAP_LONGS + 1]; + + /* + * Special I/O bitmap to emulate IOPL(3). All bytes zero, + * except the additional byte at the end. + */ + unsigned long mapall[IO_BITMAP_LONGS + 1]; +}; + +struct tss_struct { + /* + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering + * errata. + */ + struct x86_hw_tss x86_tss; + + struct x86_io_bitmap io_bitmap; } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); -/* - * sizeof(unsigned long) coming from an extra "long" at the end - * of the iobitmap. - * - * -1? seg base+limit should be pointing to the address of the - * last valid byte - */ -#define __KERNEL_TSS_LIMIT \ - (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) - /* Per CPU interrupt stacks */ struct irq_stack { char stack[IRQ_STACK_SIZE]; @@ -480,10 +526,14 @@ struct thread_struct { struct vm86 *vm86; #endif /* IO permissions: */ - unsigned long *io_bitmap_ptr; - unsigned long iopl; - /* Max allowed port in the bitmap, in bytes: */ - unsigned io_bitmap_max; + struct io_bitmap *io_bitmap; + + /* + * IOPL. Priviledge level dependent I/O permission which is + * emulated via the I/O bitmap to prevent user space from disabling + * interrupts. + */ + unsigned long iopl_emul; mm_segment_t addr_limit; @@ -515,25 +565,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -/* - * Set IOPL bits in EFLAGS from given mask - */ -static inline void native_set_iopl_mask(unsigned mask) -{ -#ifdef CONFIG_X86_32 - unsigned int reg; - - asm volatile ("pushfl;" - "popl %0;" - "andl %1, %0;" - "orl %2, %0;" - "pushl %0;" - "popfl" - : "=&r" (reg) - : "i" (~X86_EFLAGS_IOPL), "r" (mask)); -#endif -} - static inline void native_load_sp0(unsigned long sp0) { @@ -573,7 +604,6 @@ static inline void load_sp0(unsigned long sp0) native_load_sp0(sp0); } -#define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT_XXL */ /* Free all resources held by a thread. */ @@ -841,7 +871,6 @@ static inline void spin_lock_prefetch(const void *x) #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ - .io_bitmap_ptr = NULL, \ .addr_limit = KERNEL_DS, \ } @@ -958,7 +987,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) extern unsigned long arch_align_stack(unsigned long sp); void free_init_pages(const char *what, unsigned long begin, unsigned long end); -extern void free_kernel_image_pages(void *begin, void *end); +extern void free_kernel_image_pages(const char *what, void *begin, void *end); void default_idle(void); #ifdef CONFIG_XEN diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 332eb3525867..5057a8ed100b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -361,5 +361,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +#ifdef CONFIG_X86_64 +# define do_set_thread_area_64(p, s, t) do_arch_prctl_64(p, s, t) +#else +# define do_set_thread_area_64(p, s, t) (0) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h index 92c34e517da1..5528e9325049 100644 --- a/arch/x86/include/asm/purgatory.h +++ b/arch/x86/include/asm/purgatory.h @@ -6,16 +6,6 @@ #include <linux/purgatory.h> extern void purgatory(void); -/* - * These forward declarations serve two purposes: - * - * 1) Make sparse happy when checking arch/purgatory - * 2) Document that these are required to be global so the symbol - * lookup in kexec works - */ -extern unsigned long purgatory_backup_dest; -extern unsigned long purgatory_backup_src; -extern unsigned long purgatory_backup_sz; #endif /* __ASSEMBLY__ */ #endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h deleted file mode 100644 index 232f856e0db0..000000000000 --- a/arch/x86/include/asm/refcount.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef __ASM_X86_REFCOUNT_H -#define __ASM_X86_REFCOUNT_H -/* - * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from - * PaX/grsecurity. - */ -#include <linux/refcount.h> -#include <asm/bug.h> - -/* - * This is the first portion of the refcount error handling, which lives in - * .text.unlikely, and is jumped to from the CPU flag check (in the - * following macros). This saves the refcount value location into CX for - * the exception handler to use (in mm/extable.c), and then triggers the - * central refcount exception. The fixup address for the exception points - * back to the regular execution flow in .text. - */ -#define _REFCOUNT_EXCEPTION \ - ".pushsection .text..refcount\n" \ - "111:\tlea %[var], %%" _ASM_CX "\n" \ - "112:\t" ASM_UD2 "\n" \ - ASM_UNREACHABLE \ - ".popsection\n" \ - "113:\n" \ - _ASM_EXTABLE_REFCOUNT(112b, 113b) - -/* Trigger refcount exception if refcount result is negative. */ -#define REFCOUNT_CHECK_LT_ZERO \ - "js 111f\n\t" \ - _REFCOUNT_EXCEPTION - -/* Trigger refcount exception if refcount result is zero or negative. */ -#define REFCOUNT_CHECK_LE_ZERO \ - "jz 111f\n\t" \ - REFCOUNT_CHECK_LT_ZERO - -/* Trigger refcount exception unconditionally. */ -#define REFCOUNT_ERROR \ - "jmp 111f\n\t" \ - _REFCOUNT_EXCEPTION - -static __always_inline void refcount_add(unsigned int i, refcount_t *r) -{ - asm volatile(LOCK_PREFIX "addl %1,%0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : "ir" (i) - : "cc", "cx"); -} - -static __always_inline void refcount_inc(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "incl %0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline void refcount_dec(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "decl %0\n\t" - REFCOUNT_CHECK_LE_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline __must_check -bool refcount_sub_and_test(unsigned int i, refcount_t *r) -{ - bool ret = GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "er", i, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) -{ - bool ret = GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check -bool refcount_add_not_zero(unsigned int i, refcount_t *r) -{ - int c, result; - - c = atomic_read(&(r->refs)); - do { - if (unlikely(c == 0)) - return false; - - result = c + i; - - /* Did we try to increment from/to an undesirable state? */ - if (unlikely(c < 0 || c == INT_MAX || result < c)) { - asm volatile(REFCOUNT_ERROR - : : [var] "m" (r->refs.counter) - : "cc", "cx"); - break; - } - - } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); - - return c != 0; -} - -static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) -{ - return refcount_add_not_zero(1, r); -} - -#endif diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h deleted file mode 100644 index 0a21986d2238..000000000000 --- a/arch/x86/include/asm/rio.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Derived from include/asm-x86/mach-summit/mach_mpparse.h - * and include/asm-x86/mach-default/bios_ebda.h - * - * Author: Laurent Vivier <Laurent.Vivier@bull.net> - */ - -#ifndef _ASM_X86_RIO_H -#define _ASM_X86_RIO_H - -#define RIO_TABLE_VERSION 3 - -struct rio_table_hdr { - u8 version; /* Version number of this data structure */ - u8 num_scal_dev; /* # of Scalability devices */ - u8 num_rio_dev; /* # of RIO I/O devices */ -} __attribute__((packed)); - -struct scal_detail { - u8 node_id; /* Scalability Node ID */ - u32 CBAR; /* Address of 1MB register space */ - u8 port0node; /* Node ID port connected to: 0xFF=None */ - u8 port0port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 port1node; /* Node ID port connected to: 0xFF = None */ - u8 port1port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 port2node; /* Node ID port connected to: 0xFF = None */ - u8 port2port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 chassis_num; /* 1 based Chassis number (1 = boot node) */ -} __attribute__((packed)); - -struct rio_detail { - u8 node_id; /* RIO Node ID */ - u32 BBAR; /* Address of 1MB register space */ - u8 type; /* Type of device */ - u8 owner_id; /* Node ID of Hurricane that owns this */ - /* node */ - u8 port0node; /* Node ID port connected to: 0xFF=None */ - u8 port0port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 port1node; /* Node ID port connected to: 0xFF=None */ - u8 port1port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 first_slot; /* Lowest slot number below this Calgary */ - u8 status; /* Bit 0 = 1 : the XAPIC is used */ - /* = 0 : the XAPIC is not used, ie: */ - /* ints fwded to another XAPIC */ - /* Bits1:7 Reserved */ - u8 WP_index; /* instance index - lower ones have */ - /* lower slot numbers/PCI bus numbers */ - u8 chassis_num; /* 1 based Chassis number */ -} __attribute__((packed)); - -enum { - HURR_SCALABILTY = 0, /* Hurricane Scalability info */ - HURR_RIOIB = 2, /* Hurricane RIOIB info */ - COMPAT_CALGARY = 4, /* Compatibility Calgary */ - ALT_CALGARY = 5, /* Second Planar Calgary */ -}; - -#endif /* _ASM_X86_RIO_H */ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 71b32f2570ab..036c360910c5 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -6,7 +6,6 @@ #include <asm/extable.h> extern char __brk_base[], __brk_limit[]; -extern struct exception_table_entry __stop___ex_table[]; extern char __end_rodata_aligned[]; #if defined(CONFIG_X86_64) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index ac3892920419..6669164abadc 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -31,6 +31,18 @@ */ #define SEGMENT_RPL_MASK 0x3 +/* + * When running on Xen PV, the actual privilege level of the kernel is 1, + * not 0. Testing the Requested Privilege Level in a segment selector to + * determine whether the context is user mode or kernel mode with + * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level + * matches the 0x3 mask. + * + * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV + * kernels because privilege level 2 is never used. + */ +#define USER_SEGMENT_RPL_MASK 0x2 + /* User mode is privilege level 3: */ #define USER_RPL 0x3 diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 18a4b6890fa8..0e059b73437b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -103,7 +103,17 @@ static inline void update_task_stack(struct task_struct *task) if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif +} +static inline void kthread_frame_init(struct inactive_task_frame *frame, + unsigned long fun, unsigned long arg) +{ + frame->bx = fun; +#ifdef CONFIG_X86_32 + frame->di = arg; +#else + frame->r12 = arg; +#endif } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index e046a405743d..e2389ce9bf58 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -6,6 +6,8 @@ #ifndef _ASM_X86_SYSCALL_WRAPPER_H #define _ASM_X86_SYSCALL_WRAPPER_H +struct pt_regs; + /* Mapping of registers to parameters for syscalls on x86-64 and x32 */ #define SC_X86_64_REGS_TO_ARGS(x, ...) \ __MAP(x,__SC_ARGS \ @@ -28,13 +30,21 @@ * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this * case as well. */ +#define __IA32_COMPAT_SYS_STUB0(x, name) \ + asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__ia32_compat_sys_##name, ERRNO); \ + asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys_##name(); \ + } + #define __IA32_COMPAT_SYS_STUBx(x, name, ...) \ asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs);\ ALLOW_ERROR_INJECTION(__ia32_compat_sys##name, ERRNO); \ asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs)\ { \ return __se_compat_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ - } \ + } #define __IA32_SYS_STUBx(x, name, ...) \ asmlinkage long __ia32_sys##name(const struct pt_regs *regs); \ @@ -48,16 +58,23 @@ * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias * named __ia32_sys_*() */ -#define SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __x64_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ - SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \ - asmlinkage long __x64_sys_##sname(void) -#define COND_SYSCALL(name) \ - cond_syscall(__x64_sys_##name); \ - cond_syscall(__ia32_sys_##name) +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused) + +#define COND_SYSCALL(name) \ + asmlinkage __weak long __x64_sys_##name(const struct pt_regs *__unused) \ + { \ + return sys_ni_syscall(); \ + } \ + asmlinkage __weak long __ia32_sys_##name(const struct pt_regs *__unused)\ + { \ + return sys_ni_syscall(); \ + } #define SYS_NI(name) \ SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); \ @@ -75,15 +92,24 @@ * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common * with x86_64 obviously do not need such care. */ +#define __X32_COMPAT_SYS_STUB0(x, name, ...) \ + asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__x32_compat_sys_##name, ERRNO); \ + asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys_##name();\ + } + #define __X32_COMPAT_SYS_STUBx(x, name, ...) \ asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs);\ ALLOW_ERROR_INJECTION(__x32_compat_sys##name, ERRNO); \ asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs)\ { \ return __se_compat_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ - } \ + } #else /* CONFIG_X86_X32 */ +#define __X32_COMPAT_SYS_STUB0(x, name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #endif /* CONFIG_X86_X32 */ @@ -94,6 +120,17 @@ * mapping of registers to parameters, we need to generate stubs for each * of them. */ +#define COMPAT_SYSCALL_DEFINE0(name) \ + static long __se_compat_sys_##name(void); \ + static inline long __do_compat_sys_##name(void); \ + __IA32_COMPAT_SYS_STUB0(x, name) \ + __X32_COMPAT_SYS_STUB0(x, name) \ + static long __se_compat_sys_##name(void) \ + { \ + return __do_compat_sys_##name(); \ + } \ + static inline long __do_compat_sys_##name(void) + #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ @@ -181,15 +218,19 @@ * macros to work correctly. */ #ifndef SYSCALL_DEFINE0 -#define SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __x64_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ - asmlinkage long __x64_sys_##sname(void) +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused) #endif #ifndef COND_SYSCALL -#define COND_SYSCALL(name) cond_syscall(__x64_sys_##name) +#define COND_SYSCALL(name) \ + asmlinkage __weak long __x64_sys_##name(const struct pt_regs *__unused) \ + { \ + return sys_ni_syscall(); \ + } #endif #ifndef SYS_NI @@ -201,7 +242,6 @@ * For VSYSCALLS, we need to declare these three syscalls with the new * pt_regs-based calling convention for in-kernel use. */ -struct pt_regs; asmlinkage long __x64_sys_getcpu(const struct pt_regs *regs); asmlinkage long __x64_sys_gettimeofday(const struct pt_regs *regs); asmlinkage long __x64_sys_time(const struct pt_regs *regs); diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h deleted file mode 100644 index 6ed2deacf1d0..000000000000 --- a/arch/x86/include/asm/tce.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file is derived from asm-powerpc/tce.h. - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - * Author: Jon Mason <jdmason@us.ibm.com> - */ - -#ifndef _ASM_X86_TCE_H -#define _ASM_X86_TCE_H - -extern unsigned int specified_table_size; -struct iommu_table; - -#define TCE_ENTRY_SIZE 8 /* in bytes */ - -#define TCE_READ_SHIFT 0 -#define TCE_WRITE_SHIFT 1 -#define TCE_HUBID_SHIFT 2 /* unused */ -#define TCE_RSVD_SHIFT 8 /* unused */ -#define TCE_RPN_SHIFT 12 -#define TCE_UNUSED_SHIFT 48 /* unused */ - -#define TCE_RPN_MASK 0x0000fffffffff000ULL - -extern void tce_build(struct iommu_table *tbl, unsigned long index, - unsigned int npages, unsigned long uaddr, int direction); -extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages); -extern void * __init alloc_tce_table(void); -extern void __init free_tce_table(void *tbl); -extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar); - -#endif /* _ASM_X86_TCE_H */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 5e8319bb207a..23c626a742e8 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -26,10 +26,11 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define POKE_MAX_OPCODE_SIZE 5 struct text_poke_loc { - void *detour; void *addr; - size_t len; - const char opcode[POKE_MAX_OPCODE_SIZE]; + int len; + s32 rel32; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; }; extern void text_poke_early(void *addr, const void *opcode, size_t len); @@ -51,8 +52,10 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); +extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate); extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; @@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } -#define INT3_INSN_SIZE 1 -#define CALL_INSN_SIZE 5 +#define INT3_INSN_SIZE 1 +#define INT3_INSN_OPCODE 0xCC + +#define CALL_INSN_SIZE 5 +#define CALL_INSN_OPCODE 0xE8 + +#define JMP32_INSN_SIZE 5 +#define JMP32_INSN_OPCODE 0xE9 + +#define JMP8_INSN_SIZE 2 +#define JMP8_INSN_OPCODE 0xEB static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f9453536f9bb..d779366ce3f8 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -143,8 +143,8 @@ struct thread_info { _TIF_NOHZ) /* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW_BASE \ - (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \ +#define _TIF_WORK_CTXSW_BASE \ + (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) /* @@ -156,8 +156,14 @@ struct thread_info { # define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE) #endif -#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) +#ifdef CONFIG_X86_IOPL_IOPERM +# define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY | \ + _TIF_IO_BITMAP) +#else +# define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY) +#endif + +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define STACK_WARN (THREAD_SIZE/8) diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index ace464f09681..4d705cb4d63b 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@ -71,6 +71,21 @@ TRACE_EVENT(hyperv_send_ipi_mask, __entry->ncpus, __entry->vector) ); +TRACE_EVENT(hyperv_send_ipi_one, + TP_PROTO(int cpu, + int vector), + TP_ARGS(cpu, vector), + TP_STRUCT__entry( + __field(int, cpu) + __field(int, vector) + ), + TP_fast_assign(__entry->cpu = cpu; + __entry->vector = vector; + ), + TP_printk("cpu %d vector %x", + __entry->cpu, __entry->vector) + ); + #endif /* CONFIG_HYPERV */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h index db43f2a0d92c..aeed98c3c9e1 100644 --- a/arch/x86/include/asm/umip.h +++ b/arch/x86/include/asm/umip.h @@ -4,9 +4,9 @@ #include <linux/types.h> #include <asm/ptrace.h> -#ifdef CONFIG_X86_INTEL_UMIP +#ifdef CONFIG_X86_UMIP bool fixup_umip_exception(struct pt_regs *regs); #else static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; } -#endif /* CONFIG_X86_INTEL_UMIP */ +#endif /* CONFIG_X86_UMIP */ #endif /* _ASM_X86_UMIP_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 0bcdb1279361..f5e2eb12cb71 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -86,6 +86,14 @@ UNWIND_HINT sp_offset=\sp_offset .endm +.macro UNWIND_HINT_SAVE + UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE +.endm + +.macro UNWIND_HINT_RESTORE + UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +.endm + #else /* !__ASSEMBLY__ */ #define UNWIND_HINT(sp_reg, sp_offset, type, end) \ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 6e7caf65fa40..389174eaec79 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -138,7 +138,7 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); -extern void uv_bios_init(void); +extern int uv_bios_init(void); extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 6bc6d89d8e2a..45ea95ce79b4 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -12,6 +12,16 @@ struct mm_struct; #ifdef CONFIG_X86_UV #include <linux/efi.h> +#define UV_PROC_NODE "sgi_uv" + +static inline int uv(int uvtype) +{ + /* uv(0) is "any" */ + if (uvtype >= 0 && uvtype <= 30) + return 1 << uvtype; + return 1; +} + extern unsigned long uv_systab_phys; extern enum uv_system_type get_uv_system_type(void); @@ -20,7 +30,8 @@ static inline bool is_early_uv_system(void) return uv_systab_phys && uv_systab_phys != EFI_INVALID_TABLE_ADDR; } extern int is_uv_system(void); -extern int is_uv_hubless(void); +extern int is_uv_hubbed(int uvtype); +extern int is_uv_hubless(int uvtype); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -32,7 +43,8 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline bool is_early_uv_system(void) { return 0; } static inline int is_uv_system(void) { return 0; } -static inline int is_uv_hubless(void) { return 0; } +static inline int is_uv_hubbed(int uv) { return 0; } +static inline int is_uv_hubless(int uv) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 44cf6d6deb7a..950cd1395d5d 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -19,6 +19,7 @@ #include <linux/topology.h> #include <asm/types.h> #include <asm/percpu.h> +#include <asm/uv/uv.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/bios.h> #include <asm/irq_vectors.h> @@ -243,83 +244,61 @@ static inline int uv_hub_info_check(int version) #define UV4_HUB_REVISION_BASE 7 #define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ -#ifdef UV1_HUB_IS_SUPPORTED +/* WARNING: UVx_HUB_IS_SUPPORTED defines are deprecated and will be removed */ static inline int is_uv1_hub(void) { - return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE; -} +#ifdef UV1_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(1)); #else -static inline int is_uv1_hub(void) -{ return 0; -} #endif +} -#ifdef UV2_HUB_IS_SUPPORTED static inline int is_uv2_hub(void) { - return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) && - (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE)); -} +#ifdef UV2_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(2)); #else -static inline int is_uv2_hub(void) -{ return 0; -} #endif +} -#ifdef UV3_HUB_IS_SUPPORTED static inline int is_uv3_hub(void) { - return ((uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE) && - (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)); -} +#ifdef UV3_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(3)); #else -static inline int is_uv3_hub(void) -{ return 0; -} #endif +} /* First test "is UV4A", then "is UV4" */ -#ifdef UV4A_HUB_IS_SUPPORTED -static inline int is_uv4a_hub(void) -{ - return (uv_hub_info->hub_revision >= UV4A_HUB_REVISION_BASE); -} -#else static inline int is_uv4a_hub(void) { +#ifdef UV4A_HUB_IS_SUPPORTED + if (is_uv_hubbed(uv(4))) + return (uv_hub_info->hub_revision == UV4A_HUB_REVISION_BASE); +#endif return 0; } -#endif -#ifdef UV4_HUB_IS_SUPPORTED static inline int is_uv4_hub(void) { - return uv_hub_info->hub_revision >= UV4_HUB_REVISION_BASE; -} +#ifdef UV4_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(4)); #else -static inline int is_uv4_hub(void) -{ return 0; -} #endif +} static inline int is_uvx_hub(void) { - if (uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) - return uv_hub_info->hub_revision; - - return 0; + return (is_uv_hubbed(-2) >= uv(2)); } static inline int is_uv_hub(void) { -#ifdef UV1_HUB_IS_SUPPORTED - return uv_hub_info->hub_revision; -#endif - return is_uvx_hub(); + return is_uv1_hub() || is_uvx_hub(); } union uvh_apicid { diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 42e1245af0d8..ff4b52e37e60 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,6 +62,4 @@ void xen_arch_register_cpu(int num); void xen_arch_unregister_cpu(int num); #endif -extern void xen_set_iopl_mask(unsigned mask); - #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 62ca03ef5c65..9139b3e86316 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -379,12 +379,9 @@ struct xen_pmu_arch { * Prefix forces emulation of some non-trapping instructions. * Currently only CPUID. */ -#ifdef __ASSEMBLY__ -#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ; -#define XEN_CPUID XEN_EMULATE_PREFIX cpuid -#else -#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; " -#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid" -#endif +#include <asm/emulate_prefix.h> + +#define XEN_EMULATE_PREFIX __ASM_FORM(.byte __XEN_EMULATE_PREFIX ;) +#define XEN_CPUID XEN_EMULATE_PREFIX __ASM_FORM(cpuid) #endif /* _ASM_X86_XEN_INTERFACE_H */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index c895df5482c5..8669c6bdbb84 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H -/* setup_data types */ +/* setup_data/setup_indirect types */ #define SETUP_NONE 0 #define SETUP_E820_EXT 1 #define SETUP_DTB 2 @@ -11,6 +11,11 @@ #define SETUP_APPLE_PROPERTIES 5 #define SETUP_JAILHOUSE 6 +#define SETUP_INDIRECT (1<<31) + +/* SETUP_INDIRECT | max(SETUP_*) */ +#define SETUP_TYPE_MAX (SETUP_INDIRECT | SETUP_JAILHOUSE) + /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 @@ -49,6 +54,14 @@ struct setup_data { __u8 data[0]; }; +/* extensible setup indirect data node */ +struct setup_indirect { + __u32 type; + __u32 reserved; /* Reserved, must be set to zero. */ + __u64 len; + __u64 addr; +}; + struct setup_header { __u8 setup_sects; __u16 root_flags; @@ -88,6 +101,7 @@ struct setup_header { __u64 pref_address; __u32 init_size; __u32 handover_offset; + __u32 kernel_info_offset; } __attribute__((packed)); struct sys_desc_table { @@ -139,15 +153,22 @@ struct boot_e820_entry { * setup data structure. */ struct jailhouse_setup_data { - __u16 version; - __u16 compatible_version; - __u16 pm_timer_address; - __u16 num_cpus; - __u64 pci_mmconfig_base; - __u32 tsc_khz; - __u32 apic_khz; - __u8 standard_ioapic; - __u8 cpu_ids[255]; + struct { + __u16 version; + __u16 compatible_version; + } __attribute__((packed)) hdr; + struct { + __u16 pm_timer_address; + __u16 num_cpus; + __u64 pci_mmconfig_base; + __u32 tsc_khz; + __u32 apic_khz; + __u8 standard_ioapic; + __u8 cpu_ids[255]; + } __attribute__((packed)) v1; + struct { + __u32 flags; + } __attribute__((packed)) v2; } __attribute__((packed)); /* The so-called "zeropage" */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3578ad248bc9..32acb970f416 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -134,7 +134,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_X86_INTEL_UMIP) += umip.o +obj-$(CONFIG_X86_UMIP) += umip.o obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o @@ -146,7 +146,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o - obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index e95e95960156..daf88f8143c5 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -9,8 +9,7 @@ .code32 ALIGN -ENTRY(wakeup_pmode_return) -wakeup_pmode_return: +SYM_CODE_START(wakeup_pmode_return) movw $__KERNEL_DS, %ax movw %ax, %ss movw %ax, %fs @@ -39,6 +38,7 @@ wakeup_pmode_return: # jump to place where we left off movl saved_eip, %eax jmp *%eax +SYM_CODE_END(wakeup_pmode_return) bogus_magic: jmp bogus_magic @@ -72,7 +72,7 @@ restore_registers: popfl ret -ENTRY(do_suspend_lowlevel) +SYM_CODE_START(do_suspend_lowlevel) call save_processor_state call save_registers pushl $3 @@ -87,10 +87,11 @@ ret_point: call restore_registers call restore_processor_state ret +SYM_CODE_END(do_suspend_lowlevel) .data ALIGN -ENTRY(saved_magic) .long 0 +SYM_DATA(saved_magic, .long 0) saved_eip: .long 0 # saved registers diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 7f9ade13bbcf..c8daa92f38dc 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -14,7 +14,7 @@ /* * Hooray, we are in Long 64-bit mode (but still running in low memory) */ -ENTRY(wakeup_long64) +SYM_FUNC_START(wakeup_long64) movq saved_magic, %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax @@ -40,9 +40,9 @@ ENTRY(wakeup_long64) movq saved_rip, %rax jmp *%rax -ENDPROC(wakeup_long64) +SYM_FUNC_END(wakeup_long64) -ENTRY(do_suspend_lowlevel) +SYM_FUNC_START(do_suspend_lowlevel) FRAME_BEGIN subq $8, %rsp xorl %eax, %eax @@ -125,7 +125,7 @@ ENTRY(do_suspend_lowlevel) addq $8, %rsp FRAME_END jmp restore_processor_state -ENDPROC(do_suspend_lowlevel) +SYM_FUNC_END(do_suspend_lowlevel) .data saved_rbp: .quad 0 @@ -136,4 +136,4 @@ saved_rbx: .quad 0 saved_rip: .quad 0 saved_rsp: .quad 0 -ENTRY(saved_magic) .quad 0 +SYM_DATA(saved_magic, .quad 0) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9d3a971ea364..9ec463fe96f2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -956,16 +956,15 @@ NOKPROBE_SYMBOL(patch_cmp); int poke_int3_handler(struct pt_regs *regs) { struct text_poke_loc *tp; - unsigned char int3 = 0xcc; void *ip; /* * Having observed our INT3 instruction, we now must observe * bp_patching.nr_entries. * - * nr_entries != 0 INT3 - * WMB RMB - * write INT3 if (nr_entries) + * nr_entries != 0 INT3 + * WMB RMB + * write INT3 if (nr_entries) * * Idem for other elements in bp_patching. */ @@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *regs) return 0; /* - * Discount the sizeof(int3). See text_poke_bp_batch(). + * Discount the INT3. See text_poke_bp_batch(). */ - ip = (void *) regs->ip - sizeof(int3); + ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. @@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *regs) return 0; } - /* set up the specified breakpoint detour */ - regs->ip = (unsigned long) tp->detour; + ip += tp->len; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + return 0; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->rel32); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->rel32); + break; + + default: + BUG(); + } return 1; } @@ -1014,7 +1033,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * synchronization using int3 breakpoint. * * The way it is done: - * - For each entry in the vector: + * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: @@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler); */ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - int patched_all_but_first = 0; - unsigned char int3 = 0xcc; + unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; + int do_sync; lockdep_assert_held(&text_mutex); @@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) /* * Second step: update all but the first byte of the patched range. */ - for (i = 0; i < nr_entries; i++) { + for (do_sync = 0, i = 0; i < nr_entries; i++) { if (tp[i].len - sizeof(int3) > 0) { text_poke((char *)tp[i].addr + sizeof(int3), - (const char *)tp[i].opcode + sizeof(int3), + (const char *)tp[i].text + sizeof(int3), tp[i].len - sizeof(int3)); - patched_all_but_first++; + do_sync++; } } - if (patched_all_but_first) { + if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But @@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ - for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); + for (do_sync = 0, i = 0; i < nr_entries; i++) { + if (tp[i].text[0] == INT3_INSN_OPCODE) + continue; + + text_poke(tp[i].addr, tp[i].text, sizeof(int3)); + do_sync++; + } + + if (do_sync) + on_each_cpu(do_sync_core, NULL, 1); - on_each_cpu(do_sync_core, NULL, 1); /* * sync_core() implies an smp_mb() and orders this store against * the writing of the new instruction. @@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) bp_patching.nr_entries = 0; } +void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + + if (!opcode) + opcode = (void *)tp->text; + else + memcpy((void *)tp->text, opcode, len); + + if (!emulate) + emulate = opcode; + + kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); + insn_get_length(&insn); + + BUG_ON(!insn_complete(&insn)); + BUG_ON(len != insn.length); + + tp->addr = addr; + tp->len = len; + tp->opcode = insn.opcode.bytes[0]; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + tp->rel32 = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, ideal_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->rel32 = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->rel32 = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch @@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp = { - .detour = handler, - .addr = addr, - .len = len, - }; - - if (len > POKE_MAX_OPCODE_SIZE) { - WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); - return; - } - - memcpy((void *)tp.opcode, opcode, len); + struct text_poke_loc tp; + text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a6ac3712db8b..4bbccb9d16dc 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -510,10 +510,9 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - pr_warning( - "PCI-DMA: Warning: Small IOMMU %luMB." + pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", - iommu_size >> 20); + iommu_size >> 20); } return iommu_size; @@ -665,8 +664,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" - "falling back to iommu=soft.\n"); + pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n"); return -1; } @@ -733,8 +731,8 @@ int __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); - pr_warning("falling back to iommu=soft.\n"); + pr_warn("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warn("falling back to iommu=soft.\n"); } return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2b0faf86da1b..28446fa6bf18 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -780,8 +780,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warning("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n",(long)res); + pr_warn("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -977,7 +977,7 @@ static int __init calibrate_APIC_clock(void) */ if (lapic_timer_period < (1000000 / HZ)) { local_irq_enable(); - pr_warning("APIC frequency too slow, disabling apic timer\n"); + pr_warn("APIC frequency too slow, disabling apic timer\n"); return -1; } @@ -1021,7 +1021,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - pr_warning("APIC timer disabled due to verification failure\n"); + pr_warn("APIC timer disabled due to verification failure\n"); return -1; } @@ -1095,8 +1095,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", - smp_processor_id()); + pr_warn("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1811,11 +1811,11 @@ static int __init setup_nox2apic(char *str) int apicid = native_apic_msr_read(APIC_ID); if (apicid >= 255) { - pr_warning("Apicid: %08x, cannot enforce nox2apic\n", - apicid); + pr_warn("Apicid: %08x, cannot enforce nox2apic\n", + apicid); return 0; } - pr_warning("x2apic already enabled.\n"); + pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } setup_clear_cpu_cap(X86_FEATURE_X2APIC); @@ -1983,7 +1983,7 @@ static int __init apic_verify(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); + pr_warn("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -2337,7 +2337,7 @@ static int cpuid_to_apicid[] = { #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread - * @id: APIC ID to check + * @apicid: APIC ID to check */ bool apic_id_is_primary_thread(unsigned int apicid) { @@ -2410,9 +2410,8 @@ int generic_processor_info(int apicid, int version) disabled_cpu_apicid == apicid) { int thiscpu = num_processors + disabled_cpus; - pr_warning("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", - thiscpu, apicid); + pr_warn("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2426,8 +2425,7 @@ int generic_processor_info(int apicid, int version) apicid != boot_cpu_physical_apicid) { int thiscpu = max + disabled_cpus - 1; - pr_warning( - "APIC: NR_CPUS/possible_cpus limit of %i almost" + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2438,9 +2436,8 @@ int generic_processor_info(int apicid, int version) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " + "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2470,13 +2467,13 @@ int generic_processor_info(int apicid, int version) * Validate version */ if (version == 0x0) { - pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); + pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); version = 0x10; } if (version != boot_cpu_apic_version) { - pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", boot_cpu_apic_version, cpu, version); } @@ -2845,7 +2842,7 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_VERBOSE; #ifdef CONFIG_X86_64 else { - pr_warning("APIC Verbosity level %s not recognised" + pr_warn("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d6af97fd170a..913c88617848 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1725,19 +1725,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { - /* If we are moving the irq we need to mask it */ + /* If we are moving the IRQ we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic_irq(data); + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { - if (unlikely(masked)) { + if (unlikely(moveit)) { /* Only migrate the irq if the ack has been received. * * On rare occasions the broadcast level triggered ack gets @@ -1766,15 +1767,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) */ if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { } #endif @@ -1783,11 +1786,11 @@ static void ioapic_ack_level(struct irq_data *irq_data) { struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; - bool masked; + bool moveit; int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(irq_data); + moveit = ioapic_prepare_move(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -1842,7 +1845,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(irq_data, masked); + ioapic_finish_move(irq_data, moveit); } static void ioapic_ir_ack_level(struct irq_data *irq_data) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e6230af19864..d5b51a740524 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -14,6 +14,8 @@ #include <linux/memory.h> #include <linux/export.h> #include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/efi.h> #include <asm/e820/api.h> #include <asm/uv/uv_mmrs.h> @@ -25,12 +27,17 @@ static DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; -static bool uv_hubless_system; +static int uv_hubbed_system; +static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; +/* Unpack OEM/TABLE ID's to be NULL terminated strings */ +static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; +static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; + /* Information derived from CPUID: */ static struct { unsigned int apicid_shift; @@ -248,17 +255,35 @@ static void __init uv_set_apicid_hibit(void) } } -static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static void __init uv_stringify(int len, char *to, char *from) +{ + /* Relies on 'to' being NULL chars so result will be NULL terminated */ + strncpy(to, from, len-1); +} + +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) { int pnodeid; int uv_apic; + uv_stringify(sizeof(oem_id), oem_id, _oem_id); + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + if (strncmp(oem_id, "SGI", 3) != 0) { - if (strncmp(oem_id, "NSGI", 4) == 0) { - uv_hubless_system = true; - pr_info("UV: OEM IDs %s/%s, HUBLESS\n", - oem_id, oem_table_id); - } + if (strncmp(oem_id, "NSGI", 4) != 0) + return 0; + + /* UV4 Hubless, CH, (0x11:UV4+Any) */ + if (strncmp(oem_id, "NSGI4", 5) == 0) + uv_hubless_system = 0x11; + + /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */ + else + uv_hubless_system = 0x9; + + pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n", + oem_id, oem_table_id, uv_hubless_system); + return 0; } @@ -286,6 +311,24 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (uv_hub_info->hub_revision == 0) goto badbios; + switch (uv_hub_info->hub_revision) { + case UV4_HUB_REVISION_BASE: + uv_hubbed_system = 0x11; + break; + + case UV3_HUB_REVISION_BASE: + uv_hubbed_system = 0x9; + break; + + case UV2_HUB_REVISION_BASE: + uv_hubbed_system = 0x5; + break; + + case UV1_HUB_REVISION_BASE: + uv_hubbed_system = 0x3; + break; + } + pnodeid = early_get_pnodeid(); early_get_apic_socketid_shift(); @@ -336,9 +379,15 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); -int is_uv_hubless(void) +int is_uv_hubbed(int uvtype) +{ + return (uv_hubbed_system & uvtype); +} +EXPORT_SYMBOL_GPL(is_uv_hubbed); + +int is_uv_hubless(int uvtype) { - return uv_hubless_system; + return (uv_hubless_system & uvtype); } EXPORT_SYMBOL_GPL(is_uv_hubless); @@ -1255,7 +1304,8 @@ static int __init decode_uv_systab(void) struct uv_systab *st; int i; - if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + /* If system is uv3 or lower, there is no extended UVsystab */ + if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4)) return 0; /* No extended UVsystab required */ st = uv_systab; @@ -1434,6 +1484,103 @@ static void __init build_socket_tables(void) } } +/* Check which reboot to use */ +static void check_efi_reboot(void) +{ + /* If EFI reboot not available, use ACPI reboot */ + if (!efi_enabled(EFI_BOOT)) + reboot_type = BOOT_ACPI; +} + +/* Setup user proc fs files */ +static int proc_hubbed_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubbed_system); + return 0; +} + +static int proc_hubless_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubless_system); + return 0; +} + +static int proc_oemid_show(struct seq_file *file, void *data) +{ + seq_printf(file, "%s/%s\n", oem_id, oem_table_id); + return 0; +} + +static int proc_hubbed_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_hubbed_show, (void *)NULL); +} + +static int proc_hubless_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_hubless_show, (void *)NULL); +} + +static int proc_oemid_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_oemid_show, (void *)NULL); +} + +/* (struct is "non-const" as open function is set at runtime) */ +static struct file_operations proc_version_fops = { + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations proc_oemid_fops = { + .open = proc_oemid_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init void uv_setup_proc_files(int hubless) +{ + struct proc_dir_entry *pde; + char *name = hubless ? "hubless" : "hubbed"; + + pde = proc_mkdir(UV_PROC_NODE, NULL); + proc_create("oemid", 0, pde, &proc_oemid_fops); + proc_create(name, 0, pde, &proc_version_fops); + if (hubless) + proc_version_fops.open = proc_hubless_open; + else + proc_version_fops.open = proc_hubbed_open; +} + +/* Initialize UV hubless systems */ +static __init int uv_system_init_hubless(void) +{ + int rc; + + /* Setup PCH NMI handler */ + uv_nmi_setup_hubless(); + + /* Init kernel/BIOS interface */ + rc = uv_bios_init(); + if (rc < 0) + return rc; + + /* Process UVsystab */ + rc = decode_uv_systab(); + if (rc < 0) + return rc; + + /* Create user access node */ + if (rc >= 0) + uv_setup_proc_files(1); + + check_efi_reboot(); + + return rc; +} + static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; @@ -1559,32 +1706,27 @@ static void __init uv_system_init_hub(void) uv_nmi_setup(); uv_cpu_init(); uv_scir_register_cpu_notifier(); - proc_mkdir("sgi_uv", NULL); + uv_setup_proc_files(0); /* Register Legacy VGA I/O redirection handler: */ pci_register_set_vga_state(uv_set_vga_state); - /* - * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as - * EFI is not enabled in the kdump kernel: - */ - if (is_kdump_kernel()) - reboot_type = BOOT_ACPI; + check_efi_reboot(); } /* - * There is a small amount of UV specific code needed to initialize a - * UV system that does not have a "UV HUB" (referred to as "hubless"). + * There is a different code path needed to initialize a UV system that does + * not have a "UV HUB" (referred to as "hubless"). */ void __init uv_system_init(void) { - if (likely(!is_uv_system() && !is_uv_hubless())) + if (likely(!is_uv_system() && !is_uv_hubless(1))) return; if (is_uv_system()) uv_system_init_hub(); else - uv_nmi_setup_hubless(); + uv_system_init_hubless(); } apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4c7b0fa15a19..8bf64899f56a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,7 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ @@ -108,6 +109,12 @@ void __init check_bugs(void) mds_select_mitigation(); taa_select_mitigation(); + /* + * As MDS and TAA mitigations are inter-related, print MDS + * mitigation until after TAA mitigation selection is done. + */ + mds_print_mitigation(); + arch_smt_update(); #ifdef CONFIG_X86_32 @@ -245,6 +252,12 @@ static void __init mds_select_mitigation(void) (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } +} + +static void __init mds_print_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -304,8 +317,12 @@ static void __init taa_select_mitigation(void) return; } - /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ - if (taa_mitigation == TAA_MITIGATION_OFF) + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) goto out; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) @@ -339,6 +356,15 @@ static void __init taa_select_mitigation(void) if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); + /* + * Update MDS mitigation, if necessary, as the mds_user_clear is + * now enabled for TAA mitigation. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } out: pr_info("%s\n", taa_strings[taa_mitigation]); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fffe21945374..baa2fed8deb6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -53,10 +53,7 @@ #include <asm/microcode_intel.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> - -#ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> -#endif #include "cpu.h" @@ -565,8 +562,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); void load_percpu_segment(int cpu) { @@ -1780,7 +1778,7 @@ static void wait_for_master_cpu(int cpu) } #ifdef CONFIG_X86_64 -static void setup_getcpu(int cpu) +static inline void setup_getcpu(int cpu) { unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; @@ -1800,7 +1798,59 @@ static void setup_getcpu(int cpu) write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); } + +static inline void ucode_cpu_init(int cpu) +{ + if (cpu) + load_ucode_ap(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) +{ + /* Set up the per-CPU TSS IST stacks */ + tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); +} + +static inline void gdt_setup_doublefault_tss(int cpu) { } + +#else /* CONFIG_X86_64 */ + +static inline void setup_getcpu(int cpu) { } + +static inline void ucode_cpu_init(int cpu) +{ + show_ucode_info_early(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) { } + +static inline void gdt_setup_doublefault_tss(int cpu) +{ +#ifdef CONFIG_DOUBLEFAULT + /* Set up the doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif +} +#endif /* !CONFIG_X86_64 */ + +static inline void tss_setup_io_bitmap(struct tss_struct *tss) +{ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + +#ifdef CONFIG_X86_IOPL_IOPERM + tss->io_bitmap.prev_max = 0; + tss->io_bitmap.prev_sequence = 0; + memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); + /* + * Invalidate the extra array entry past the end of the all + * permission bitmap as required by the hardware. + */ + tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; #endif +} /* * cpu_init() initializes state that is per-CPU. Some data is already @@ -1808,21 +1858,15 @@ static void setup_getcpu(int cpu) * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. */ -#ifdef CONFIG_X86_64 - void cpu_init(void) { + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct task_struct *cur = current; int cpu = raw_smp_processor_id(); - struct task_struct *me; - struct tss_struct *t; - int i; wait_for_master_cpu(cpu); - if (cpu) - load_ucode_ap(); - - t = &per_cpu(cpu_tss_rw, cpu); + ucode_cpu_init(cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1831,63 +1875,47 @@ void cpu_init(void) #endif setup_getcpu(cpu); - me = current; - pr_debug("Initializing CPU#%d\n", cpu); - cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ - switch_to_new_gdt(cpu); - loadsegment(fs, 0); - load_current_idt(); - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); + if (IS_ENABLED(CONFIG_X86_64)) { + loadsegment(fs, 0); + memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); - x86_configure_nx(); - x2apic_setup(); + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); - /* - * set up and load the per-CPU TSS - */ - if (!t->x86_tss.ist[0]) { - t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); - t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); - t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); - t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + x2apic_setup(); } - t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - mmgrab(&init_mm); - me->active_mm = &init_mm; - BUG_ON(me->mm); + cur->active_mm = &init_mm; + BUG_ON(cur->mm); initialize_tlbstate_and_flush(); - enter_lazy_tlb(&init_mm, me); + enter_lazy_tlb(&init_mm, cur); - /* - * Initialize the TSS. sp0 points to the entry trampoline stack - * regardless of what task is running. - */ + /* Initialize the TSS. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); + /* + * sp0 points to the entry trampoline stack regardless of what task + * is running. + */ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); @@ -1895,6 +1923,8 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); + gdt_setup_doublefault_tss(cpu); + fpu__init_cpu(); if (is_uv_system()) @@ -1903,63 +1933,6 @@ void cpu_init(void) load_fixmap_gdt(cpu); } -#else - -void cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); - - wait_for_master_cpu(cpu); - - show_ucode_info_early(); - - pr_info("Initializing CPU#%d\n", cpu); - - if (cpu_feature_enabled(X86_FEATURE_VME) || - boot_cpu_has(X86_FEATURE_TSC) || - boot_cpu_has(X86_FEATURE_DE)) - cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - load_current_idt(); - switch_to_new_gdt(cpu); - - /* - * Set up and load the per-CPU TSS and LDT - */ - mmgrab(&init_mm); - curr->active_mm = &init_mm; - BUG_ON(curr->mm); - initialize_tlbstate_and_flush(); - enter_lazy_tlb(&init_mm, curr); - - /* - * Initialize the TSS. sp0 points to the entry trampoline stack - * regardless of what task is running. - */ - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); - load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); - - load_mm_ldt(&init_mm); - - t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - -#ifdef CONFIG_DOUBLEFAULT - /* Set up doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -#endif - - clear_all_debug_regs(); - dbg_restore_debug_regs(); - - fpu__init_cpu(); - - load_fixmap_gdt(cpu); -} -#endif - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 11d5c5950e2d..4a900804a023 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -819,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, @@ -847,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, { 0x00, 0, 0 } }; @@ -859,8 +859,8 @@ static void intel_tlb_lookup(const unsigned char desc) return; /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && \ - intel_tlb_table[k].descriptor != 0; k++) + for (k = 0; intel_tlb_table[k].descriptor != desc && + intel_tlb_table[k].descriptor != 0; k++) ; if (intel_tlb_table[k].tlb_type == 0) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index c656d92cd708..caa032ce3fe3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -290,7 +290,12 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - mark_tsc_unstable("running on Hyper-V"); + if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + } else { + mark_tsc_unstable("running on Hyper-V"); + } /* * Generation 2 instances don't support reading the NMI status from diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 5c900f9527ff..c4be62058dd9 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -29,7 +29,8 @@ __setup("nordrand", x86_rdrand_setup); #ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned long tmp; + unsigned int changed = 0; + unsigned long tmp, prev; int i; if (!cpu_has(c, X86_FEATURE_RDRAND)) @@ -42,5 +43,24 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } + + /* + * Stupid sanity-check whether RDRAND does *actually* generate + * some at least random-looking data. + */ + prev = tmp; + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (rdrand_long(&tmp)) { + if (prev != tmp) + changed++; + + prev = tmp; + } + } + + if (WARN_ON_ONCE(!changed)) + pr_emerg( +"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); + } #endif diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index eb651fbde92a..00fc55ac7ffa 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/memblock.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -39,6 +40,7 @@ #include <asm/virtext.h> #include <asm/intel_pt.h> #include <asm/crash.h> +#include <asm/cmdline.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -68,6 +70,19 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) rcu_read_unlock(); } +/* + * When the crashkernel option is specified, only use the low + * 1M for the real mode trampoline. + */ +void __init crash_reserve_low_1M(void) +{ + if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) + return; + + memblock_reserve(0, 1<<20); + pr_info("Reserving the low 1M of memory for crashkernel\n"); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -173,8 +188,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_KEXEC_FILE -static unsigned long crash_zero_bytes; - static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -189,8 +202,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) unsigned int nr_ranges = 0; struct crash_mem *cmem; - walk_system_ram_res(0, -1, &nr_ranges, - get_nr_ram_ranges_callback); + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); if (!nr_ranges) return NULL; @@ -217,15 +229,19 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) { int ret = 0; + /* Exclude the low 1M because it is always reserved */ + ret = crash_exclude_mem_range(cmem, 0, 1<<20); + if (ret) + return ret; + /* Exclude crashkernel region */ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); if (ret) return ret; - if (crashk_low_res.end) { + if (crashk_low_res.end) ret = crash_exclude_mem_range(cmem, crashk_low_res.start, - crashk_low_res.end); - } + crashk_low_res.end); return ret; } @@ -246,16 +262,13 @@ static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz) { struct crash_mem *cmem; - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - int ret, i; + int ret; cmem = fill_up_crash_elf_data(); if (!cmem) return -ENOMEM; - ret = walk_system_ram_res(0, -1, cmem, - prepare_elf64_ram_headers_callback); + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); if (ret) goto out; @@ -265,24 +278,8 @@ static int prepare_elf_headers(struct kimage *image, void **addr, goto out; /* By default prepare 64bit headers */ - ret = crash_prepare_elf64_headers(cmem, - IS_ENABLED(CONFIG_X86_64), addr, sz); - if (ret) - goto out; + ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); - /* - * If a range matches backup region, adjust offset to backup - * segment. - */ - ehdr = (Elf64_Ehdr *)*addr; - phdr = (Elf64_Phdr *)(ehdr + 1); - for (i = 0; i < ehdr->e_phnum; phdr++, i++) - if (phdr->p_type == PT_LOAD && - phdr->p_paddr == image->arch.backup_src_start && - phdr->p_memsz == image->arch.backup_src_sz) { - phdr->p_offset = image->arch.backup_load_addr; - break; - } out: vfree(cmem); return ret; @@ -296,8 +293,7 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(¶ms->e820_table[nr_e820_entries], entry, - sizeof(struct e820_entry)); + memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -321,19 +317,11 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, unsigned long long mend) { unsigned long start, end; - int ret = 0; cmem->ranges[0].start = mstart; cmem->ranges[0].end = mend; cmem->nr_ranges = 1; - /* Exclude Backup region */ - start = image->arch.backup_load_addr; - end = start + image->arch.backup_src_sz - 1; - ret = crash_exclude_mem_range(cmem, start, end); - if (ret) - return ret; - /* Exclude elf header region */ start = image->arch.elf_load_addr; end = start + image->arch.elf_headers_sz - 1; @@ -356,28 +344,28 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; - /* Add first 640K segment */ - ei.addr = image->arch.backup_src_start; - ei.size = image->arch.backup_src_sz; - ei.type = E820_TYPE_RAM; - add_e820_entry(params, &ei); + /* Add the low 1M */ + cmd.type = E820_TYPE_RAM; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd, + memmap_entry_callback); /* Add ACPI tables */ cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add e820 reserved ranges */ cmd.type = E820_TYPE_RESERVED; flags = IORESOURCE_MEM; walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add crashk_low_res region */ if (crashk_low_res.end) { @@ -388,8 +376,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) } /* Exclude some ranges from crashk_res and add rest to memmap */ - ret = memmap_exclude_ranges(image, cmem, crashk_res.start, - crashk_res.end); + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end); if (ret) goto out; @@ -409,55 +396,12 @@ out: return ret; } -static int determine_backup_region(struct resource *res, void *arg) -{ - struct kimage *image = arg; - - image->arch.backup_src_start = res->start; - image->arch.backup_src_sz = resource_size(res); - - /* Expecting only one range for backup region */ - return 1; -} - int crash_load_segments(struct kimage *image) { int ret; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ULONG_MAX, .top_down = false }; - /* - * Determine and load a segment for backup area. First 640K RAM - * region is backup source - */ - - ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, - image, determine_backup_region); - - /* Zero or postive return values are ok */ - if (ret < 0) - return ret; - - /* Add backup segment. */ - if (image->arch.backup_src_sz) { - kbuf.buffer = &crash_zero_bytes; - kbuf.bufsz = sizeof(crash_zero_bytes); - kbuf.memsz = image->arch.backup_src_sz; - kbuf.buf_align = PAGE_SIZE; - /* - * Ideally there is no source for backup segment. This is - * copied in purgatory after crash. Just add a zero filled - * segment for now to make sure checksum logic works fine. - */ - ret = kexec_add_buffer(&kbuf); - if (ret) - return ret; - image->arch.backup_load_addr = kbuf.mem; - pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, - image->arch.backup_src_start, kbuf.memsz); - } - /* Prepare elf headers and add a segment */ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0b8cedb20d6d..0d6c657593f8 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -54,7 +54,7 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .sp0 = STACK_START, .ss0 = __KERNEL_DS, .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, .ip = (unsigned long) doublefault_fn, /* 0x2 bit is always set */ @@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .ss = __KERNEL_DS, .ds = __USER_DS, .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif .__cr3 = __pa_nodebug(swapper_pg_dir), }; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7da2bcd2b8eb..c5399e80c59c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type) case E820_TYPE_RAM: /* Fall through: */ case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; @@ -999,6 +1000,17 @@ void __init e820__reserve_setup_data(void) data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + e820__range_update(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } + pa_data = data->next; early_memunmap(data, sizeof(*data)); } @@ -1037,6 +1049,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; case E820_TYPE_PMEM: return "Persistent Memory"; case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; default: return "Unknown E820 type"; } } @@ -1052,6 +1065,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) case E820_TYPE_PRAM: /* Fall-through: */ case E820_TYPE_PMEM: /* Fall-through: */ case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ default: return IORESOURCE_MEM; } } @@ -1064,6 +1078,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ @@ -1078,11 +1093,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) return true; /* - * Treat persistent memory like device memory, i.e. reserve it - * for exclusive use of a driver + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver */ switch (type) { case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; @@ -1285,6 +1301,9 @@ void __init e820__memblock_setup(void) if (end != (resource_size_t)end) continue; + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e5cb67d67c03..319be936c348 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -60,7 +60,7 @@ u64 xfeatures_mask __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -254,10 +254,13 @@ static void __init setup_xstate_features(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_offsets[0] = 0; - xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); - xstate_offsets[1] = xstate_sizes[0]; - xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = FIELD_SIZEOF(struct fxregs_state, + xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) @@ -342,7 +345,7 @@ static int xfeature_is_aligned(int xfeature_nr) */ static void __init setup_xstate_comp(void) { - unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + unsigned int xstate_comp_sizes[XFEATURE_MAX]; int i; /* @@ -350,8 +353,9 @@ static void __init setup_xstate_comp(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); + xstate_comp_offsets[XFEATURE_FP] = 0; + xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, + xmm_space); if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { @@ -840,7 +844,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that pcntxt_mask is + * have not enabled. Remember that xfeatures_mask is * what we write to the XCR0 register. */ WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 024c3053dbba..060a361d9d11 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1043,6 +1043,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, return; /* + * If the return location is actually pointing directly to + * the start of a direct trampoline (if we trace the trampoline + * it will still be offset by MCOUNT_INSN_SIZE), then the + * return address is actually off by one word, and we + * need to adjust for that. + */ + if (ftrace_direct_func_count) { + if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { + self_addr = *parent; + parent++; + } + } + + /* * Protect against fault, even if it shouldn't * happen. This tool is too much intrusive to * ignore such a protection. diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 073aab525d80..e8a9f8370112 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -12,20 +12,18 @@ #include <asm/frame.h> #include <asm/asm-offsets.h> -# define function_hook __fentry__ -EXPORT_SYMBOL(__fentry__) - #ifdef CONFIG_FRAME_POINTER # define MCOUNT_FRAME 1 /* using frame = true */ #else # define MCOUNT_FRAME 0 /* using frame = false */ #endif -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) ret -END(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) -ENTRY(ftrace_caller) +SYM_CODE_START(ftrace_caller) #ifdef CONFIG_FRAME_POINTER /* @@ -85,11 +83,11 @@ ftrace_graph_call: #endif /* This is weak to keep gas from relaxing the jumps */ -WEAK(ftrace_stub) +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) ret -END(ftrace_caller) +SYM_CODE_END(ftrace_caller) -ENTRY(ftrace_regs_caller) +SYM_CODE_START(ftrace_regs_caller) /* * We're here from an mcount/fentry CALL, and the stack frame looks like: * @@ -138,7 +136,7 @@ ENTRY(ftrace_regs_caller) movl function_trace_op, %ecx # 3rd argument: ftrace_pos pushl %esp # 4th argument: pt_regs -GLOBAL(ftrace_regs_call) +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub addl $4, %esp # skip 4th argument @@ -163,9 +161,10 @@ GLOBAL(ftrace_regs_call) popl %eax jmp .Lftrace_ret +SYM_CODE_END(ftrace_regs_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) +SYM_CODE_START(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx @@ -179,7 +178,7 @@ ENTRY(ftrace_graph_caller) popl %ecx popl %eax ret -END(ftrace_graph_caller) +SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 809d54397dba..369e61faacfe 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -14,9 +14,6 @@ .code64 .section .entry.text, "ax" -# define function_hook __fentry__ -EXPORT_SYMBOL(__fentry__) - #ifdef CONFIG_FRAME_POINTER /* Save parent and function stack frames (rip and rbp) */ # define MCOUNT_FRAME_SIZE (8+16*2) @@ -88,6 +85,7 @@ EXPORT_SYMBOL(__fentry__) movq %rdi, RDI(%rsp) movq %r8, R8(%rsp) movq %r9, R9(%rsp) + movq $0, ORIG_RAX(%rsp) /* * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. @@ -114,7 +112,11 @@ EXPORT_SYMBOL(__fentry__) subq $MCOUNT_INSN_SIZE, %rdi .endm -.macro restore_mcount_regs +.macro restore_mcount_regs save=0 + + /* ftrace_regs_caller or frame pointers require this */ + movq RBP(%rsp), %rbp + movq R9(%rsp), %r9 movq R8(%rsp), %r8 movq RDI(%rsp), %rdi @@ -123,31 +125,29 @@ EXPORT_SYMBOL(__fentry__) movq RCX(%rsp), %rcx movq RAX(%rsp), %rax - /* ftrace_regs_caller can modify %rbp */ - movq RBP(%rsp), %rbp - - addq $MCOUNT_REG_SIZE, %rsp + addq $MCOUNT_REG_SIZE-\save, %rsp .endm #ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) retq -ENDPROC(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) -ENTRY(ftrace_caller) +SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs -GLOBAL(ftrace_caller_op_ptr) +SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx -GLOBAL(ftrace_call) +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub restore_mcount_regs @@ -157,10 +157,10 @@ GLOBAL(ftrace_call) * think twice before adding any new code or changing the * layout here. */ -GLOBAL(ftrace_epilogue) +SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) jmp ftrace_stub #endif @@ -168,19 +168,21 @@ GLOBAL(ftrace_graph_call) * This is weak to keep gas from relaxing the jumps. * It is also used to copy the retq for trampolines. */ -WEAK(ftrace_stub) +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) retq -ENDPROC(ftrace_caller) +SYM_FUNC_END(ftrace_caller) -ENTRY(ftrace_regs_caller) +SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq + UNWIND_HINT_SAVE + /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ -GLOBAL(ftrace_regs_caller_op_ptr) +SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx @@ -209,7 +211,7 @@ GLOBAL(ftrace_regs_caller_op_ptr) /* regs go into 4th parameter */ leaq (%rsp), %rcx -GLOBAL(ftrace_regs_call) +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub /* Copy flags back to SS, to restore them */ @@ -228,7 +230,33 @@ GLOBAL(ftrace_regs_call) movq R10(%rsp), %r10 movq RBX(%rsp), %rbx - restore_mcount_regs + movq ORIG_RAX(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE-8(%rsp) + + /* If ORIG_RAX is anything but zero, make this a call to that */ + movq ORIG_RAX(%rsp), %rax + cmpq $0, %rax + je 1f + + /* Swap the flags with orig_rax */ + movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + + jmp 2f + +1: restore_mcount_regs + + +2: + /* + * The stack layout is nondetermistic here, depending on which path was + * taken. This confuses objtool and ORC, rightfully so. For now, + * pretend the stack always looks like the non-direct case. + */ + UNWIND_HINT_RESTORE /* Restore flags */ popfq @@ -239,16 +267,16 @@ GLOBAL(ftrace_regs_call) * The trampoline will add the code to jump * to the return. */ -GLOBAL(ftrace_regs_caller_end) +SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue -ENDPROC(ftrace_regs_caller) +SYM_FUNC_END(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -261,7 +289,7 @@ fgraph_trace: jnz ftrace_graph_caller #endif -GLOBAL(ftrace_stub) +SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) retq trace: @@ -279,11 +307,12 @@ trace: restore_mcount_regs jmp fgraph_trace -ENDPROC(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) +SYM_FUNC_START(ftrace_graph_caller) /* Saves rbp into %rdx and fills first parameter */ save_mcount_regs @@ -294,9 +323,9 @@ ENTRY(ftrace_graph_caller) restore_mcount_regs retq -ENDPROC(ftrace_graph_caller) +SYM_FUNC_END(ftrace_graph_caller) -ENTRY(return_to_handler) +SYM_CODE_START(return_to_handler) UNWIND_HINT_EMPTY subq $24, %rsp @@ -312,5 +341,5 @@ ENTRY(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC %rdi -END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30f9cb2c0b55..3923ab4630d7 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -64,7 +64,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * can. */ __HEAD -ENTRY(startup_32) +SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -156,7 +156,7 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(xen_entry) +SYM_INNER_LABEL_ALIGN(xen_entry, SYM_L_WEAK) /* Unknown implementation; there's really nothing we can do at this point. */ ud2a @@ -172,6 +172,7 @@ num_subarch_entries = (. - subarch_entries) / 4 #else jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ +SYM_CODE_END(startup_32) #ifdef CONFIG_HOTPLUG_CPU /* @@ -179,12 +180,12 @@ num_subarch_entries = (. - subarch_entries) / 4 * up already except stack. We just set up stack here. Then call * start_secondary(). */ -ENTRY(start_cpu0) +SYM_FUNC_START(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp call *(initial_code) 1: jmp 1b -ENDPROC(start_cpu0) +SYM_FUNC_END(start_cpu0) #endif /* @@ -195,7 +196,7 @@ ENDPROC(start_cpu0) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ -ENTRY(startup_32_smp) +SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax movl %eax,%ds @@ -362,7 +363,7 @@ ENTRY(startup_32_smp) call *(initial_code) 1: jmp 1b -ENDPROC(startup_32_smp) +SYM_FUNC_END(startup_32_smp) #include "verify_cpu.S" @@ -392,7 +393,7 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handler_array) +SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip @@ -407,9 +408,9 @@ ENTRY(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) +SYM_FUNC_END(early_idt_handler_array) -early_idt_handler_common: +SYM_CODE_START_LOCAL(early_idt_handler_common) /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -460,10 +461,10 @@ early_idt_handler_common: decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret -ENDPROC(early_idt_handler_common) +SYM_CODE_END(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ -ENTRY(early_ignore_irq) +SYM_FUNC_START(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -498,19 +499,16 @@ ENTRY(early_ignore_irq) hlt_loop: hlt jmp hlt_loop -ENDPROC(early_ignore_irq) +SYM_FUNC_END(early_ignore_irq) __INITDATA .align 4 -GLOBAL(early_recursion_flag) - .long 0 +SYM_DATA(early_recursion_flag, .long 0) __REFDATA .align 4 -ENTRY(initial_code) - .long i386_start_kernel -ENTRY(setup_once_ref) - .long setup_once +SYM_DATA(initial_code, .long i386_start_kernel) +SYM_DATA(setup_once_ref, .long setup_once) #ifdef CONFIG_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) @@ -553,7 +551,7 @@ EXPORT_SYMBOL(empty_zero_page) __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PGD_ALIGN -ENTRY(initial_page_table) +SYM_DATA_START(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 @@ -571,17 +569,28 @@ ENTRY(initial_page_table) # error "Kernel PMDs should be 1, 2 or 3" # endif .align PAGE_SIZE /* needs to be page-sized too */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * PTI needs another page so sync_initial_pagetable() works correctly + * and does not scribble over the data which is placed behind the + * actual initial_page_table. See clone_pgd_range(). + */ + .fill 1024, 4, 0 +#endif + +SYM_DATA_END(initial_page_table) #endif .data .balign 4 -ENTRY(initial_stack) - /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel - * unwinder reliably detect the end of the stack. - */ - .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ - TOP_OF_KERNEL_STACK_PADDING; +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, + .long init_thread_union + THREAD_SIZE - + SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) __INITRODATA int_msg: @@ -597,27 +606,28 @@ int_msg: */ .data -.globl boot_gdt_descr - ALIGN # early boot GDT descriptor (must use 1:1 address mapping) .word 0 # 32 bit align gdt_desc.address -boot_gdt_descr: +SYM_DATA_START_LOCAL(boot_gdt_descr) .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET +SYM_DATA_END(boot_gdt_descr) # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -ENTRY(early_gdt_descr) +SYM_DATA_START(early_gdt_descr) .word GDT_ENTRIES*8-1 .long gdt_page /* Overwritten for secondary CPUs */ +SYM_DATA_END(early_gdt_descr) /* * The boot_gdt must mirror the equivalent in setup.S and is * used only for booting. */ .align L1_CACHE_BYTES -ENTRY(boot_gdt) +SYM_DATA_START(boot_gdt) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +SYM_DATA_END(boot_gdt) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index f3d3e9646a99..4bbc770af632 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -49,8 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .text __HEAD .code64 - .globl startup_64 -startup_64: +SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, @@ -90,7 +89,9 @@ startup_64: /* Form the CR3 value being sure to include the CR3 modifier */ addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f -ENTRY(secondary_startup_64) +SYM_CODE_END(startup_64) + +SYM_CODE_START(secondary_startup_64) UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, @@ -240,7 +241,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -END(secondary_startup_64) +SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" @@ -250,30 +251,28 @@ END(secondary_startup_64) * up already except stack. We just set up stack here. Then call * start_secondary() via .Ljump_to_C_code. */ -ENTRY(start_cpu0) +SYM_CODE_START(start_cpu0) UNWIND_HINT_EMPTY movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code -END(start_cpu0) +SYM_CODE_END(start_cpu0) #endif /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 - GLOBAL(initial_code) - .quad x86_64_start_kernel - GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(fixed_percpu_data) - GLOBAL(initial_stack) - /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel - * unwinder reliably detect the end of the stack. - */ - .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS +SYM_DATA(initial_code, .quad x86_64_start_kernel) +SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) + +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS) __FINITDATA __INIT -ENTRY(early_idt_handler_array) +SYM_CODE_START(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 @@ -289,9 +288,9 @@ ENTRY(early_idt_handler_array) .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr UNWIND_HINT_IRET_REGS offset=16 -END(early_idt_handler_array) +SYM_CODE_END(early_idt_handler_array) -early_idt_handler_common: +SYM_CODE_START_LOCAL(early_idt_handler_common) /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -333,17 +332,11 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) jmp restore_regs_and_return_to_kernel -END(early_idt_handler_common) +SYM_CODE_END(early_idt_handler_common) - __INITDATA - .balign 4 -GLOBAL(early_recursion_flag) - .long 0 - -#define NEXT_PAGE(name) \ - .balign PAGE_SIZE; \ -GLOBAL(name) +#define SYM_DATA_START_PAGE_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) #ifdef CONFIG_PAGE_TABLE_ISOLATION /* @@ -358,11 +351,11 @@ GLOBAL(name) */ #define PTI_USER_PGD_FILL 512 /* This ensures they are 8k-aligned: */ -#define NEXT_PGD_PAGE(name) \ - .balign 2 * PAGE_SIZE; \ -GLOBAL(name) +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE) #else -#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_DATA_START_PAGE_ALIGNED(name) #define PTI_USER_PGD_FILL 0 #endif @@ -375,17 +368,23 @@ GLOBAL(name) .endr __INITDATA -NEXT_PGD_PAGE(early_top_pgt) + .balign 4 + +SYM_DATA_START_PTI_ALIGNED(early_top_pgt) .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(early_top_pgt) -NEXT_PAGE(early_dynamic_pgts) +SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +SYM_DATA_END(early_dynamic_pgts) + +SYM_DATA(early_recursion_flag, .long 0) .data #if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH) -NEXT_PGD_PAGE(init_top_pgt) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -393,11 +392,13 @@ NEXT_PGD_PAGE(init_top_pgt) /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) -NEXT_PAGE(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 -NEXT_PAGE(level2_ident_pgt) +SYM_DATA_END(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt) /* * Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. @@ -407,25 +408,29 @@ NEXT_PAGE(level2_ident_pgt) * the CPU should ignore the bit. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +SYM_DATA_END(level2_ident_pgt) #else -NEXT_PGD_PAGE(init_top_pgt) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) #endif #ifdef CONFIG_X86_5LEVEL -NEXT_PAGE(level4_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level4_kernel_pgt) #endif -NEXT_PAGE(level3_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level3_kernel_pgt) -NEXT_PAGE(level2_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable * anyway. @@ -442,8 +447,9 @@ NEXT_PAGE(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) +SYM_DATA_END(level2_kernel_pgt) -NEXT_PAGE(level2_fixmap_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 pgtno = 0 .rept (FIXMAP_PMD_NUM) @@ -453,31 +459,32 @@ NEXT_PAGE(level2_fixmap_pgt) .endr /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 +SYM_DATA_END(level2_fixmap_pgt) -NEXT_PAGE(level1_fixmap_pgt) +SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) .fill 512,8,0 .endr +SYM_DATA_END(level1_fixmap_pgt) #undef PMDS .data .align 16 - .globl early_gdt_descr -early_gdt_descr: - .word GDT_ENTRIES*8-1 -early_gdt_descr_base: - .quad INIT_PER_CPU_VAR(gdt_page) - -ENTRY(phys_base) - /* This must match the first entry in level2_kernel_pgt */ - .quad 0x0000000000000000 + +SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1) +SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page)) + + .align 16 +/* This must match the first entry in level2_kernel_pgt */ +SYM_DATA(phys_base, .quad 0x0) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS -NEXT_PAGE(empty_zero_page) +SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) .skip PAGE_SIZE +SYM_DATA_END(empty_zero_page) EXPORT_SYMBOL(empty_zero_page) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 61a89d3c0382..8abeee0dd7bf 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -3,32 +3,69 @@ * This contains the io-permission bitmap code - written by obz, with changes * by Linus. 32/64 bits code unification by Miguel Botón. */ - -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/kernel.h> #include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> #include <linux/security.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> #include <linux/syscalls.h> #include <linux/bitmap.h> -#include <asm/syscalls.h> +#include <linux/ioport.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/io_bitmap.h> #include <asm/desc.h> +#ifdef CONFIG_X86_IOPL_IOPERM + +static atomic64_t io_bitmap_sequence; + +void io_bitmap_share(struct task_struct *tsk) +{ + /* Can be NULL when current->thread.iopl_emul == 3 */ + if (current->thread.io_bitmap) { + /* + * Take a refcount on current's bitmap. It can be used by + * both tasks as long as none of them changes the bitmap. + */ + refcount_inc(¤t->thread.io_bitmap->refcnt); + tsk->thread.io_bitmap = current->thread.io_bitmap; + } + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); +} + +static void task_update_io_bitmap(void) +{ + struct thread_struct *t = ¤t->thread; + + if (t->iopl_emul == 3 || t->io_bitmap) { + /* TSS update is handled on exit to user space */ + set_thread_flag(TIF_IO_BITMAP); + } else { + clear_thread_flag(TIF_IO_BITMAP); + /* Invalidate TSS */ + preempt_disable(); + tss_update_io_bitmap(); + preempt_enable(); + } +} + +void io_bitmap_exit(void) +{ + struct io_bitmap *iobm = current->thread.io_bitmap; + + current->thread.io_bitmap = NULL; + task_update_io_bitmap(); + if (iobm && refcount_dec_and_test(&iobm->refcnt)) + kfree(iobm); +} + /* - * this changes the io permissions bitmap in the current task. + * This changes the io permissions bitmap in the current task. */ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) { struct thread_struct *t = ¤t->thread; - struct tss_struct *tss; - unsigned int i, max_long, bytes, bytes_updated; + unsigned int i, max_long; + struct io_bitmap *iobm; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; @@ -41,59 +78,72 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ - if (!t->io_bitmap_ptr) { - unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - - if (!bitmap) + iobm = t->io_bitmap; + if (!iobm) { + /* No point to allocate a bitmap just to clear permissions */ + if (!turn_on) + return 0; + iobm = kmalloc(sizeof(*iobm), GFP_KERNEL); + if (!iobm) return -ENOMEM; - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); + memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap)); + refcount_set(&iobm->refcnt, 1); + } - /* - * Now that we have an IO bitmap, we need our TSS limit to be - * correct. It's fine if we are preempted after doing this: - * with TIF_IO_BITMAP set, context switches will keep our TSS - * limit correct. - */ - preempt_disable(); - refresh_tss_limit(); - preempt_enable(); + /* + * If the bitmap is not shared, then nothing can take a refcount as + * current can obviously not fork at the same time. If it's shared + * duplicate it and drop the refcount on the original one. + */ + if (refcount_read(&iobm->refcnt) > 1) { + iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL); + if (!iobm) + return -ENOMEM; + refcount_set(&iobm->refcnt, 1); + io_bitmap_exit(); } /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: + * Store the bitmap pointer (might be the same if the task already + * head one). Must be done here so freeing the bitmap when all + * permissions are dropped has the pointer set up. */ - tss = &per_cpu(cpu_tss_rw, get_cpu()); + t->io_bitmap = iobm; + /* Mark it active for context switching and exit to user mode */ + set_thread_flag(TIF_IO_BITMAP); + /* + * Update the tasks bitmap. The update of the TSS bitmap happens on + * exit to user mode. So this needs no protection. + */ if (turn_on) - bitmap_clear(t->io_bitmap_ptr, from, num); + bitmap_clear(iobm->bitmap, from, num); else - bitmap_set(t->io_bitmap_ptr, from, num); + bitmap_set(iobm->bitmap, from, num); /* * Search for a (possibly new) maximum. This is simple and stupid, * to keep it obviously correct: */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) + max_long = UINT_MAX; + for (i = 0; i < IO_BITMAP_LONGS; i++) { + if (iobm->bitmap[i] != ~0UL) max_long = i; + } + /* All permissions dropped? */ + if (max_long == UINT_MAX) { + io_bitmap_exit(); + return 0; + } - bytes = (max_long + 1) * sizeof(unsigned long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + iobm->max = (max_long + 1) * sizeof(unsigned long); - put_cpu(); + /* + * Update the sequence number to force a TSS update on return to + * user mode. + */ + iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); return 0; } @@ -104,38 +154,61 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) } /* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. + * The sys_iopl functionality depends on the level argument, which if + * granted for the task is used to enable access to all 65536 I/O ports. + * + * This does not use the IOPL mechanism provided by the CPU as that would + * also allow the user space task to use the CLI/STI instructions. * - * Here we just change the flags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. + * Disabling interrupts in a user space task is dangerous as it might lock + * up the machine and the semantics vs. syscalls and exceptions is + * undefined. + * + * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3 + * 3 enables them. + * + * IOPL is strictly per thread and inherited on fork. */ SYSCALL_DEFINE1(iopl, unsigned int, level) { - struct pt_regs *regs = current_pt_regs(); struct thread_struct *t = ¤t->thread; - - /* - * Careful: the IOPL bits in regs->flags are undefined under Xen PV - * and changing them has no effect. - */ - unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + unsigned int old; if (level > 3) return -EINVAL; + + old = t->iopl_emul; + + /* No point in going further if nothing changes */ + if (level == old) + return 0; + /* Trying to gain more privileges? */ if (level > old) { if (!capable(CAP_SYS_RAWIO) || security_locked_down(LOCKDOWN_IOPORT)) return -EPERM; } - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | - (level << X86_EFLAGS_IOPL_BIT); - t->iopl = level << X86_EFLAGS_IOPL_BIT; - set_iopl_mask(t->iopl); + + t->iopl_emul = level; + task_update_io_bitmap(); return 0; } + +#else /* CONFIG_X86_IOPL_IOPERM */ + +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return -ENOSYS; +} +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return -ENOSYS; +} + +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + return -ENOSYS; +} +#endif diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index ddeeaac8adda..0db0375235b4 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,20 +7,20 @@ /* * unsigned long native_save_fl(void) */ -ENTRY(native_save_fl) +SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX ret -ENDPROC(native_save_fl) +SYM_FUNC_END(native_save_fl) EXPORT_SYMBOL(native_save_fl) /* * void native_restore_fl(unsigned long flags) * %eax/%rdi: flags */ -ENTRY(native_restore_fl) +SYM_FUNC_START(native_restore_fl) push %_ASM_ARG1 popf ret -ENDPROC(native_restore_fl) +SYM_FUNC_END(native_restore_fl) EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 3ad34f01de2a..6eb8b50ea07e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -11,6 +11,7 @@ #include <linux/acpi_pmtmr.h> #include <linux/kernel.h> #include <linux/reboot.h> +#include <linux/serial_8250.h> #include <asm/apic.h> #include <asm/cpu.h> #include <asm/hypervisor.h> @@ -21,9 +22,24 @@ #include <asm/setup.h> #include <asm/jailhouse_para.h> -static __initdata struct jailhouse_setup_data setup_data; +static struct jailhouse_setup_data setup_data; +#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1)) +#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2)) + static unsigned int precalibrated_tsc_khz; +static void jailhouse_setup_irq(unsigned int irq) +{ + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + .srcbusirq = irq, + .dstirq = irq, + }; + mp_save_irq(&mp_irq); +} + static uint32_t jailhouse_cpuid_base(void) { if (boot_cpu_data.cpuid_level < 0 || @@ -45,7 +61,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now) static void __init jailhouse_timer_init(void) { - lapic_timer_period = setup_data.apic_khz * (1000 / HZ); + lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ); } static unsigned long jailhouse_get_tsc(void) @@ -77,33 +93,28 @@ static void __init jailhouse_get_smp_config(unsigned int early) .type = IOAPIC_DOMAIN_STRICT, .ops = &mp_ioapic_irqdomain_ops, }; - struct mpc_intsrc mp_irq = { - .type = MP_INTSRC, - .irqtype = mp_INT, - .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, - }; unsigned int cpu; jailhouse_x2apic_init(); register_lapic_address(0xfee00000); - for (cpu = 0; cpu < setup_data.num_cpus; cpu++) { - generic_processor_info(setup_data.cpu_ids[cpu], + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { + generic_processor_info(setup_data.v1.cpu_ids[cpu], boot_cpu_apic_version); } smp_found_config = 1; - if (setup_data.standard_ioapic) { + if (setup_data.v1.standard_ioapic) { mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); - /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ - mp_irq.srcbusirq = mp_irq.dstirq = 3; - mp_save_irq(&mp_irq); - - mp_irq.srcbusirq = mp_irq.dstirq = 4; - mp_save_irq(&mp_irq); + if (IS_ENABLED(CONFIG_SERIAL_8250) && + setup_data.hdr.version < 2) { + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + jailhouse_setup_irq(3); + jailhouse_setup_irq(4); + } } } @@ -126,9 +137,9 @@ static int __init jailhouse_pci_arch_init(void) pcibios_last_bus = 0xff; #ifdef CONFIG_PCI_MMCONFIG - if (setup_data.pci_mmconfig_base) { + if (setup_data.v1.pci_mmconfig_base) { pci_mmconfig_add(0, 0, pcibios_last_bus, - setup_data.pci_mmconfig_base); + setup_data.v1.pci_mmconfig_base); pci_mmcfg_arch_init(); } #endif @@ -136,9 +147,57 @@ static int __init jailhouse_pci_arch_init(void) return 0; } +#ifdef CONFIG_SERIAL_8250 +static inline bool jailhouse_uart_enabled(unsigned int uart_nr) +{ + return setup_data.v2.flags & BIT(uart_nr); +} + +static void jailhouse_serial_fixup(int port, struct uart_port *up, + u32 *capabilities) +{ + static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8}; + unsigned int n; + + for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) { + if (pcuart_base[n] != up->iobase) + continue; + + if (jailhouse_uart_enabled(n)) { + pr_info("Enabling UART%u (port 0x%lx)\n", n, + up->iobase); + jailhouse_setup_irq(up->irq); + } else { + /* Deactivate UART if access isn't allowed */ + up->iobase = 0; + } + break; + } +} + +static void __init jailhouse_serial_workaround(void) +{ + /* + * There are flags inside setup_data that indicate availability of + * platform UARTs since setup data version 2. + * + * In case of version 1, we don't know which UARTs belong Linux. In + * this case, unconditionally register 1:1 mapping for legacy UART IRQs + * 3 and 4. + */ + if (setup_data.hdr.version > 1) + serial8250_set_isa_configurator(jailhouse_serial_fixup); +} +#else /* !CONFIG_SERIAL_8250 */ +static inline void jailhouse_serial_workaround(void) +{ +} +#endif /* CONFIG_SERIAL_8250 */ + static void __init jailhouse_init_platform(void) { u64 pa_data = boot_params.hdr.setup_data; + unsigned long setup_data_len; struct setup_data header; void *mapping; @@ -163,16 +222,8 @@ static void __init jailhouse_init_platform(void) memcpy(&header, mapping, sizeof(header)); early_memunmap(mapping, sizeof(header)); - if (header.type == SETUP_JAILHOUSE && - header.len >= sizeof(setup_data)) { - pa_data += offsetof(struct setup_data, data); - - mapping = early_memremap(pa_data, sizeof(setup_data)); - memcpy(&setup_data, mapping, sizeof(setup_data)); - early_memunmap(mapping, sizeof(setup_data)); - + if (header.type == SETUP_JAILHOUSE) break; - } pa_data = header.next; } @@ -180,13 +231,28 @@ static void __init jailhouse_init_platform(void) if (!pa_data) panic("Jailhouse: No valid setup data found"); - if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) - panic("Jailhouse: Unsupported setup data structure"); - - pmtmr_ioport = setup_data.pm_timer_address; + /* setup data must at least contain the header */ + if (header.len < sizeof(setup_data.hdr)) + goto unsupported; + + pa_data += offsetof(struct setup_data, data); + setup_data_len = min_t(unsigned long, sizeof(setup_data), + (unsigned long)header.len); + mapping = early_memremap(pa_data, setup_data_len); + memcpy(&setup_data, mapping, setup_data_len); + early_memunmap(mapping, setup_data_len); + + if (setup_data.hdr.version == 0 || + setup_data.hdr.compatible_version != + JAILHOUSE_SETUP_REQUIRED_VERSION || + (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) || + (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN)) + goto unsupported; + + pmtmr_ioport = setup_data.v1.pm_timer_address; pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); - precalibrated_tsc_khz = setup_data.tsc_khz; + precalibrated_tsc_khz = setup_data.v1.tsc_khz; setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); pci_probe = 0; @@ -196,6 +262,12 @@ static void __init jailhouse_init_platform(void) * are none in a non-root cell. */ disable_acpi(); + + jailhouse_serial_workaround(); + return; + +unsupported: + panic("Jailhouse: Unsupported setup data structure"); } bool jailhouse_paravirt(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 044053235302..c1a8b9e71408 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -89,8 +89,7 @@ static void __ref __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, - (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL); } void arch_jump_label_transform(struct jump_entry *entry, @@ -147,11 +146,9 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } __jump_label_set_jump_code(entry, type, - (union jump_code_union *) &tp->opcode, 0); + (union jump_code_union *)&tp->text, 0); - tp->addr = entry_code; - tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; - tp->len = JUMP_LABEL_NOP_SIZE; + text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL); tp_vec_nr++; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index edaa30b20841..64b6da95af98 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -44,7 +44,12 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, if (count > node->len - pos) count = node->len - pos; - pa = node->paddr + sizeof(struct setup_data) + pos; + pa = node->paddr + pos; + + /* Is it direct data or invalid indirect one? */ + if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT) + pa += sizeof(struct setup_data); + p = memremap(pa, count, MEMREMAP_WB); if (!p) return -ENOMEM; @@ -108,9 +113,17 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - node->paddr = pa_data; - node->type = data->type; - node->len = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + node->paddr = ((struct setup_indirect *)data->data)->addr; + node->type = ((struct setup_indirect *)data->data)->type; + node->len = ((struct setup_indirect *)data->data)->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + create_setup_data_node(d, no, node); pa_data = data->next; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 43fc13c831af..4f13af7cbcdb 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -351,6 +351,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) kernel_insn_init(insn, dest, MAX_INSN_SIZE); insn_get_length(insn); + /* We can not probe force emulate prefixed instruction */ + if (insn_has_emulate_prefix(insn)) + return 0; + /* Another subsystem puts a breakpoint, failed to recover */ if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b348dd506d58..8900329c28a7 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -437,8 +437,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = RELATIVEJUMP_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL); list_del_init(&op->list); } @@ -448,12 +447,18 @@ void arch_optimize_kprobes(struct list_head *oplist) void arch_unoptimize_kprobe(struct optimized_kprobe *op) { u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 emulate_buff[RELATIVEJUMP_SIZE]; /* Set int3 to first byte for kprobes */ insn_buff[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + emulate_buff[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + emulate_buff); } /* diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 7969da939213..d0a19121c6a4 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -100,7 +100,12 @@ static int __init get_setup_data_size(int nr, size_t *size) if (!data) return -ENOMEM; if (nr == i) { - *size = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + *size = ((struct setup_indirect *)data->data)->len; + else + *size = data->len; + memunmap(data); return 0; } @@ -130,7 +135,10 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - ret = sprintf(buf, "0x%x\n", data->type); + if (data->type == SETUP_INDIRECT) + ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); + else + ret = sprintf(buf, "0x%x\n", data->type); memunmap(data); return ret; } @@ -142,7 +150,7 @@ static ssize_t setup_data_data_read(struct file *fp, loff_t off, size_t count) { int nr, ret = 0; - u64 paddr; + u64 paddr, len; struct setup_data *data; void *p; @@ -157,19 +165,28 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (off > data->len) { + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + paddr = ((struct setup_indirect *)data->data)->addr; + len = ((struct setup_indirect *)data->data)->len; + } else { + paddr += sizeof(*data); + len = data->len; + } + + if (off > len) { ret = -EINVAL; goto out; } - if (count > data->len - off) - count = data->len - off; + if (count > len - off) + count = len - off; if (!count) goto out; ret = count; - p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); + p = memremap(paddr, len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5dcd438ad8f2..16e125a50b33 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -298,48 +298,6 @@ static void load_segments(void) ); } -#ifdef CONFIG_KEXEC_FILE -/* Update purgatory as needed after various image segments have been prepared */ -static int arch_update_purgatory(struct kimage *image) -{ - int ret = 0; - - if (!image->file_mode) - return 0; - - /* Setup copying of backup region */ - if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_dest", - &image->arch.backup_load_addr, - sizeof(image->arch.backup_load_addr), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_src", - &image->arch.backup_src_start, - sizeof(image->arch.backup_src_start), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_sz", - &image->arch.backup_src_sz, - sizeof(image->arch.backup_src_sz), 0); - if (ret) - return ret; - } - - return ret; -} -#else /* !CONFIG_KEXEC_FILE */ -static inline int arch_update_purgatory(struct kimage *image) -{ - return 0; -} -#endif /* CONFIG_KEXEC_FILE */ - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -353,11 +311,6 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; - /* update purgatory as needed */ - result = arch_update_purgatory(image); - if (result) - return result; - return 0; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 59d3d2763a9e..789f5e4f89de 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -341,8 +341,6 @@ struct paravirt_patch_template pv_ops = { .cpu.iret = native_iret, .cpu.swapgs = native_swapgs, - .cpu.set_iopl_mask = native_set_iopl_mask, - .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c deleted file mode 100644 index 23fdec030c37..000000000000 --- a/arch/x86/kernel/pci-calgary_64.c +++ /dev/null @@ -1,1586 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Derived from arch/powerpc/kernel/iommu.c - * - * Copyright IBM Corporation, 2006-2007 - * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> - * - * Author: Jon Mason <jdmason@kudzu.us> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - - */ - -#define pr_fmt(fmt) "Calgary: " fmt - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/crash_dump.h> -#include <linux/dma-mapping.h> -#include <linux/dma-direct.h> -#include <linux/bitmap.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> -#include <linux/delay.h> -#include <linux/scatterlist.h> -#include <linux/iommu-helper.h> - -#include <asm/iommu.h> -#include <asm/calgary.h> -#include <asm/tce.h> -#include <asm/pci-direct.h> -#include <asm/dma.h> -#include <asm/rio.h> -#include <asm/bios_ebda.h> -#include <asm/x86_init.h> -#include <asm/iommu_table.h> - -#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT -int use_calgary __read_mostly = 1; -#else -int use_calgary __read_mostly = 0; -#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ - -#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 - -/* register offsets inside the host bridge space */ -#define CALGARY_CONFIG_REG 0x0108 -#define PHB_CSR_OFFSET 0x0110 /* Channel Status */ -#define PHB_PLSSR_OFFSET 0x0120 -#define PHB_CONFIG_RW_OFFSET 0x0160 -#define PHB_IOBASE_BAR_LOW 0x0170 -#define PHB_IOBASE_BAR_HIGH 0x0180 -#define PHB_MEM_1_LOW 0x0190 -#define PHB_MEM_1_HIGH 0x01A0 -#define PHB_IO_ADDR_SIZE 0x01B0 -#define PHB_MEM_1_SIZE 0x01C0 -#define PHB_MEM_ST_OFFSET 0x01D0 -#define PHB_AER_OFFSET 0x0200 -#define PHB_CONFIG_0_HIGH 0x0220 -#define PHB_CONFIG_0_LOW 0x0230 -#define PHB_CONFIG_0_END 0x0240 -#define PHB_MEM_2_LOW 0x02B0 -#define PHB_MEM_2_HIGH 0x02C0 -#define PHB_MEM_2_SIZE_HIGH 0x02D0 -#define PHB_MEM_2_SIZE_LOW 0x02E0 -#define PHB_DOSHOLE_OFFSET 0x08E0 - -/* CalIOC2 specific */ -#define PHB_SAVIOR_L2 0x0DB0 -#define PHB_PAGE_MIG_CTRL 0x0DA8 -#define PHB_PAGE_MIG_DEBUG 0x0DA0 -#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 - -/* PHB_CONFIG_RW */ -#define PHB_TCE_ENABLE 0x20000000 -#define PHB_SLOT_DISABLE 0x1C000000 -#define PHB_DAC_DISABLE 0x01000000 -#define PHB_MEM2_ENABLE 0x00400000 -#define PHB_MCSR_ENABLE 0x00100000 -/* TAR (Table Address Register) */ -#define TAR_SW_BITS 0x0000ffffffff800fUL -#define TAR_VALID 0x0000000000000008UL -/* CSR (Channel/DMA Status Register) */ -#define CSR_AGENT_MASK 0xffe0ffff -/* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL -/* PMCR/PMDR (Page Migration Control/Debug Registers */ -#define PMR_SOFTSTOP 0x80000000 -#define PMR_SOFTSTOPFAULT 0x40000000 -#define PMR_HARDSTOP 0x20000000 - -/* - * The maximum PHB bus number. - * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 - * x3950M2: 4 chassis, 48 PHBs per chassis = 192 - * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 - * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 - */ -#define MAX_PHB_BUS_NUM 256 - -#define PHBS_PER_CALGARY 4 - -/* register offsets in Calgary's internal register space */ -static const unsigned long tar_offsets[] = { - 0x0580 /* TAR0 */, - 0x0588 /* TAR1 */, - 0x0590 /* TAR2 */, - 0x0598 /* TAR3 */ -}; - -static const unsigned long split_queue_offsets[] = { - 0x4870 /* SPLIT QUEUE 0 */, - 0x5870 /* SPLIT QUEUE 1 */, - 0x6870 /* SPLIT QUEUE 2 */, - 0x7870 /* SPLIT QUEUE 3 */ -}; - -static const unsigned long phb_offsets[] = { - 0x8000 /* PHB0 */, - 0x9000 /* PHB1 */, - 0xA000 /* PHB2 */, - 0xB000 /* PHB3 */ -}; - -/* PHB debug registers */ - -static const unsigned long phb_debug_offsets[] = { - 0x4000 /* PHB 0 DEBUG */, - 0x5000 /* PHB 1 DEBUG */, - 0x6000 /* PHB 2 DEBUG */, - 0x7000 /* PHB 3 DEBUG */ -}; - -/* - * STUFF register for each debug PHB, - * byte 1 = start bus number, byte 2 = end bus number - */ - -#define PHB_DEBUG_STUFF_OFFSET 0x0020 - -unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; -static int translate_empty_slots __read_mostly = 0; -static int calgary_detected __read_mostly = 0; - -static struct rio_table_hdr *rio_table_hdr __initdata; -static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; -static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; - -struct calgary_bus_info { - void *tce_space; - unsigned char translation_disabled; - signed char phbid; - void __iomem *bbar; -}; - -static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calgary_tce_cache_blast(struct iommu_table *tbl); -static void calgary_dump_error_regs(struct iommu_table *tbl); -static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calioc2_tce_cache_blast(struct iommu_table *tbl); -static void calioc2_dump_error_regs(struct iommu_table *tbl); -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); -static void get_tce_space_from_tar(void); - -static const struct cal_chipset_ops calgary_chip_ops = { - .handle_quirks = calgary_handle_quirks, - .tce_cache_blast = calgary_tce_cache_blast, - .dump_error_regs = calgary_dump_error_regs -}; - -static const struct cal_chipset_ops calioc2_chip_ops = { - .handle_quirks = calioc2_handle_quirks, - .tce_cache_blast = calioc2_tce_cache_blast, - .dump_error_regs = calioc2_dump_error_regs -}; - -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; - -static inline int translation_enabled(struct iommu_table *tbl) -{ - /* only PHBs with translation enabled have an IOMMU table */ - return (tbl != NULL); -} - -static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) -{ - unsigned long index; - unsigned long end; - unsigned long flags; - - index = start_addr >> PAGE_SHIFT; - - /* bail out if we're asked to reserve a region we don't cover */ - if (index >= tbl->it_size) - return; - - end = index + npages; - if (end > tbl->it_size) /* don't go off the table */ - end = tbl->it_size; - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_set(tbl->it_map, index, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static unsigned long iommu_range_alloc(struct device *dev, - struct iommu_table *tbl, - unsigned int npages) -{ - unsigned long flags; - unsigned long offset; - unsigned long boundary_size; - - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - - BUG_ON(npages == 0); - - spin_lock_irqsave(&tbl->it_lock, flags); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - tbl->chip_ops->tce_cache_blast(tbl); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - pr_warn("IOMMU full\n"); - spin_unlock_irqrestore(&tbl->it_lock, flags); - if (panic_on_overflow) - panic("Calgary: fix the allocator.\n"); - else - return DMA_MAPPING_ERROR; - } - } - - tbl->it_hint = offset + npages; - BUG_ON(tbl->it_hint > tbl->it_size); - - spin_unlock_irqrestore(&tbl->it_lock, flags); - - return offset; -} - -static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, - void *vaddr, unsigned int npages, int direction) -{ - unsigned long entry; - dma_addr_t ret; - - entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_MAPPING_ERROR)) { - pr_warn("failed to allocate %u pages in iommu %p\n", - npages, tbl); - return DMA_MAPPING_ERROR; - } - - /* set the return dma address */ - ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); - - /* put the TCEs in the HW table */ - tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, - direction); - return ret; -} - -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) -{ - unsigned long entry; - unsigned long flags; - - /* were we called with bad_dma_address? */ - if (unlikely(dma_addr == DMA_MAPPING_ERROR)) { - WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " - "address 0x%Lx\n", dma_addr); - return; - } - - entry = dma_addr >> PAGE_SHIFT; - - BUG_ON(entry + npages > tbl->it_size); - - tce_free(tbl, entry, npages); - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_clear(tbl->it_map, entry, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static inline struct iommu_table *find_iommu_table(struct device *dev) -{ - struct pci_dev *pdev; - struct pci_bus *pbus; - struct iommu_table *tbl; - - pdev = to_pci_dev(dev); - - /* search up the device tree for an iommu */ - pbus = pdev->bus; - do { - tbl = pci_iommu(pbus); - if (tbl && tbl->it_busno == pbus->number) - break; - tbl = NULL; - pbus = pbus->parent; - } while (pbus); - - BUG_ON(tbl && (tbl->it_busno != pbus->number)); - - return tbl; -} - -static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems,enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - int i; - - if (!translation_enabled(tbl)) - return; - - for_each_sg(sglist, s, nelems, i) { - unsigned int npages; - dma_addr_t dma = s->dma_address; - unsigned int dmalen = s->dma_length; - - if (dmalen == 0) - break; - - npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); - iommu_free(tbl, dma, npages); - } -} - -static int calgary_map_sg(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - unsigned long vaddr; - unsigned int npages; - unsigned long entry; - int i; - - for_each_sg(sg, s, nelems, i) { - BUG_ON(!sg_page(s)); - - vaddr = (unsigned long) sg_virt(s); - npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); - - entry = iommu_range_alloc(dev, tbl, npages); - if (entry == DMA_MAPPING_ERROR) { - /* makes sure unmap knows to stop */ - s->dma_length = 0; - goto error; - } - - s->dma_address = (entry << PAGE_SHIFT) | s->offset; - - /* insert into HW table */ - tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir); - - s->dma_length = s->length; - } - - return nelems; -error: - calgary_unmap_sg(dev, sg, nelems, dir, 0); - for_each_sg(sg, s, nelems, i) { - sg->dma_address = DMA_MAPPING_ERROR; - sg->dma_length = 0; - } - return 0; -} - -static dma_addr_t calgary_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - void *vaddr = page_address(page) + offset; - unsigned long uaddr; - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, PAGE_SIZE); - - return iommu_alloc(dev, tbl, vaddr, npages, dir); -} - -static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - unsigned int npages; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - iommu_free(tbl, dma_addr, npages); -} - -static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) -{ - void *ret = NULL; - dma_addr_t mapping; - unsigned int npages, order; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); /* size rounded up to full pages */ - npages = size >> PAGE_SHIFT; - order = get_order(size); - - /* alloc enough pages (and possibly more) */ - ret = (void *)__get_free_pages(flag, order); - if (!ret) - goto error; - memset(ret, 0, size); - - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == DMA_MAPPING_ERROR) - goto free; - *dma_handle = mapping; - return ret; -free: - free_pages((unsigned long)ret, get_order(size)); - ret = NULL; -error: - return ret; -} - -static void calgary_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - - iommu_free(tbl, dma_handle, npages); - free_pages((unsigned long)vaddr, get_order(size)); -} - -static const struct dma_map_ops calgary_dma_ops = { - .alloc = calgary_alloc_coherent, - .free = calgary_free_coherent, - .map_sg = calgary_map_sg, - .unmap_sg = calgary_unmap_sg, - .map_page = calgary_map_page, - .unmap_page = calgary_unmap_page, - .dma_supported = dma_direct_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -}; - -static inline void __iomem * busno_to_bbar(unsigned char num) -{ - return bus_info[num].bbar; -} - -static inline int busno_to_phbid(unsigned char num) -{ - return bus_info[num].phbid; -} - -static inline unsigned long split_queue_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return split_queue_offsets[idx]; -} - -static inline unsigned long tar_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return tar_offsets[idx]; -} - -static inline unsigned long phb_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return phb_offsets[idx]; -} - -static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) -{ - unsigned long target = ((unsigned long)bar) | offset; - return (void __iomem*)target; -} - -static inline int is_calioc2(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALIOC2); -} - -static inline int is_calgary(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALGARY); -} - -static inline int is_cal_pci_dev(unsigned short device) -{ - return (is_calgary(device) || is_calioc2(device)); -} - -static void calgary_tce_cache_blast(struct iommu_table *tbl) -{ - u64 val; - u32 aer; - int i = 0; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - - /* disable arbitration on the bus */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - aer = readl(target); - writel(0, target); - - /* read plssr to ensure it got there */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - val = readl(target); - - /* poll split queues until all DMA activity is done */ - target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); - do { - val = readq(target); - i++; - } while ((val & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("PCI bus not quiesced, continuing anyway\n"); - - /* invalidate TCE cache */ - target = calgary_reg(bbar, tar_offset(tbl->it_busno)); - writeq(tbl->tar_val, target); - - /* enable arbitration */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - writel(aer, target); - (void)readl(target); /* flush */ -} - -static void calioc2_tce_cache_blast(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u64 val64; - u32 val; - int i = 0; - int count = 1; - unsigned char bus = tbl->it_busno; - -begin: - printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " - "sequence - count %d\n", bus, count); - - /* 1. using the Page Migration Control reg set SoftStop */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); - val |= PMR_SOFTSTOP; - printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - - /* 2. poll split queues until all DMA activity is done */ - printk(KERN_DEBUG "2a. starting to poll split queues\n"); - target = calgary_reg(bbar, split_queue_offset(bus)); - do { - val64 = readq(target); - i++; - } while ((val64 & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); - - /* 3. poll Page Migration DEBUG for SoftStopFault */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); - - /* 4. if SoftStopFault - goto (1) */ - if (val & PMR_SOFTSTOPFAULT) { - if (++count < 100) - goto begin; - else { - pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); - return; /* pray for the best */ - } - } - - /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); - - /* 6. invalidate TCE cache */ - printk(KERN_DEBUG "6. invalidating TCE cache\n"); - target = calgary_reg(bbar, tar_offset(bus)); - writeq(tbl->tar_val, target); - - /* 7. Re-read PMCR */ - printk(KERN_DEBUG "7a. Re-reading PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); - - /* 8. Remove HardStop */ - printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = 0; - printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); -} - -static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, - u64 limit) -{ - unsigned int numpages; - - limit = limit | 0xfffff; - limit++; - - numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(pci_iommu(dev->bus), start, numpages); -} - -static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) -{ - void __iomem *target; - u64 low, high, sizelow; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* peripheral MEM_1 region */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); - sizelow = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) -{ - void __iomem *target; - u32 val32; - u64 low, high, sizelow, sizehigh; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* is it enabled? */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - if (!(val32 & PHB_MEM2_ENABLE)) - return; - - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); - sizelow = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); - sizehigh = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = (sizehigh << 32) | sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -/* - * some regions of the IO address space do not get translated, so we - * must not give devices IO addresses in those regions. The regions - * are the 640KB-1MB region and the two PCI peripheral memory holes. - * Reserve all of them in the IOMMU bitmap to avoid giving them out - * later. - */ -static void __init calgary_reserve_regions(struct pci_dev *dev) -{ - unsigned int npages; - u64 start; - struct iommu_table *tbl = pci_iommu(dev->bus); - - /* avoid the BIOS/VGA first 640KB-1MB region */ - /* for CalIOC2 - avoid the entire first MB */ - if (is_calgary(dev->device)) { - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; - } else { /* calioc2 */ - start = 0; - npages = (1 * 1024 * 1024) >> PAGE_SHIFT; - } - iommu_range_reserve(tbl, start, npages); - - /* reserve the two PCI peripheral memory regions in IO space */ - calgary_reserve_peripheral_mem_1(dev); - calgary_reserve_peripheral_mem_2(dev); -} - -static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) -{ - u64 val64; - u64 table_phys; - void __iomem *target; - int ret; - struct iommu_table *tbl; - - /* build TCE tables for each PHB */ - ret = build_tce_table(dev, bbar); - if (ret) - return ret; - - tbl = pci_iommu(dev->bus); - tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - - if (is_kdump_kernel()) - calgary_init_bitmap_from_tce_table(tbl); - else - tce_free(tbl, 0, tbl->it_size); - - if (is_calgary(dev->device)) - tbl->chip_ops = &calgary_chip_ops; - else if (is_calioc2(dev->device)) - tbl->chip_ops = &calioc2_chip_ops; - else - BUG(); - - calgary_reserve_regions(dev); - - /* set TARs for each PHB */ - target = calgary_reg(bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - - /* zero out all TAR bits under sw control */ - val64 &= ~TAR_SW_BITS; - table_phys = (u64)__pa(tbl->it_base); - - val64 |= table_phys; - - BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); - val64 |= (u64) specified_table_size; - - tbl->tar_val = cpu_to_be64(val64); - - writeq(tbl->tar_val, target); - readq(target); /* flush */ - - return 0; -} - -static void __init calgary_free_bus(struct pci_dev *dev) -{ - u64 val64; - struct iommu_table *tbl = pci_iommu(dev->bus); - void __iomem *target; - unsigned int bitmapsz; - - target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - val64 &= ~TAR_SW_BITS; - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ - - bitmapsz = tbl->it_size / BITS_PER_BYTE; - free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); - tbl->it_map = NULL; - - kfree(tbl); - - set_pci_iommu(dev->bus, NULL); - - /* Can't free bootmem allocated memory after system is up :-( */ - bus_info[dev->bus->number].tce_space = NULL; -} - -static void calgary_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 csr, plssr; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", - tbl->it_busno, csr, plssr); -} - -static void calioc2_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - u32 csr, csmr, plssr, mck, rcstat; - void __iomem *target; - unsigned long phboff = phb_offset(tbl->it_busno); - unsigned long erroff; - u32 errregs[7]; - int i; - - /* dump CSR */ - target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - /* dump PLSSR */ - target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - /* dump CSMR */ - target = calgary_reg(bbar, phboff | 0x290); - csmr = be32_to_cpu(readl(target)); - /* dump mck */ - target = calgary_reg(bbar, phboff | 0x800); - mck = be32_to_cpu(readl(target)); - - pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - - pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); - - /* dump rest of error regs */ - pr_emerg(""); - for (i = 0; i < ARRAY_SIZE(errregs); i++) { - /* err regs are at 0x810 - 0x870 */ - erroff = (0x810 + (i * 0x10)); - target = calgary_reg(bbar, phboff | erroff); - errregs[i] = be32_to_cpu(readl(target)); - pr_cont("0x%08x@0x%lx ", errregs[i], erroff); - } - pr_cont("\n"); - - /* root complex status */ - target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); - rcstat = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, - PHB_ROOT_COMPLEX_STATUS); -} - -static void calgary_watchdog(struct timer_list *t) -{ - struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer); - void __iomem *bbar = tbl->bbar; - u32 val32; - void __iomem *target; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - val32 = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - if (val32 & CSR_AGENT_MASK) { - tbl->chip_ops->dump_error_regs(tbl); - - /* reset error */ - writel(0, target); - - /* Disable bus that caused the error */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_SLOT_DISABLE; - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - } else { - /* Reset the timer */ - mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); - } -} - -static void __init calgary_set_split_completion_timeout(void __iomem *bbar, - unsigned char busnum, unsigned long timeout) -{ - u64 val64; - void __iomem *target; - unsigned int phb_shift = ~0; /* silence gcc */ - u64 mask; - - switch (busno_to_phbid(busnum)) { - case 0: phb_shift = (63 - 19); - break; - case 1: phb_shift = (63 - 23); - break; - case 2: phb_shift = (63 - 27); - break; - case 3: phb_shift = (63 - 35); - break; - default: - BUG_ON(busno_to_phbid(busnum)); - } - - target = calgary_reg(bbar, CALGARY_CONFIG_REG); - val64 = be64_to_cpu(readq(target)); - - /* zero out this PHB's timer bits */ - mask = ~(0xFUL << phb_shift); - val64 &= mask; - val64 |= (timeout << phb_shift); - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ -} - -static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 val; - - /* - * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 - */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); - val = cpu_to_be32(readl(target)); - val |= 0x00800000; - writel(cpu_to_be32(val), target); -} - -static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (is_calgary(dev->device) && (busnum == 1)) - calgary_set_split_completion_timeout(tbl->bbar, busnum, - CCR_2SEC_TIMEOUT); -} - -static void __init calgary_enable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* enable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - - printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", - (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? - "Calgary" : "CalIOC2", busnum); - printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " - "bus.\n"); - - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0); - mod_timer(&tbl->watchdog_timer, jiffies); -} - -static void __init calgary_disable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* disable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); - - printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - del_timer_sync(&tbl->watchdog_timer); -} - -static void __init calgary_init_one_nontraslated(struct pci_dev *dev) -{ - pci_dev_get(dev); - set_pci_iommu(dev->bus, NULL); - - /* is the device behind a bridge? */ - if (dev->bus->parent) - dev->bus->parent->self = dev; - else - dev->bus->self = dev; -} - -static int __init calgary_init_one(struct pci_dev *dev) -{ - void __iomem *bbar; - struct iommu_table *tbl; - int ret; - - bbar = busno_to_bbar(dev->bus->number); - ret = calgary_setup_tar(dev, bbar); - if (ret) - goto done; - - pci_dev_get(dev); - - if (dev->bus->parent) { - if (dev->bus->parent->self) - printk(KERN_WARNING "Calgary: IEEEE, dev %p has " - "bus->parent->self!\n", dev); - dev->bus->parent->self = dev; - } else - dev->bus->self = dev; - - tbl = pci_iommu(dev->bus); - tbl->chip_ops->handle_quirks(tbl, dev); - - calgary_enable_translation(dev); - - return 0; - -done: - return ret; -} - -static int __init calgary_locate_bbars(void) -{ - int ret; - int rioidx, phb, bus; - void __iomem *bbar; - void __iomem *target; - unsigned long offset; - u8 start_bus, end_bus; - u32 val; - - ret = -ENODATA; - for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { - struct rio_detail *rio = rio_devs[rioidx]; - - if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) - continue; - - /* map entire 1MB of Calgary config space */ - bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); - if (!bbar) - goto error; - - for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { - offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; - target = calgary_reg(bbar, offset); - - val = be32_to_cpu(readl(target)); - - start_bus = (u8)((val & 0x00FF0000) >> 16); - end_bus = (u8)((val & 0x0000FF00) >> 8); - - if (end_bus) { - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; - } - } else { - bus_info[start_bus].bbar = bbar; - bus_info[start_bus].phbid = phb; - } - } - } - - return 0; - -error: - /* scan bus_info and iounmap any bbars we previously ioremap'd */ - for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) - if (bus_info[bus].bbar) - iounmap(bus_info[bus].bbar); - - return ret; -} - -static int __init calgary_init(void) -{ - int ret; - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - ret = calgary_locate_bbars(); - if (ret) - return ret; - - /* Purely for kdump kernel case */ - if (is_kdump_kernel()) - get_tce_space_from_tar(); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - calgary_init_one_nontraslated(dev); - continue; - } - - if (!info->tce_space && !translate_empty_slots) - continue; - - ret = calgary_init_one(dev); - if (ret) - goto error; - } while (1); - - dev = NULL; - for_each_pci_dev(dev) { - struct iommu_table *tbl; - - tbl = find_iommu_table(&dev->dev); - - if (translation_enabled(tbl)) - dev->dev.dma_ops = &calgary_dma_ops; - } - - return ret; - -error: - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - pci_dev_put(dev); - continue; - } - if (!info->tce_space && !translate_empty_slots) - continue; - - calgary_disable_translation(dev); - calgary_free_bus(dev); - pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - dev->dev.dma_ops = NULL; - } while (1); - - return ret; -} - -static inline int __init determine_tce_table_size(void) -{ - int ret; - - if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) - return specified_table_size; - - if (is_kdump_kernel() && saved_max_pfn) { - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); - if (ret > TCE_TABLE_SIZE_8M) - ret = TCE_TABLE_SIZE_8M; - } else { - /* - * Use 8M by default (suggested by Muli) if it's not - * kdump kernel and saved_max_pfn isn't set. - */ - ret = TCE_TABLE_SIZE_8M; - } - - return ret; -} - -static int __init build_detail_arrays(void) -{ - unsigned long ptr; - unsigned numnodes, i; - int scal_detail_size, rio_detail_size; - - numnodes = rio_table_hdr->num_scal_dev; - if (numnodes > MAX_NUMNODES){ - printk(KERN_WARNING - "Calgary: MAX_NUMNODES too low! Defined as %d, " - "but system has %d nodes.\n", - MAX_NUMNODES, numnodes); - return -ENODEV; - } - - switch (rio_table_hdr->version){ - case 2: - scal_detail_size = 11; - rio_detail_size = 13; - break; - case 3: - scal_detail_size = 12; - rio_detail_size = 15; - break; - default: - printk(KERN_WARNING - "Calgary: Invalid Rio Grande Table Version: %d\n", - rio_table_hdr->version); - return -EPROTO; - } - - ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < numnodes; i++, ptr += scal_detail_size) - scal_devs[i] = (struct scal_detail *)ptr; - - for (i = 0; i < rio_table_hdr->num_rio_dev; - i++, ptr += rio_detail_size) - rio_devs[i] = (struct rio_detail *)ptr; - - return 0; -} - -static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) -{ - int dev; - u32 val; - - if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { - /* - * FIXME: properly scan for devices across the - * PCI-to-PCI bridge on every CalIOC2 port. - */ - return 1; - } - - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff) - break; - } - return (val != 0xffffffff); -} - -/* - * calgary_init_bitmap_from_tce_table(): - * Function for kdump case. In the second/kdump kernel initialize - * the bitmap based on the tce table entries obtained from first kernel - */ -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) -{ - u64 *tp; - unsigned int index; - tp = ((u64 *)tbl->it_base); - for (index = 0 ; index < tbl->it_size; index++) { - if (*tp != 0x0) - set_bit(index, tbl->it_map); - tp++; - } -} - -/* - * get_tce_space_from_tar(): - * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base address register of calgary iommu - */ -static void __init get_tce_space_from_tar(void) -{ - int bus; - void __iomem *target; - unsigned long tce_space; - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - target = calgary_reg(bus_info[bus].bbar, - tar_offset(bus)); - tce_space = be64_to_cpu(readq(target)); - tce_space = tce_space & TAR_SW_BITS; - - tce_space = tce_space & (~specified_table_size); - info->tce_space = (u64 *)__va(tce_space); - } - } - return; -} - -static int __init calgary_iommu_init(void) -{ - int ret; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - return 0; -} - -int __init detect_calgary(void) -{ - int bus; - void *tbl; - int calgary_found = 0; - unsigned long ptr; - unsigned int offset, prev_offset; - int ret; - - /* - * if the user specified iommu=off or iommu=soft or we found - * another HW IOMMU already, bail out. - */ - if (no_iommu || iommu_detected) - return -ENODEV; - - if (!use_calgary) - return -ENODEV; - - if (!early_pci_allowed()) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); - - ptr = (unsigned long)phys_to_virt(get_bios_ebda()); - - rio_table_hdr = NULL; - prev_offset = 0; - offset = 0x180; - /* - * The next offset is stored in the 1st word. - * Only parse up until the offset increases: - */ - while (offset > prev_offset) { - /* The block id is stored in the 2nd word */ - if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ - /* set the pointer past the offset & block id */ - rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); - break; - } - prev_offset = offset; - offset = *((unsigned short *)(ptr + offset)); - } - if (!rio_table_hdr) { - printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " - "in EBDA - bailing!\n"); - return -ENODEV; - } - - ret = build_detail_arrays(); - if (ret) { - printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); - return -ENOMEM; - } - - specified_table_size = determine_tce_table_size(); - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - /* - * If it is kdump kernel, find and use tce tables - * from first kernel, else allocate tce tables here - */ - if (!is_kdump_kernel()) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - } - calgary_found = 1; - } - } - - printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", - calgary_found ? "found" : "not found"); - - if (calgary_found) { - iommu_detected = 1; - calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", - specified_table_size); - - x86_init.iommu.iommu_init = calgary_iommu_init; - } - return calgary_found; - -cleanup: - for (--bus; bus >= 0; --bus) { - struct calgary_bus_info *info = &bus_info[bus]; - - if (info->tce_space) - free_tce_table(info->tce_space); - } - return -ENOMEM; -} - -static int __init calgary_parse_options(char *p) -{ - unsigned int bridge; - unsigned long val; - size_t len; - ssize_t ret; - - while (*p) { - if (!strncmp(p, "64k", 3)) - specified_table_size = TCE_TABLE_SIZE_64K; - else if (!strncmp(p, "128k", 4)) - specified_table_size = TCE_TABLE_SIZE_128K; - else if (!strncmp(p, "256k", 4)) - specified_table_size = TCE_TABLE_SIZE_256K; - else if (!strncmp(p, "512k", 4)) - specified_table_size = TCE_TABLE_SIZE_512K; - else if (!strncmp(p, "1M", 2)) - specified_table_size = TCE_TABLE_SIZE_1M; - else if (!strncmp(p, "2M", 2)) - specified_table_size = TCE_TABLE_SIZE_2M; - else if (!strncmp(p, "4M", 2)) - specified_table_size = TCE_TABLE_SIZE_4M; - else if (!strncmp(p, "8M", 2)) - specified_table_size = TCE_TABLE_SIZE_8M; - - len = strlen("translate_empty_slots"); - if (!strncmp(p, "translate_empty_slots", len)) - translate_empty_slots = 1; - - len = strlen("disable"); - if (!strncmp(p, "disable", len)) { - p += len; - if (*p == '=') - ++p; - if (*p == '\0') - break; - ret = kstrtoul(p, 0, &val); - if (ret) - break; - - bridge = val; - if (bridge < MAX_PHB_BUS_NUM) { - printk(KERN_INFO "Calgary: disabling " - "translation for PHB %#x\n", bridge); - bus_info[bridge].translation_disabled = 1; - } - } - - p = strpbrk(p, ","); - if (!p) - break; - - p++; /* skip ',' */ - } - return 1; -} -__setup("calgary=", calgary_parse_options); - -static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) -{ - struct iommu_table *tbl; - unsigned int npages; - int i; - - tbl = pci_iommu(dev->bus); - - for (i = 0; i < 4; i++) { - struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; - - /* Don't give out TCEs that map MEM resources */ - if (!(r->flags & IORESOURCE_MEM)) - continue; - - /* 0-based? we reserve the whole 1st MB anyway */ - if (!r->start) - continue; - - /* cover the whole region */ - npages = resource_size(r) >> PAGE_SHIFT; - npages++; - - iommu_range_reserve(tbl, r->start, npages); - } -} - -static int __init calgary_fixup_tce_spaces(void) -{ - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - if (no_iommu || swiotlb || !calgary_detected) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) - continue; - - if (!info->tce_space) - continue; - - calgary_fixup_one_tce_space(dev); - - } while (1); - - return 0; -} - -/* - * We need to be call after pcibios_assign_resources (fs_initcall level) - * and before device_initcall. - */ -rootfs_initcall(calgary_fixup_tce_spaces); - -IOMMU_INIT_POST(detect_calgary); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fa4352dce491..57de2ebff7e2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -12,7 +12,6 @@ #include <asm/dma.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/calgary.h> #include <asm/x86_init.h> #include <asm/iommu_table.h> @@ -112,11 +111,6 @@ static __init int iommu_setup(char *p) gart_parse_options(p); -#ifdef CONFIG_CALGARY_IOMMU - if (!strncmp(p, "calgary", 7)) - use_calgary = 1; -#endif /* CONFIG_CALGARY_IOMMU */ - p += strcspn(p, ","); if (*p == ',') ++p; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e94c4354d4e..bd2a11ca5dd6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -41,6 +41,7 @@ #include <asm/desc.h> #include <asm/prctl.h> #include <asm/spec-ctrl.h> +#include <asm/io_bitmap.h> #include <asm/proto.h> #include "process.h" @@ -72,18 +73,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, #endif + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, }, -#ifdef CONFIG_X86_32 - /* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, -#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); @@ -110,28 +102,89 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; - if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); - - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - kfree(bp); - } + if (test_thread_flag(TIF_IO_BITMAP)) + io_bitmap_exit(); free_vm86(t); fpu__drop(fpu); } +static int set_new_tls(struct task_struct *p, unsigned long tls) +{ + struct user_desc __user *utls = (struct user_desc __user *)tls; + + if (in_ia32_syscall()) + return do_set_thread_area(p, -1, utls, 0); + else + return do_set_thread_area_64(p, ARCH_SET_FS, tls); +} + +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) +{ + struct inactive_task_frame *frame; + struct fork_frame *fork_frame; + struct pt_regs *childregs; + int ret = 0; + + childregs = task_pt_regs(p); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; + p->thread.io_bitmap = NULL; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + +#ifdef CONFIG_X86_64 + savesegment(gs, p->thread.gsindex); + p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; + savesegment(fs, p->thread.fsindex); + p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase; + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); +#else + p->thread.sp0 = (unsigned long) (childregs + 1); + /* + * Clear all status flags including IF and set fixed bit. 64bit + * does not have this initialization as the frame does not contain + * flags. The flags consistency (especially vs. AC) is there + * ensured via objtool, which lacks 32bit support. + */ + frame->flags = X86_EFLAGS_FIXED; +#endif + + /* Kernel thread ? */ + if (unlikely(p->flags & PF_KTHREAD)) { + memset(childregs, 0, sizeof(struct pt_regs)); + kthread_frame_init(frame, sp, arg); + return 0; + } + + frame->bx = 0; + *childregs = *current_pt_regs(); + childregs->ax = 0; + if (sp) + childregs->sp = sp; + +#ifdef CONFIG_X86_32 + task_user_gs(p) = get_user_gs(current_pt_regs()); +#endif + + /* Set a new TLS for the child thread? */ + if (clone_flags & CLONE_SETTLS) + ret = set_new_tls(p, tls); + + if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) + io_bitmap_share(p); + + return ret; +} + void flush_thread(void) { struct task_struct *tsk = current; @@ -269,31 +322,96 @@ void arch_setup_new_exec(void) } } -static inline void switch_to_bitmap(struct thread_struct *prev, - struct thread_struct *next, - unsigned long tifp, unsigned long tifn) +#ifdef CONFIG_X86_IOPL_IOPERM +static inline void tss_invalidate_io_bitmap(struct tss_struct *tss) { - struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + /* + * Invalidate the I/O bitmap by moving io_bitmap_base outside the + * TSS limit so any subsequent I/O access from user space will + * trigger a #GP. + * + * This is correct even when VMEXIT rewrites the TSS limit + * to 0x67 as the only requirement is that the base points + * outside the limit. + */ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; +} - if (tifn & _TIF_IO_BITMAP) { - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); +static inline void switch_to_bitmap(unsigned long tifp) +{ + /* + * Invalidate I/O bitmap if the previous task used it. This prevents + * any possible leakage of an active I/O bitmap. + * + * If the next task has an I/O bitmap it will handle it on exit to + * user mode. + */ + if (tifp & _TIF_IO_BITMAP) + tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw)); +} + +static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) +{ + /* + * Copy at least the byte range of the incoming tasks bitmap which + * covers the permitted I/O ports. + * + * If the previous task which used an I/O bitmap had more bits + * permitted, then the copy needs to cover those as well so they + * get turned off. + */ + memcpy(tss->io_bitmap.bitmap, iobm->bitmap, + max(tss->io_bitmap.prev_max, iobm->max)); + + /* + * Store the new max and the sequence number of this bitmap + * and a pointer to the bitmap itself. + */ + tss->io_bitmap.prev_max = iobm->max; + tss->io_bitmap.prev_sequence = iobm->sequence; +} + +/** + * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + */ +void tss_update_io_bitmap(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + u16 *base = &tss->x86_tss.io_bitmap_base; + + if (test_thread_flag(TIF_IO_BITMAP)) { + struct thread_struct *t = ¤t->thread; + + if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; + /* + * Only copy bitmap data when the sequence number + * differs. The update time is accounted to the + * incoming task. + */ + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; + } /* - * Make sure that the TSS limit is correct for the CPU - * to notice the IO bitmap. + * Make sure that the TSS limit is covering the io bitmap. + * It might have been cut down by a VMEXIT to 0x67 which + * would cause a subsequent I/O access from user space to + * trigger a #GP because tbe bitmap is outside the TSS + * limit. */ refresh_tss_limit(); - } else if (tifp & _TIF_IO_BITMAP) { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } else { + tss_invalidate_io_bitmap(tss); } } +#else /* CONFIG_X86_IOPL_IOPERM */ +static inline void switch_to_bitmap(unsigned long tifp) { } +#endif #ifdef CONFIG_SMP @@ -505,7 +623,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) tifn = READ_ONCE(task_thread_info(next_p)->flags); tifp = READ_ONCE(task_thread_info(prev_p)->flags); - switch_to_bitmap(prev, next, tifp, tifn); + + switch_to_bitmap(tifp); propagate_user_return_notify(prev_p, next_p); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b8ceec4974fe..323499f48858 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -112,74 +112,6 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) -{ - struct pt_regs *childregs = task_pt_regs(p); - struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); - struct inactive_task_frame *frame = &fork_frame->frame; - struct task_struct *tsk; - int err; - - /* - * For a new task use the RESET flags value since there is no before. - * All the status flags are zero; DF and all the system flags must also - * be 0, specifically IF must be 0 because we context switch to the new - * task with interrupts disabled. - */ - frame->flags = X86_EFLAGS_FIXED; - frame->bp = 0; - frame->ret_addr = (unsigned long) ret_from_fork; - p->thread.sp = (unsigned long) fork_frame; - p->thread.sp0 = (unsigned long) (childregs+1); - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - - if (unlikely(p->flags & PF_KTHREAD)) { - /* kernel thread */ - memset(childregs, 0, sizeof(struct pt_regs)); - frame->bx = sp; /* function */ - frame->di = arg; - p->thread.io_bitmap_ptr = NULL; - return 0; - } - frame->bx = 0; - *childregs = *current_pt_regs(); - childregs->ax = 0; - if (sp) - childregs->sp = sp; - - task_user_gs(p) = get_user_gs(current_pt_regs()); - - p->thread.io_bitmap_ptr = NULL; - tsk = current; - err = -ENOMEM; - - if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - set_tsk_thread_flag(p, TIF_IO_BITMAP); - } - - err = 0; - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) - err = do_set_thread_area(p, -1, - (struct user_desc __user *)tls, 0); - - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - return err; -} - void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { @@ -255,15 +187,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ load_TLS(next, cpu); - /* - * Restore IOPL if needed. In normal use, the flags restore - * in the switch assembly will handle this. But if the kernel - * is running virtualized at a non-zero CPL, the popf will - * not restore flags, so it must be done in a separate step. - */ - if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) - set_iopl_mask(next->iopl); - switch_to_extra(prev_p, next_p); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index af64519b2695..506d66830d4d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -371,81 +371,6 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) task->thread.gsbase = gsbase; } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) -{ - int err; - struct pt_regs *childregs; - struct fork_frame *fork_frame; - struct inactive_task_frame *frame; - struct task_struct *me = current; - - childregs = task_pt_regs(p); - fork_frame = container_of(childregs, struct fork_frame, regs); - frame = &fork_frame->frame; - - frame->bp = 0; - frame->ret_addr = (unsigned long) ret_from_fork; - p->thread.sp = (unsigned long) fork_frame; - p->thread.io_bitmap_ptr = NULL; - - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; - savesegment(es, p->thread.es); - savesegment(ds, p->thread.ds); - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - - if (unlikely(p->flags & PF_KTHREAD)) { - /* kernel thread */ - memset(childregs, 0, sizeof(struct pt_regs)); - frame->bx = sp; /* function */ - frame->r12 = arg; - return 0; - } - frame->bx = 0; - *childregs = *current_pt_regs(); - - childregs->ax = 0; - if (sp) - childregs->sp = sp; - - err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - set_tsk_thread_flag(p, TIF_IO_BITMAP); - } - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) { -#ifdef CONFIG_IA32_EMULATION - if (in_ia32_syscall()) - err = do_set_thread_area(p, -1, - (struct user_desc __user *)tls, 0); - else -#endif - err = do_arch_prctl_64(p, ARCH_SET_FS, tls); - if (err) - goto out; - } - err = 0; -out: - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - - return err; -} - static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, @@ -572,17 +497,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_to_extra(prev_p, next_p); -#ifdef CONFIG_XEN_PV - /* - * On Xen PV, IOPL bits in pt_regs->flags have no effect, and - * current_pt_regs()->flags may not match the current task's - * intended IOPL. We need to switch it manually. - */ - if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && - prev->iopl != next->iopl)) - xen_set_iopl_mask(next->iopl); -#endif - if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3c5bbe8e4120..066e5b01a7e0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -42,6 +42,7 @@ #include <asm/traps.h> #include <asm/syscall.h> #include <asm/fsgsbase.h> +#include <asm/io_bitmap.h> #include "tls.h" @@ -697,7 +698,9 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n, static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { - return target->thread.io_bitmap_max / regset->size; + struct io_bitmap *iobm = target->thread.io_bitmap; + + return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, @@ -705,12 +708,13 @@ static int ioperm_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - if (!target->thread.io_bitmap_ptr) + struct io_bitmap *iobm = target->thread.io_bitmap; + + if (!iobm) return -ENXIO; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.io_bitmap_ptr, - 0, IO_BITMAP_BYTES); + iobm->bitmap, 0, IO_BITMAP_BYTES); } /* diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index ee26df08002e..94b33885f8d2 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -35,8 +35,7 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) .text - .globl relocate_kernel -relocate_kernel: +SYM_CODE_START_NOALIGN(relocate_kernel) /* Save the CPU context, used for jumping back */ pushl %ebx @@ -93,8 +92,9 @@ relocate_kernel: addl $(identity_mapped - relocate_kernel), %eax pushl %eax ret +SYM_CODE_END(relocate_kernel) -identity_mapped: +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* set return address to 0 if not preserving context */ pushl $0 /* store the start address on the stack */ @@ -191,8 +191,9 @@ identity_mapped: addl $(virtual_mapped - relocate_kernel), %eax pushl %eax ret +SYM_CODE_END(identity_mapped) -virtual_mapped: +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) movl CR4(%edi), %eax movl %eax, %cr4 movl CR3(%edi), %eax @@ -208,9 +209,10 @@ virtual_mapped: popl %esi popl %ebx ret +SYM_CODE_END(virtual_mapped) /* Do the copies */ -swap_pages: +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movl 8(%esp), %edx movl 4(%esp), %ecx pushl %ebp @@ -270,6 +272,7 @@ swap_pages: popl %ebx popl %ebp ret +SYM_CODE_END(swap_pages) .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c51ccff5cd01..ef3ba99068d3 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -38,8 +38,7 @@ .text .align PAGE_SIZE .code64 - .globl relocate_kernel -relocate_kernel: +SYM_CODE_START_NOALIGN(relocate_kernel) /* * %rdi indirection_page * %rsi page_list @@ -103,8 +102,9 @@ relocate_kernel: addq $(identity_mapped - relocate_kernel), %r8 pushq %r8 ret +SYM_CODE_END(relocate_kernel) -identity_mapped: +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ @@ -209,8 +209,9 @@ identity_mapped: movq $virtual_mapped, %rax pushq %rax ret +SYM_CODE_END(identity_mapped) -virtual_mapped: +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) movq RSP(%r8), %rsp movq CR4(%r8), %rax movq %rax, %cr4 @@ -228,9 +229,10 @@ virtual_mapped: popq %rbp popq %rbx ret +SYM_CODE_END(virtual_mapped) /* Do the copies */ -swap_pages: +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movq %rdi, %rcx /* Put the page_list in %rcx */ xorl %edi, %edi xorl %esi, %esi @@ -283,6 +285,7 @@ swap_pages: jmp 0b 3: ret +SYM_CODE_END(swap_pages) .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 77ea96b794bd..cedfe2077a69 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -143,6 +143,13 @@ struct boot_params boot_params; /* * Machine setup.. */ +static struct resource rodata_resource = { + .name = "Kernel rodata", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + static struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -438,6 +445,12 @@ static void __init memblock_x86_reserve_range_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); memblock_reserve(pa_data, sizeof(*data) + data->len); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + memblock_reserve(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len); + pa_data = data->next; early_memunmap(data, sizeof(*data)); } @@ -459,7 +472,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) * due to mapping restrictions. * * On 64bit, kdump kernel need be restricted to be under 64TB, which is - * the upper limit of system RAM in 4-level paing mode. Since the kdump + * the upper limit of system RAM in 4-level paging mode. Since the kdump * jumping could be from 5-level to 4-level, the jumping will fail if * kernel is put above 64TB, and there's no way to detect the paging mode * of the kernel which will be loaded for dumping during the 1st kernel @@ -743,8 +756,8 @@ static void __init trim_bios_range(void) e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* - * special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. + * special case: Some BIOSes report the PC BIOS + * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. */ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); @@ -951,7 +964,9 @@ void __init setup_arch(char **cmdline_p) code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; - data_resource.start = __pa_symbol(_etext); + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); data_resource.end = __pa_symbol(_edata)-1; bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; @@ -1040,6 +1055,7 @@ void __init setup_arch(char **cmdline_p) /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); @@ -1122,17 +1138,15 @@ void __init setup_arch(char **cmdline_p) reserve_bios_regions(); - if (efi_enabled(EFI_MEMMAP)) { - efi_fake_memmap(); - efi_find_mirror(); - efi_esrt_init(); + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be - * called after ExitBootServices(). This is, in fact, a lie. - */ - efi_reserve_boot_services(); - } + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ e820__memblock_alloc_reserved_mpc_new(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 86663874ef04..e6d7894ad127 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -207,8 +207,8 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("%s allocator failed (%d), falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); + pr_warn("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a49fe1dcb47e..4c61f0713832 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -57,7 +57,7 @@ void __init tboot_probe(void) */ if (!e820__mapped_any(boot_params.tboot_addr, boot_params.tboot_addr, E820_TYPE_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); + pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -65,13 +65,12 @@ void __init tboot_probe(void) set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warning("tboot at 0x%llx is invalid\n", - boot_params.tboot_addr); + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); tboot = NULL; return; } if (tboot->version < 5) { - pr_warning("tboot version is invalid: %u\n", tboot->version); + pr_warn("tboot version is invalid: %u\n", tboot->version); tboot = NULL; return; } @@ -289,7 +288,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { - pr_warning("unsupported sleep state 0x%x\n", sleep_state); + pr_warn("unsupported sleep state 0x%x\n", sleep_state); return -1; } @@ -302,7 +301,7 @@ static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) if (!tboot_enabled()) return 0; - pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); return -ENODEV; } @@ -320,7 +319,7 @@ static int tboot_wait_for_aps(int num_aps) } if (timeout) - pr_warning("tboot wait for APs timeout\n"); + pr_warn("tboot wait for APs timeout\n"); return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } @@ -516,7 +515,7 @@ int tboot_force_iommu(void) return 1; if (no_iommu || swiotlb || dmar_disabled) - pr_warning("Forcing Intel-IOMMU to enabled\n"); + pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; #ifdef CONFIG_SWIOTLB diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c deleted file mode 100644 index 6384be751eff..000000000000 --- a/arch/x86/kernel/tce_64.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file manages the translation entries for the IBM Calgary IOMMU. - * - * Derived from arch/powerpc/platforms/pseries/iommu.c - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Jon Mason <jdmason@us.ibm.com> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <linux/memblock.h> -#include <asm/tce.h> -#include <asm/calgary.h> -#include <asm/proto.h> -#include <asm/cacheflush.h> - -/* flush a tce at 'tceaddr' to main memory */ -static inline void flush_tce(void* tceaddr) -{ - /* a single tce can't cross a cache line */ - if (boot_cpu_has(X86_FEATURE_CLFLUSH)) - clflush(tceaddr); - else - wbinvd(); -} - -void tce_build(struct iommu_table *tbl, unsigned long index, - unsigned int npages, unsigned long uaddr, int direction) -{ - u64* tp; - u64 t; - u64 rpn; - - t = (1 << TCE_READ_SHIFT); - if (direction != DMA_TO_DEVICE) - t |= (1 << TCE_WRITE_SHIFT); - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; - t &= ~TCE_RPN_MASK; - t |= (rpn << TCE_RPN_SHIFT); - - *tp = cpu_to_be64(t); - flush_tce(tp); - - uaddr += PAGE_SIZE; - tp++; - } -} - -void tce_free(struct iommu_table *tbl, long index, unsigned int npages) -{ - u64* tp; - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - *tp = cpu_to_be64(0); - flush_tce(tp); - tp++; - } -} - -static inline unsigned int table_size_to_number_of_entries(unsigned char size) -{ - /* - * size is the order of the table, 0-7 - * smallest table is 8K entries, so shift result by 13 to - * multiply by 8K - */ - return (1 << size) << 13; -} - -static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) -{ - unsigned int bitmapsz; - unsigned long bmppages; - int ret; - - tbl->it_busno = dev->bus->number; - - /* set the tce table size - measured in entries */ - tbl->it_size = table_size_to_number_of_entries(specified_table_size); - - /* - * number of bytes needed for the bitmap size in number of - * entries; we need one bit per entry - */ - bitmapsz = tbl->it_size / BITS_PER_BYTE; - bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); - if (!bmppages) { - printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); - ret = -ENOMEM; - goto done; - } - - tbl->it_map = (unsigned long*)bmppages; - - memset(tbl->it_map, 0, bitmapsz); - - tbl->it_hint = 0; - - spin_lock_init(&tbl->it_lock); - - return 0; - -done: - return ret; -} - -int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) -{ - struct iommu_table *tbl; - int ret; - - if (pci_iommu(dev->bus)) { - printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", - dev, pci_iommu(dev->bus)); - BUG(); - } - - tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); - if (!tbl) { - printk(KERN_ERR "Calgary: error allocating iommu_table\n"); - ret = -ENOMEM; - goto done; - } - - ret = tce_table_setparms(dev, tbl); - if (ret) - goto free_tbl; - - tbl->bbar = bbar; - - set_pci_iommu(dev->bus, tbl); - - return 0; - -free_tbl: - kfree(tbl); -done: - return ret; -} - -void * __init alloc_tce_table(void) -{ - unsigned int size; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - return memblock_alloc_low(size, size); -} - -void __init free_tce_table(void *tbl) -{ - unsigned int size; - - if (!tbl) - return; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - memblock_free(__pa(tbl), size); -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bb0f8447112..c90312146da0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,11 +37,6 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> - -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index ec534f978867..b8acf639abd1 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -364,12 +364,12 @@ retry: /* Force it to 0 if random warps brought us here */ atomic_set(&test_runs, 0); - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); + pr_warn("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); if (random_warps) - pr_warning("TSC warped randomly between CPUs\n"); + pr_warn("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); } diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 548fefed71ee..4d732a444711 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -1,6 +1,6 @@ /* - * umip.c Emulation for instruction protected by the Intel User-Mode - * Instruction Prevention feature + * umip.c Emulation for instruction protected by the User-Mode Instruction + * Prevention feature * * Copyright (c) 2017, Intel Corporation. * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> @@ -18,10 +18,10 @@ /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * - * The feature User-Mode Instruction Prevention present in recent Intel - * processor prevents a group of instructions (SGDT, SIDT, SLDT, SMSW and STR) - * from being executed with CPL > 0. Otherwise, a general protection fault is - * issued. + * User-Mode Instruction Prevention is a security feature present in recent + * x86 processors that, when enabled, prevents a group of instructions (SGDT, + * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general + * protection fault if the instruction is executed with CPL > 0. * * Rather than relaying to the user space the general protection fault caused by * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be @@ -91,7 +91,7 @@ const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warning(regs, fmt, ...) \ +#define umip_pr_warn(regs, fmt, ...) \ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) /** @@ -380,14 +380,14 @@ bool fixup_umip_exception(struct pt_regs *regs) if (umip_inst < 0) return false; - umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", + umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); /* Do not emulate (spoof) SLDT or STR. */ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) return false; - umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8cd745ef8c7b..15e5aad8ac2c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -842,8 +842,8 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @auprobe: the probepoint information. * @mm: the probed address space. - * @arch_uprobe: the probepoint information. * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index a024c4f7ba56..641f0fe1e5b4 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -31,7 +31,7 @@ #include <asm/cpufeatures.h> #include <asm/msr-index.h> -ENTRY(verify_cpu) +SYM_FUNC_START_LOCAL(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -137,4 +137,4 @@ ENTRY(verify_cpu) popf # Restore caller passed flags xorl %eax, %eax ret -ENDPROC(verify_cpu) +SYM_FUNC_END(verify_cpu) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e2feacf921a0..3a1a819da137 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -21,6 +21,9 @@ #define LOAD_OFFSET __START_KERNEL_map #endif +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -141,17 +144,12 @@ SECTIONS *(.text.__x86.indirect_thunk) __indirect_thunk_end = .; #endif + } :text =0xcccc - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - - EXCEPTION_TABLE(16) :text = 0x9090 - - /* .text should occupy whole number of pages */ + /* End of text section, which should occupy whole number of pages */ + _etext = .; . = ALIGN(PAGE_SIZE); + X86_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) X86_ALIGN_RODATA_END diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 18a799c8fa28..ce89430a7f80 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -31,6 +31,28 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } +static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +static __init void get_rtc_noop(struct timespec64 *now) { } + +static __initconst const struct of_device_id of_cmos_match[] = { + { .compatible = "motorola,mc146818" }, + {} +}; + +/* + * Allow devicetree configured systems to disable the RTC by setting the + * corresponding DT node's status property to disabled. Code is optimized + * out for CONFIG_OF=n builds. + */ +static __init void x86_wallclock_init(void) +{ + struct device_node *node = of_find_matching_node(NULL, of_cmos_match); + + if (node && !of_device_is_available(node)) { + x86_platform.get_wallclock = get_rtc_noop; + x86_platform.set_wallclock = set_rtc_noop; + } +} /* * The platform setup functions are preset with the default functions @@ -73,7 +95,7 @@ struct x86_init_ops x86_init __initdata = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, .timer_init = hpet_time_init, - .wallclock_init = x86_init_noop, + .wallclock_init = x86_wallclock_init, }, .iommu = { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d5e6d5b3f06f..bcc6a73d6628 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -508,8 +508,8 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) *filter = tmp; mutex_lock(&kvm->lock); - rcu_swap_protected(kvm->arch.pmu_event_filter, filter, - mutex_is_locked(&kvm->lock)); + filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, + mutex_is_locked(&kvm->lock)); mutex_unlock(&kvm->lock); synchronize_srcu_expedited(&kvm->srcu); diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 751a384c2eb0..81ada2ce99e7 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -43,7 +43,7 @@ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump * to vmx_vmexit. */ -ENTRY(vmx_vmenter) +SYM_FUNC_START(vmx_vmenter) /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ je 2f @@ -65,7 +65,7 @@ ENTRY(vmx_vmenter) _ASM_EXTABLE(1b, 5b) _ASM_EXTABLE(2b, 5b) -ENDPROC(vmx_vmenter) +SYM_FUNC_END(vmx_vmenter) /** * vmx_vmexit - Handle a VMX VM-Exit @@ -77,7 +77,7 @@ ENDPROC(vmx_vmenter) * here after hardware loads the host's state, i.e. this is the destination * referred to by VMCS.HOST_RIP. */ -ENTRY(vmx_vmexit) +SYM_FUNC_START(vmx_vmexit) #ifdef CONFIG_RETPOLINE ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE /* Preserve guest's RAX, it's used to stuff the RSB. */ @@ -90,7 +90,7 @@ ENTRY(vmx_vmexit) .Lvmexit_skip_rsb: #endif ret -ENDPROC(vmx_vmexit) +SYM_FUNC_END(vmx_vmexit) /** * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode @@ -101,7 +101,7 @@ ENDPROC(vmx_vmexit) * Returns: * 0 on VM-Exit, 1 on VM-Fail */ -ENTRY(__vmx_vcpu_run) +SYM_FUNC_START(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 @@ -233,4 +233,4 @@ ENTRY(__vmx_vcpu_run) /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ 2: mov $1, %eax jmp 1b -ENDPROC(__vmx_vcpu_run) +SYM_FUNC_END(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d175429c91b0..1b9ab4166397 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1368,14 +1368,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - /* - * VM exits change the host TR limit to 0x67 after a VM - * exit. This is okay, since 0x67 covers everything except - * the IO bitmap and have have code to handle the IO bitmap - * being lost after a VM exit. - */ - BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ed167e039e5..cf917139de6b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,6 +68,7 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/intel_pt.h> +#include <asm/emulate_prefix.h> #include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS @@ -5492,6 +5493,7 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); int handle_ud(struct kvm_vcpu *vcpu) { + static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; int emul_type = EMULTYPE_TRAP_UD; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; @@ -5499,7 +5501,7 @@ int handle_ud(struct kvm_vcpu *vcpu) if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && - memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { + memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); emul_type = EMULTYPE_TRAP_UD_FORCED; } diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index e0788bade5ab..3b6544111ac9 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -20,10 +20,10 @@ #define BEGIN(op) \ .macro endp; \ -ENDPROC(atomic64_##op##_386); \ +SYM_FUNC_END(atomic64_##op##_386); \ .purgem endp; \ .endm; \ -ENTRY(atomic64_##op##_386); \ +SYM_FUNC_START(atomic64_##op##_386); \ LOCK v; #define ENDP endp diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 843d978ee341..1c5c81c16b06 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -16,12 +16,12 @@ cmpxchg8b (\reg) .endm -ENTRY(atomic64_read_cx8) +SYM_FUNC_START(atomic64_read_cx8) read64 %ecx ret -ENDPROC(atomic64_read_cx8) +SYM_FUNC_END(atomic64_read_cx8) -ENTRY(atomic64_set_cx8) +SYM_FUNC_START(atomic64_set_cx8) 1: /* we don't need LOCK_PREFIX since aligned 64-bit writes * are atomic on 586 and newer */ @@ -29,19 +29,19 @@ ENTRY(atomic64_set_cx8) jne 1b ret -ENDPROC(atomic64_set_cx8) +SYM_FUNC_END(atomic64_set_cx8) -ENTRY(atomic64_xchg_cx8) +SYM_FUNC_START(atomic64_xchg_cx8) 1: LOCK_PREFIX cmpxchg8b (%esi) jne 1b ret -ENDPROC(atomic64_xchg_cx8) +SYM_FUNC_END(atomic64_xchg_cx8) .macro addsub_return func ins insc -ENTRY(atomic64_\func\()_return_cx8) +SYM_FUNC_START(atomic64_\func\()_return_cx8) pushl %ebp pushl %ebx pushl %esi @@ -69,14 +69,14 @@ ENTRY(atomic64_\func\()_return_cx8) popl %ebx popl %ebp ret -ENDPROC(atomic64_\func\()_return_cx8) +SYM_FUNC_END(atomic64_\func\()_return_cx8) .endm addsub_return add add adc addsub_return sub sub sbb .macro incdec_return func ins insc -ENTRY(atomic64_\func\()_return_cx8) +SYM_FUNC_START(atomic64_\func\()_return_cx8) pushl %ebx read64 %esi @@ -94,13 +94,13 @@ ENTRY(atomic64_\func\()_return_cx8) movl %ecx, %edx popl %ebx ret -ENDPROC(atomic64_\func\()_return_cx8) +SYM_FUNC_END(atomic64_\func\()_return_cx8) .endm incdec_return inc add adc incdec_return dec sub sbb -ENTRY(atomic64_dec_if_positive_cx8) +SYM_FUNC_START(atomic64_dec_if_positive_cx8) pushl %ebx read64 %esi @@ -119,9 +119,9 @@ ENTRY(atomic64_dec_if_positive_cx8) movl %ecx, %edx popl %ebx ret -ENDPROC(atomic64_dec_if_positive_cx8) +SYM_FUNC_END(atomic64_dec_if_positive_cx8) -ENTRY(atomic64_add_unless_cx8) +SYM_FUNC_START(atomic64_add_unless_cx8) pushl %ebp pushl %ebx /* these just push these two parameters on the stack */ @@ -155,9 +155,9 @@ ENTRY(atomic64_add_unless_cx8) jne 2b xorl %eax, %eax jmp 3b -ENDPROC(atomic64_add_unless_cx8) +SYM_FUNC_END(atomic64_add_unless_cx8) -ENTRY(atomic64_inc_not_zero_cx8) +SYM_FUNC_START(atomic64_inc_not_zero_cx8) pushl %ebx read64 %esi @@ -177,4 +177,4 @@ ENTRY(atomic64_inc_not_zero_cx8) 3: popl %ebx ret -ENDPROC(atomic64_inc_not_zero_cx8) +SYM_FUNC_END(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4df90c9ea383..4742e8fa7ee7 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -46,7 +46,7 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * Fortunately, it is easy to convert 2-byte alignment to 4-byte * alignment for the unrolled loop. */ -ENTRY(csum_partial) +SYM_FUNC_START(csum_partial) pushl %esi pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum @@ -128,13 +128,13 @@ ENTRY(csum_partial) popl %ebx popl %esi ret -ENDPROC(csum_partial) +SYM_FUNC_END(csum_partial) #else /* Version for PentiumII/PPro */ -ENTRY(csum_partial) +SYM_FUNC_START(csum_partial) pushl %esi pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum @@ -246,7 +246,7 @@ ENTRY(csum_partial) popl %ebx popl %esi ret -ENDPROC(csum_partial) +SYM_FUNC_END(csum_partial) #endif EXPORT_SYMBOL(csum_partial) @@ -280,7 +280,7 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define ARGBASE 16 #define FP 12 -ENTRY(csum_partial_copy_generic) +SYM_FUNC_START(csum_partial_copy_generic) subl $4,%esp pushl %edi pushl %esi @@ -398,7 +398,7 @@ DST( movb %cl, (%edi) ) popl %edi popl %ecx # equivalent to addl $4,%esp ret -ENDPROC(csum_partial_copy_generic) +SYM_FUNC_END(csum_partial_copy_generic) #else @@ -416,7 +416,7 @@ ENDPROC(csum_partial_copy_generic) #define ARGBASE 12 -ENTRY(csum_partial_copy_generic) +SYM_FUNC_START(csum_partial_copy_generic) pushl %ebx pushl %edi pushl %esi @@ -483,7 +483,7 @@ DST( movb %dl, (%edi) ) popl %edi popl %ebx ret -ENDPROC(csum_partial_copy_generic) +SYM_FUNC_END(csum_partial_copy_generic) #undef ROUND #undef ROUND1 diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 75a5a4515fa7..c4c7dd115953 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -13,15 +13,15 @@ * Zero a page. * %rdi - page */ -ENTRY(clear_page_rep) +SYM_FUNC_START(clear_page_rep) movl $4096/8,%ecx xorl %eax,%eax rep stosq ret -ENDPROC(clear_page_rep) +SYM_FUNC_END(clear_page_rep) EXPORT_SYMBOL_GPL(clear_page_rep) -ENTRY(clear_page_orig) +SYM_FUNC_START(clear_page_orig) xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -40,13 +40,13 @@ ENTRY(clear_page_orig) jnz .Lloop nop ret -ENDPROC(clear_page_orig) +SYM_FUNC_END(clear_page_orig) EXPORT_SYMBOL_GPL(clear_page_orig) -ENTRY(clear_page_erms) +SYM_FUNC_START(clear_page_erms) movl $4096,%ecx xorl %eax,%eax rep stosb ret -ENDPROC(clear_page_erms) +SYM_FUNC_END(clear_page_erms) EXPORT_SYMBOL_GPL(clear_page_erms) diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index d63185698a23..3542502faa3b 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -13,7 +13,7 @@ * %rcx : high 64 bits of new value * %al : Operation successful */ -ENTRY(this_cpu_cmpxchg16b_emu) +SYM_FUNC_START(this_cpu_cmpxchg16b_emu) # # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not @@ -44,4 +44,4 @@ ENTRY(this_cpu_cmpxchg16b_emu) xor %al,%al ret -ENDPROC(this_cpu_cmpxchg16b_emu) +SYM_FUNC_END(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 691d80e97488..ca01ed6029f4 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -13,7 +13,7 @@ * %ebx : low 32 bits of new value * %ecx : high 32 bits of new value */ -ENTRY(cmpxchg8b_emu) +SYM_FUNC_START(cmpxchg8b_emu) # # Emulate 'cmpxchg8b (%esi)' on UP except we don't @@ -42,5 +42,5 @@ ENTRY(cmpxchg8b_emu) popfl ret -ENDPROC(cmpxchg8b_emu) +SYM_FUNC_END(cmpxchg8b_emu) EXPORT_SYMBOL(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index fd2d09afa097..2402d4c489d2 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -13,15 +13,15 @@ * prefetch distance based on SMP/UP. */ ALIGN -ENTRY(copy_page) +SYM_FUNC_START(copy_page) ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret -ENDPROC(copy_page) +SYM_FUNC_END(copy_page) EXPORT_SYMBOL(copy_page) -ENTRY(copy_page_regs) +SYM_FUNC_START_LOCAL(copy_page_regs) subq $2*8, %rsp movq %rbx, (%rsp) movq %r12, 1*8(%rsp) @@ -86,4 +86,4 @@ ENTRY(copy_page_regs) movq 1*8(%rsp), %r12 addq $2*8, %rsp ret -ENDPROC(copy_page_regs) +SYM_FUNC_END(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 86976b55ae74..816f128a6d52 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -53,7 +53,7 @@ * Output: * eax uncopied bytes or 0 if successful. */ -ENTRY(copy_user_generic_unrolled) +SYM_FUNC_START(copy_user_generic_unrolled) ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -136,7 +136,7 @@ ENTRY(copy_user_generic_unrolled) _ASM_EXTABLE_UA(19b, 40b) _ASM_EXTABLE_UA(21b, 50b) _ASM_EXTABLE_UA(22b, 50b) -ENDPROC(copy_user_generic_unrolled) +SYM_FUNC_END(copy_user_generic_unrolled) EXPORT_SYMBOL(copy_user_generic_unrolled) /* Some CPUs run faster using the string copy instructions. @@ -157,7 +157,7 @@ EXPORT_SYMBOL(copy_user_generic_unrolled) * Output: * eax uncopied bytes or 0 if successful. */ -ENTRY(copy_user_generic_string) +SYM_FUNC_START(copy_user_generic_string) ASM_STAC cmpl $8,%edx jb 2f /* less than 8 bytes, go to byte copy loop */ @@ -182,7 +182,7 @@ ENTRY(copy_user_generic_string) _ASM_EXTABLE_UA(1b, 11b) _ASM_EXTABLE_UA(3b, 12b) -ENDPROC(copy_user_generic_string) +SYM_FUNC_END(copy_user_generic_string) EXPORT_SYMBOL(copy_user_generic_string) /* @@ -197,7 +197,7 @@ EXPORT_SYMBOL(copy_user_generic_string) * Output: * eax uncopied bytes or 0 if successful. */ -ENTRY(copy_user_enhanced_fast_string) +SYM_FUNC_START(copy_user_enhanced_fast_string) ASM_STAC cmpl $64,%edx jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */ @@ -214,7 +214,7 @@ ENTRY(copy_user_enhanced_fast_string) .previous _ASM_EXTABLE_UA(1b, 12b) -ENDPROC(copy_user_enhanced_fast_string) +SYM_FUNC_END(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) /* @@ -230,8 +230,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Output: * eax uncopied bytes or 0 if successful. */ -ALIGN; -.Lcopy_user_handle_tail: +SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax @@ -239,7 +238,7 @@ ALIGN; ret _ASM_EXTABLE_UA(1b, 2b) -END(.Lcopy_user_handle_tail) +SYM_CODE_END(.Lcopy_user_handle_tail) /* * copy_user_nocache - Uncached memory copy with exception handling @@ -250,7 +249,7 @@ END(.Lcopy_user_handle_tail) * - Require 8-byte alignment when size is 8 bytes or larger. * - Require 4-byte alignment when size is 4 bytes. */ -ENTRY(__copy_user_nocache) +SYM_FUNC_START(__copy_user_nocache) ASM_STAC /* If size is less than 8 bytes, go to 4-byte copy */ @@ -389,5 +388,5 @@ ENTRY(__copy_user_nocache) _ASM_EXTABLE_UA(31b, .L_fixup_4b_copy) _ASM_EXTABLE_UA(40b, .L_fixup_1b_copy) _ASM_EXTABLE_UA(41b, .L_fixup_1b_copy) -ENDPROC(__copy_user_nocache) +SYM_FUNC_END(__copy_user_nocache) EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index a4a379e79259..3394a8ff7fd0 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -49,7 +49,7 @@ .endm -ENTRY(csum_partial_copy_generic) +SYM_FUNC_START(csum_partial_copy_generic) cmpl $3*64, %edx jle .Lignore @@ -225,4 +225,4 @@ ENTRY(csum_partial_copy_generic) jz .Lende movl $-EFAULT, (%rax) jmp .Lende -ENDPROC(csum_partial_copy_generic) +SYM_FUNC_END(csum_partial_copy_generic) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 9578eb88fc87..c8a85b512796 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -36,7 +36,7 @@ #include <asm/export.h> .text -ENTRY(__get_user_1) +SYM_FUNC_START(__get_user_1) mov PER_CPU_VAR(current_task), %_ASM_DX cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user @@ -47,10 +47,10 @@ ENTRY(__get_user_1) xor %eax,%eax ASM_CLAC ret -ENDPROC(__get_user_1) +SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) -ENTRY(__get_user_2) +SYM_FUNC_START(__get_user_2) add $1,%_ASM_AX jc bad_get_user mov PER_CPU_VAR(current_task), %_ASM_DX @@ -63,10 +63,10 @@ ENTRY(__get_user_2) xor %eax,%eax ASM_CLAC ret -ENDPROC(__get_user_2) +SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) -ENTRY(__get_user_4) +SYM_FUNC_START(__get_user_4) add $3,%_ASM_AX jc bad_get_user mov PER_CPU_VAR(current_task), %_ASM_DX @@ -79,10 +79,10 @@ ENTRY(__get_user_4) xor %eax,%eax ASM_CLAC ret -ENDPROC(__get_user_4) +SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) -ENTRY(__get_user_8) +SYM_FUNC_START(__get_user_8) #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user @@ -111,25 +111,27 @@ ENTRY(__get_user_8) ASM_CLAC ret #endif -ENDPROC(__get_user_8) +SYM_FUNC_END(__get_user_8) EXPORT_SYMBOL(__get_user_8) -.Lbad_get_user_clac: +SYM_CODE_START_LOCAL(.Lbad_get_user_clac) ASM_CLAC bad_get_user: xor %edx,%edx mov $(-EFAULT),%_ASM_AX ret +SYM_CODE_END(.Lbad_get_user_clac) #ifdef CONFIG_X86_32 -.Lbad_get_user_8_clac: +SYM_CODE_START_LOCAL(.Lbad_get_user_8_clac) ASM_CLAC bad_get_user_8: xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX ret +SYM_CODE_END(.Lbad_get_user_8_clac) #endif _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac) diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index a14f9939c365..dbf8cc97b7f5 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -8,7 +8,7 @@ * unsigned int __sw_hweight32(unsigned int w) * %rdi: w */ -ENTRY(__sw_hweight32) +SYM_FUNC_START(__sw_hweight32) #ifdef CONFIG_X86_64 movl %edi, %eax # w @@ -33,10 +33,10 @@ ENTRY(__sw_hweight32) shrl $24, %eax # w = w_tmp >> 24 __ASM_SIZE(pop,) %__ASM_REG(dx) ret -ENDPROC(__sw_hweight32) +SYM_FUNC_END(__sw_hweight32) EXPORT_SYMBOL(__sw_hweight32) -ENTRY(__sw_hweight64) +SYM_FUNC_START(__sw_hweight64) #ifdef CONFIG_X86_64 pushq %rdi pushq %rdx @@ -79,5 +79,5 @@ ENTRY(__sw_hweight64) popl %ecx ret #endif -ENDPROC(__sw_hweight64) +SYM_FUNC_END(__sw_hweight64) EXPORT_SYMBOL(__sw_hweight64) diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 0b5862ba6a75..404279563891 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -13,6 +13,8 @@ #include <asm/inat.h> #include <asm/insn.h> +#include <asm/emulate_prefix.h> + /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) @@ -58,6 +60,36 @@ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) insn->addr_bytes = 4; } +static const insn_byte_t xen_prefix[] = { __XEN_EMULATE_PREFIX }; +static const insn_byte_t kvm_prefix[] = { __KVM_EMULATE_PREFIX }; + +static int __insn_get_emulate_prefix(struct insn *insn, + const insn_byte_t *prefix, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (peek_nbyte_next(insn_byte_t, insn, i) != prefix[i]) + goto err_out; + } + + insn->emulate_prefix_size = len; + insn->next_byte += len; + + return 1; + +err_out: + return 0; +} + +static void insn_get_emulate_prefix(struct insn *insn) +{ + if (__insn_get_emulate_prefix(insn, xen_prefix, sizeof(xen_prefix))) + return; + + __insn_get_emulate_prefix(insn, kvm_prefix, sizeof(kvm_prefix)); +} + /** * insn_get_prefixes - scan x86 instruction prefix bytes * @insn: &struct insn containing instruction @@ -76,6 +108,8 @@ void insn_get_prefixes(struct insn *insn) if (prefixes->got) return; + insn_get_emulate_prefix(insn); + nb = 0; lb = 0; b = peek_next(insn_byte_t, insn); diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index a9bdf0805be0..cb5a1964506b 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -8,8 +8,8 @@ /* * override generic version in lib/iomap_copy.c */ -ENTRY(__iowrite32_copy) +SYM_FUNC_START(__iowrite32_copy) movl %edx,%ecx rep movsd ret -ENDPROC(__iowrite32_copy) +SYM_FUNC_END(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 92748660ba51..56b243b14c3a 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -28,8 +28,8 @@ * Output: * rax original destination */ -ENTRY(__memcpy) -ENTRY(memcpy) +SYM_FUNC_START_ALIAS(__memcpy) +SYM_FUNC_START_LOCAL(memcpy) ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ "jmp memcpy_erms", X86_FEATURE_ERMS @@ -41,8 +41,8 @@ ENTRY(memcpy) movl %edx, %ecx rep movsb ret -ENDPROC(memcpy) -ENDPROC(__memcpy) +SYM_FUNC_END(memcpy) +SYM_FUNC_END_ALIAS(__memcpy) EXPORT_SYMBOL(memcpy) EXPORT_SYMBOL(__memcpy) @@ -50,14 +50,14 @@ EXPORT_SYMBOL(__memcpy) * memcpy_erms() - enhanced fast string memcpy. This is faster and * simpler than memcpy. Use memcpy_erms when possible. */ -ENTRY(memcpy_erms) +SYM_FUNC_START_LOCAL(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb ret -ENDPROC(memcpy_erms) +SYM_FUNC_END(memcpy_erms) -ENTRY(memcpy_orig) +SYM_FUNC_START_LOCAL(memcpy_orig) movq %rdi, %rax cmpq $0x20, %rdx @@ -182,7 +182,7 @@ ENTRY(memcpy_orig) .Lend: retq -ENDPROC(memcpy_orig) +SYM_FUNC_END(memcpy_orig) #ifndef CONFIG_UML @@ -193,7 +193,7 @@ MCSAFE_TEST_CTL * Note that we only catch machine checks when reading the source addresses. * Writes to target are posted and don't generate machine checks. */ -ENTRY(__memcpy_mcsafe) +SYM_FUNC_START(__memcpy_mcsafe) cmpl $8, %edx /* Less than 8 bytes? Go to byte copy loop */ jb .L_no_whole_words @@ -260,7 +260,7 @@ ENTRY(__memcpy_mcsafe) xorl %eax, %eax .L_done: ret -ENDPROC(__memcpy_mcsafe) +SYM_FUNC_END(__memcpy_mcsafe) EXPORT_SYMBOL_GPL(__memcpy_mcsafe) .section .fixup, "ax" diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index bbec69d8223b..337830d7a59c 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -26,8 +26,8 @@ */ .weak memmove -ENTRY(memmove) -ENTRY(__memmove) +SYM_FUNC_START_ALIAS(memmove) +SYM_FUNC_START(__memmove) /* Handle more 32 bytes in loop */ mov %rdi, %rax @@ -207,7 +207,7 @@ ENTRY(__memmove) movb %r11b, (%rdi) 13: retq -ENDPROC(__memmove) -ENDPROC(memmove) +SYM_FUNC_END(__memmove) +SYM_FUNC_END_ALIAS(memmove) EXPORT_SYMBOL(__memmove) EXPORT_SYMBOL(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 9bc861c71e75..9ff15ee404a4 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -19,8 +19,8 @@ * * rax original destination */ -ENTRY(memset) -ENTRY(__memset) +SYM_FUNC_START_ALIAS(memset) +SYM_FUNC_START(__memset) /* * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended * to use it when possible. If not available, use fast string instructions. @@ -43,8 +43,8 @@ ENTRY(__memset) rep stosb movq %r9,%rax ret -ENDPROC(memset) -ENDPROC(__memset) +SYM_FUNC_END(__memset) +SYM_FUNC_END_ALIAS(memset) EXPORT_SYMBOL(memset) EXPORT_SYMBOL(__memset) @@ -59,16 +59,16 @@ EXPORT_SYMBOL(__memset) * * rax original destination */ -ENTRY(memset_erms) +SYM_FUNC_START_LOCAL(memset_erms) movq %rdi,%r9 movb %sil,%al movq %rdx,%rcx rep stosb movq %r9,%rax ret -ENDPROC(memset_erms) +SYM_FUNC_END(memset_erms) -ENTRY(memset_orig) +SYM_FUNC_START_LOCAL(memset_orig) movq %rdi,%r10 /* expand byte value */ @@ -139,4 +139,4 @@ ENTRY(memset_orig) subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: -ENDPROC(memset_orig) +SYM_FUNC_END(memset_orig) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index ed33cbab3958..a2b9caa5274c 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -12,7 +12,7 @@ * */ .macro op_safe_regs op -ENTRY(\op\()_safe_regs) +SYM_FUNC_START(\op\()_safe_regs) pushq %rbx pushq %r12 movq %rdi, %r10 /* Save pointer */ @@ -41,13 +41,13 @@ ENTRY(\op\()_safe_regs) jmp 2b _ASM_EXTABLE(1b, 3b) -ENDPROC(\op\()_safe_regs) +SYM_FUNC_END(\op\()_safe_regs) .endm #else /* X86_32 */ .macro op_safe_regs op -ENTRY(\op\()_safe_regs) +SYM_FUNC_START(\op\()_safe_regs) pushl %ebx pushl %ebp pushl %esi @@ -83,7 +83,7 @@ ENTRY(\op\()_safe_regs) jmp 2b _ASM_EXTABLE(1b, 3b) -ENDPROC(\op\()_safe_regs) +SYM_FUNC_END(\op\()_safe_regs) .endm #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 126dd6a9ec9b..7c7c92db8497 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -34,7 +34,7 @@ #define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX .text -ENTRY(__put_user_1) +SYM_FUNC_START(__put_user_1) ENTER cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX jae .Lbad_put_user @@ -43,10 +43,10 @@ ENTRY(__put_user_1) xor %eax,%eax ASM_CLAC ret -ENDPROC(__put_user_1) +SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) -ENTRY(__put_user_2) +SYM_FUNC_START(__put_user_2) ENTER mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $1,%_ASM_BX @@ -57,10 +57,10 @@ ENTRY(__put_user_2) xor %eax,%eax ASM_CLAC ret -ENDPROC(__put_user_2) +SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) -ENTRY(__put_user_4) +SYM_FUNC_START(__put_user_4) ENTER mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $3,%_ASM_BX @@ -71,10 +71,10 @@ ENTRY(__put_user_4) xor %eax,%eax ASM_CLAC ret -ENDPROC(__put_user_4) +SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) -ENTRY(__put_user_8) +SYM_FUNC_START(__put_user_8) ENTER mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $7,%_ASM_BX @@ -88,14 +88,15 @@ ENTRY(__put_user_8) xor %eax,%eax ASM_CLAC RET -ENDPROC(__put_user_8) +SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) -.Lbad_put_user_clac: +SYM_CODE_START_LOCAL(.Lbad_put_user_clac) ASM_CLAC .Lbad_put_user: movl $-EFAULT,%eax RET +SYM_CODE_END(.Lbad_put_user_clac) _ASM_EXTABLE_UA(1b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index c909961e678a..363ec132df7e 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -11,11 +11,11 @@ .macro THUNK reg .section .text.__x86.indirect_thunk -ENTRY(__x86_indirect_thunk_\reg) +SYM_FUNC_START(__x86_indirect_thunk_\reg) CFI_STARTPROC JMP_NOSPEC %\reg CFI_ENDPROC -ENDPROC(__x86_indirect_thunk_\reg) +SYM_FUNC_END(__x86_indirect_thunk_\reg) .endm /* diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index e0b85930dd77..0a0e9112f284 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -333,7 +333,7 @@ AVXcode: 1 06: CLTS 07: SYSRET (o64) 08: INVD -09: WBINVD +09: WBINVD | WBNOINVD (F3) 0a: 0b: UD2 (1B) 0c: @@ -364,7 +364,7 @@ AVXcode: 1 # a ModR/M byte. 1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev 1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv -1c: +1c: Grp20 (1A),(1C) 1d: 1e: 1f: NOP Ev @@ -792,6 +792,8 @@ f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) +f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) +f9: MOVDIRI My,Gy EndTable Table: 3-byte opcode 2 (0x0f 0x3a) @@ -943,9 +945,9 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: rdpkru (110),(11B) | wrpkru (111),(11B) @@ -1020,7 +1022,7 @@ GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE | ptwrite Ey (F3),(11B) 5: XRSTOR | lfence (11B) -6: XSAVEOPT | clwb (66) | mfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B) 7: clflush | clflushopt (66) | sfence (11B) EndTable @@ -1051,6 +1053,10 @@ GrpTable: Grp19 6: vscatterpf1qps/d Wx (66),(ev) EndTable +GrpTable: Grp20 +0: cldemote Mb +EndTable + # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S index ee08449d20fd..951da2ad54bb 100644 --- a/arch/x86/math-emu/div_Xsig.S +++ b/arch/x86/math-emu/div_Xsig.S @@ -75,7 +75,7 @@ FPU_result_1: .text -ENTRY(div_Xsig) +SYM_FUNC_START(div_Xsig) pushl %ebp movl %esp,%ebp #ifndef NON_REENTRANT_FPU @@ -364,4 +364,4 @@ L_bugged_2: pop %ebx jmp L_exit #endif /* PARANOID */ -ENDPROC(div_Xsig) +SYM_FUNC_END(div_Xsig) diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S index 8f5025c80ee0..d047d1816abe 100644 --- a/arch/x86/math-emu/div_small.S +++ b/arch/x86/math-emu/div_small.S @@ -19,7 +19,7 @@ #include "fpu_emu.h" .text -ENTRY(FPU_div_small) +SYM_FUNC_START(FPU_div_small) pushl %ebp movl %esp,%ebp @@ -45,4 +45,4 @@ ENTRY(FPU_div_small) leave ret -ENDPROC(FPU_div_small) +SYM_FUNC_END(FPU_div_small) diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index f98a0c956764..9b41391867dc 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -107,6 +107,8 @@ static inline bool seg_writable(struct desc_struct *d) #define FPU_access_ok(y,z) if ( !access_ok(y,z) ) \ math_abort(FPU_info,SIGSEGV) #define FPU_abort math_abort(FPU_info, SIGSEGV) +#define FPU_copy_from_user(to, from, n) \ + do { if (copy_from_user(to, from, n)) FPU_abort; } while (0) #undef FPU_IGNORE_CODE_SEGV #ifdef FPU_IGNORE_CODE_SEGV @@ -122,7 +124,7 @@ static inline bool seg_writable(struct desc_struct *d) #define FPU_code_access_ok(z) FPU_access_ok((void __user *)FPU_EIP,z) #endif -#define FPU_get_user(x,y) get_user((x),(y)) -#define FPU_put_user(x,y) put_user((x),(y)) +#define FPU_get_user(x,y) do { if (get_user((x),(y))) FPU_abort; } while (0) +#define FPU_put_user(x,y) do { if (put_user((x),(y))) FPU_abort; } while (0) #endif diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S index 3e489122a2b0..4afc7b1fa6e9 100644 --- a/arch/x86/math-emu/mul_Xsig.S +++ b/arch/x86/math-emu/mul_Xsig.S @@ -25,7 +25,7 @@ #include "fpu_emu.h" .text -ENTRY(mul32_Xsig) +SYM_FUNC_START(mul32_Xsig) pushl %ebp movl %esp,%ebp subl $16,%esp @@ -63,10 +63,10 @@ ENTRY(mul32_Xsig) popl %esi leave ret -ENDPROC(mul32_Xsig) +SYM_FUNC_END(mul32_Xsig) -ENTRY(mul64_Xsig) +SYM_FUNC_START(mul64_Xsig) pushl %ebp movl %esp,%ebp subl $16,%esp @@ -116,11 +116,11 @@ ENTRY(mul64_Xsig) popl %esi leave ret -ENDPROC(mul64_Xsig) +SYM_FUNC_END(mul64_Xsig) -ENTRY(mul_Xsig_Xsig) +SYM_FUNC_START(mul_Xsig_Xsig) pushl %ebp movl %esp,%ebp subl $16,%esp @@ -176,4 +176,4 @@ ENTRY(mul_Xsig_Xsig) popl %esi leave ret -ENDPROC(mul_Xsig_Xsig) +SYM_FUNC_END(mul_Xsig_Xsig) diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S index 604f0b2d17e8..702315eecb86 100644 --- a/arch/x86/math-emu/polynom_Xsig.S +++ b/arch/x86/math-emu/polynom_Xsig.S @@ -37,7 +37,7 @@ #define OVERFLOWED -16(%ebp) /* addition overflow flag */ .text -ENTRY(polynomial_Xsig) +SYM_FUNC_START(polynomial_Xsig) pushl %ebp movl %esp,%ebp subl $32,%esp @@ -134,4 +134,4 @@ L_accum_done: popl %esi leave ret -ENDPROC(polynomial_Xsig) +SYM_FUNC_END(polynomial_Xsig) diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index f3779743d15e..fe6246ff9887 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -85,7 +85,7 @@ int FPU_load_extended(long double __user *s, int stnr) RE_ENTRANT_CHECK_OFF; FPU_access_ok(s, 10); - __copy_from_user(sti_ptr, s, 10); + FPU_copy_from_user(sti_ptr, s, 10); RE_ENTRANT_CHECK_ON; return FPU_tagof(sti_ptr); @@ -1126,9 +1126,9 @@ void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) /* Copy all registers in stack order. */ RE_ENTRANT_CHECK_OFF; FPU_access_ok(s, 80); - __copy_from_user(register_base + offset, s, other); + FPU_copy_from_user(register_base + offset, s, other); if (offset) - __copy_from_user(register_base, s + other, offset); + FPU_copy_from_user(register_base, s + other, offset); RE_ENTRANT_CHECK_ON; for (i = 0; i < 8; i++) { diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S index 7f6b4392a15d..cad1d60b1e84 100644 --- a/arch/x86/math-emu/reg_norm.S +++ b/arch/x86/math-emu/reg_norm.S @@ -22,7 +22,7 @@ .text -ENTRY(FPU_normalize) +SYM_FUNC_START(FPU_normalize) pushl %ebp movl %esp,%ebp pushl %ebx @@ -95,12 +95,12 @@ L_overflow: call arith_overflow pop %ebx jmp L_exit -ENDPROC(FPU_normalize) +SYM_FUNC_END(FPU_normalize) /* Normalise without reporting underflow or overflow */ -ENTRY(FPU_normalize_nuo) +SYM_FUNC_START(FPU_normalize_nuo) pushl %ebp movl %esp,%ebp pushl %ebx @@ -147,4 +147,4 @@ L_exit_nuo_zero: popl %ebx leave ret -ENDPROC(FPU_normalize_nuo) +SYM_FUNC_END(FPU_normalize_nuo) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index 04563421ee7d..11a1f798451b 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -109,7 +109,7 @@ FPU_denormal: .globl fpu_Arith_exit /* Entry point when called from C */ -ENTRY(FPU_round) +SYM_FUNC_START(FPU_round) pushl %ebp movl %esp,%ebp pushl %esi @@ -708,4 +708,4 @@ L_exception_exit: jmp fpu_reg_round_special_exit #endif /* PARANOID */ -ENDPROC(FPU_round) +SYM_FUNC_END(FPU_round) diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S index 50fe9f8c893c..9c9e2c810afe 100644 --- a/arch/x86/math-emu/reg_u_add.S +++ b/arch/x86/math-emu/reg_u_add.S @@ -32,7 +32,7 @@ #include "control_w.h" .text -ENTRY(FPU_u_add) +SYM_FUNC_START(FPU_u_add) pushl %ebp movl %esp,%ebp pushl %esi @@ -166,4 +166,4 @@ L_exit: leave ret #endif /* PARANOID */ -ENDPROC(FPU_u_add) +SYM_FUNC_END(FPU_u_add) diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S index 94d545e118e4..e2fb5c2644c5 100644 --- a/arch/x86/math-emu/reg_u_div.S +++ b/arch/x86/math-emu/reg_u_div.S @@ -75,7 +75,7 @@ FPU_ovfl_flag: #define DEST PARAM3 .text -ENTRY(FPU_u_div) +SYM_FUNC_START(FPU_u_div) pushl %ebp movl %esp,%ebp #ifndef NON_REENTRANT_FPU @@ -471,4 +471,4 @@ L_exit: ret #endif /* PARANOID */ -ENDPROC(FPU_u_div) +SYM_FUNC_END(FPU_u_div) diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S index 21cde47fb3e5..0c779c87ac5b 100644 --- a/arch/x86/math-emu/reg_u_mul.S +++ b/arch/x86/math-emu/reg_u_mul.S @@ -45,7 +45,7 @@ FPU_accum_1: .text -ENTRY(FPU_u_mul) +SYM_FUNC_START(FPU_u_mul) pushl %ebp movl %esp,%ebp #ifndef NON_REENTRANT_FPU @@ -147,4 +147,4 @@ L_exit: ret #endif /* PARANOID */ -ENDPROC(FPU_u_mul) +SYM_FUNC_END(FPU_u_mul) diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S index f05dea7dec38..e9bb7c248649 100644 --- a/arch/x86/math-emu/reg_u_sub.S +++ b/arch/x86/math-emu/reg_u_sub.S @@ -33,7 +33,7 @@ #include "control_w.h" .text -ENTRY(FPU_u_sub) +SYM_FUNC_START(FPU_u_sub) pushl %ebp movl %esp,%ebp pushl %esi @@ -271,4 +271,4 @@ L_exit: popl %esi leave ret -ENDPROC(FPU_u_sub) +SYM_FUNC_END(FPU_u_sub) diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S index 226a51e991f1..d9d7de8dbd7b 100644 --- a/arch/x86/math-emu/round_Xsig.S +++ b/arch/x86/math-emu/round_Xsig.S @@ -23,7 +23,7 @@ .text -ENTRY(round_Xsig) +SYM_FUNC_START(round_Xsig) pushl %ebp movl %esp,%ebp pushl %ebx /* Reserve some space */ @@ -79,11 +79,11 @@ L_exit: popl %ebx leave ret -ENDPROC(round_Xsig) +SYM_FUNC_END(round_Xsig) -ENTRY(norm_Xsig) +SYM_FUNC_START(norm_Xsig) pushl %ebp movl %esp,%ebp pushl %ebx /* Reserve some space */ @@ -139,4 +139,4 @@ L_n_exit: popl %ebx leave ret -ENDPROC(norm_Xsig) +SYM_FUNC_END(norm_Xsig) diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S index 96f4779aa9c1..726af985f758 100644 --- a/arch/x86/math-emu/shr_Xsig.S +++ b/arch/x86/math-emu/shr_Xsig.S @@ -22,7 +22,7 @@ #include "fpu_emu.h" .text -ENTRY(shr_Xsig) +SYM_FUNC_START(shr_Xsig) push %ebp movl %esp,%ebp pushl %esi @@ -86,4 +86,4 @@ L_more_than_95: popl %esi leave ret -ENDPROC(shr_Xsig) +SYM_FUNC_END(shr_Xsig) diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S index d588874eb6fb..4fc89174caf0 100644 --- a/arch/x86/math-emu/wm_shrx.S +++ b/arch/x86/math-emu/wm_shrx.S @@ -33,7 +33,7 @@ | Results returned in the 64 bit arg and eax. | +---------------------------------------------------------------------------*/ -ENTRY(FPU_shrx) +SYM_FUNC_START(FPU_shrx) push %ebp movl %esp,%ebp pushl %esi @@ -93,7 +93,7 @@ L_more_than_95: popl %esi leave ret -ENDPROC(FPU_shrx) +SYM_FUNC_END(FPU_shrx) /*---------------------------------------------------------------------------+ @@ -112,7 +112,7 @@ ENDPROC(FPU_shrx) | part which has been shifted out of the arg. | | Results returned in the 64 bit arg and eax. | +---------------------------------------------------------------------------*/ -ENTRY(FPU_shrxs) +SYM_FUNC_START(FPU_shrxs) push %ebp movl %esp,%ebp pushl %esi @@ -204,4 +204,4 @@ Ls_more_than_95: popl %esi leave ret -ENDPROC(FPU_shrxs) +SYM_FUNC_END(FPU_shrxs) diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index f031c0e19356..3b2b58164ec1 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -75,7 +75,7 @@ FPU_fsqrt_arg_0: .text -ENTRY(wm_sqrt) +SYM_FUNC_START(wm_sqrt) pushl %ebp movl %esp,%ebp #ifndef NON_REENTRANT_FPU @@ -469,4 +469,4 @@ sqrt_more_prec_large: /* Our estimate is too large */ movl $0x7fffff00,%eax jmp sqrt_round_result -ENDPROC(wm_sqrt) +SYM_FUNC_END(wm_sqrt) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 84373dc9b341..3b89c201ac26 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -23,7 +23,7 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp) CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace -obj-$(CONFIG_X86_PAT) += pat_rbtree.o +obj-$(CONFIG_X86_PAT) += pat_interval.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 752ad11d6868..82ead8e27888 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -161,6 +161,14 @@ static void __init setup_cpu_entry_area(unsigned int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + /* + * VMX changes the host TR limit to 0x67 after a VM exit. This is + * okay, since 0x67 covers the size of struct x86_hw_tss. Make sure + * that this is correct. + */ + BUILD_BUG_ON(offsetof(struct tss_struct, x86_tss) != 0); + BUILD_BUG_ON(sizeof(struct x86_hw_tss) != 0x68); + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); @@ -178,7 +186,9 @@ static __init void setup_cpu_entry_area_ptes(void) #ifdef CONFIG_X86_32 unsigned long start, end; - BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + /* The +1 is for the readonly IDT: */ + BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE); + BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE); BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); start = CPU_ENTRY_AREA_BASE; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 4d75bc656f97..30bb0bd3b1b8 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -45,55 +45,6 @@ __visible bool ex_handler_fault(const struct exception_table_entry *fixup, EXPORT_SYMBOL_GPL(ex_handler_fault); /* - * Handler for UD0 exception following a failed test against the - * result of a refcount inc/dec/add/sub. - */ -__visible bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* First unconditionally saturate the refcount. */ - *(int *)regs->cx = INT_MIN / 2; - - /* - * Strictly speaking, this reports the fixup destination, not - * the fault location, and not the actually overflowing - * instruction, which is the instruction before the "js", but - * since that instruction could be a variety of lengths, just - * report the location after the overflow, which should be close - * enough for finding the overflow, as it's at least back in - * the function, having returned from .text.unlikely. - */ - regs->ip = ex_fixup_addr(fixup); - - /* - * This function has been called because either a negative refcount - * value was seen by any of the refcount functions, or a zero - * refcount value was seen by refcount_dec(). - * - * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result - * wrapped around) will be set. Additionally, seeing the refcount - * reach 0 will set ZF (Zero Flag: result was zero). In each of - * these cases we want a report, since it's a boundary condition. - * The SF case is not reported since it indicates post-boundary - * manipulations below zero or above INT_MAX. And if none of the - * flags are set, something has gone very wrong, so report it. - */ - if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { - bool zero = regs->flags & X86_EFLAGS_ZF; - - refcount_error_report(regs, zero ? "hit zero" : "overflow"); - } else if ((regs->flags & X86_EFLAGS_SF) == 0) { - /* Report if none of OF, ZF, nor SF are set. */ - refcount_error_report(regs, "unexpected saturation"); - } - - return true; -} -EXPORT_SYMBOL(ex_handler_refcount); - -/* * Handler for when we fail to restore a task's FPU state. We should never get * here because the FPU state of a task using the FPU (task->thread.fpu.state) * should always be valid. However, past bugs have allowed userspace to set diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd10d91a6115..e7bb483557c9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -829,14 +829,13 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) * used for the kernel image only. free_init_pages() will do the * right thing for either kind of address. */ -void free_kernel_image_pages(void *begin, void *end) +void free_kernel_image_pages(const char *what, void *begin, void *end) { unsigned long begin_ul = (unsigned long)begin; unsigned long end_ul = (unsigned long)end; unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; - - free_init_pages("unused kernel image", begin_ul, end_ul); + free_init_pages(what, begin_ul, end_ul); /* * PTI maps some of the kernel into userspace. For performance, @@ -865,7 +864,8 @@ void __ref free_initmem(void) mem_encrypt_free_decrypted_mem(); - free_kernel_image_pages(&__init_begin, &__init_end); + free_kernel_image_pages("unused kernel image (initmem)", + &__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a6b5c653727b..dcb9bc961b39 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1263,7 +1263,7 @@ int kernel_set_to_readonly; void set_kernel_text_rw(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__stop___ex_table); + unsigned long end = PFN_ALIGN(_etext); if (!kernel_set_to_readonly) return; @@ -1282,7 +1282,7 @@ void set_kernel_text_rw(void) void set_kernel_text_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__stop___ex_table); + unsigned long end = PFN_ALIGN(_etext); if (!kernel_set_to_readonly) return; @@ -1300,9 +1300,9 @@ void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = PFN_ALIGN(__start_rodata); - unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long text_end = PFN_ALIGN(&__stop___ex_table); - unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long text_end = PFN_ALIGN(_etext); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", @@ -1334,8 +1334,10 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_kernel_image_pages((void *)text_end, (void *)rodata_start); - free_kernel_image_pages((void *)rodata_end, (void *)_sdata); + free_kernel_image_pages("unused kernel image (text/rodata gap)", + (void *)text_end, (void *)rodata_start); + free_kernel_image_pages("unused kernel image (rodata/data gap)", + (void *)rodata_end, (void *)_sdata); debug_checkwx(); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a39dcdb5ae34..1ff9c2030b4f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -626,6 +626,17 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, paddr_next = data->next; len = data->len; + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + memunmap(data); + return true; + } + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + paddr = ((struct setup_indirect *)data->data)->addr; + len = ((struct setup_indirect *)data->data)->len; + } + memunmap(data); if ((phys_addr > paddr) && (phys_addr < (paddr + len))) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 79eb55ce69a9..49d7814b59a9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -193,8 +193,8 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", - f->addr, f->count, !!f->old_presence); + pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), @@ -341,8 +341,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * something external causing them (f.e. using a debugger while * mmio tracing enabled), or erroneous behaviour */ - pr_warning("unexpected debug trap on CPU %d.\n", - smp_processor_id()); + pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c new file mode 100644 index 000000000000..f5b85bdc0535 --- /dev/null +++ b/arch/x86/mm/maccess.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/uaccess.h> +#include <linux/kernel.h> + +#ifdef CONFIG_X86_64 +static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); +} + +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + /* + * Range covering the highest possible canonical userspace address + * as well as non-canonical address range. For the canonical range + * we also need to include the userspace guard page. + */ + return vaddr < TASK_SIZE_MAX + PAGE_SIZE || + canonical_address(vaddr, boot_cpu_data.x86_virt_bits) != vaddr; +} +#else +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + return vaddr < TASK_SIZE_MAX; +} +#endif + +long probe_kernel_read_strict(void *dst, const void *src, size_t size) +{ + if (unlikely(invalid_probe_range((unsigned long)src))) + return -EFAULT; + + return __probe_kernel_read(dst, src, size); +} + +long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count) +{ + if (unlikely(invalid_probe_range((unsigned long)unsafe_addr))) + return -EFAULT; + + return __strncpy_from_unsafe(dst, unsafe_addr, count); +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 6d71481a1e70..106ead05bbe3 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -16,7 +16,7 @@ .text .code64 -ENTRY(sme_encrypt_execute) +SYM_FUNC_START(sme_encrypt_execute) /* * Entry parameters: @@ -66,9 +66,9 @@ ENTRY(sme_encrypt_execute) pop %rbp ret -ENDPROC(sme_encrypt_execute) +SYM_FUNC_END(sme_encrypt_execute) -ENTRY(__enc_copy) +SYM_FUNC_START(__enc_copy) /* * Routine used to encrypt memory in place. * This routine must be run outside of the kernel proper since @@ -153,4 +153,4 @@ ENTRY(__enc_copy) ret .L__enc_copy_end: -ENDPROC(__enc_copy) +SYM_FUNC_END(__enc_copy) diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index b8ef8557d4b3..673de6063345 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -394,7 +394,7 @@ static void enter_uniprocessor(void) } out: if (num_online_cpus() > 1) - pr_warning("multiple CPUs still online, may miss events.\n"); + pr_warn("multiple CPUs still online, may miss events.\n"); } static void leave_uniprocessor(void) @@ -418,8 +418,8 @@ static void leave_uniprocessor(void) static void enter_uniprocessor(void) { if (num_online_cpus() > 1) - pr_warning("multiple CPUs are online, may miss events. " - "Suggest booting with maxcpus=1 kernel argument.\n"); + pr_warn("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); } static void leave_uniprocessor(void) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 4123100e0eaf..99f7a68738f0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -699,7 +699,7 @@ static int __init dummy_numa_init(void) * x86_numa_init - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. The - * last fallback is dummy single node config encomapssing whole memory and + * last fallback is dummy single node config encompassing whole memory and * never fails. */ void __init x86_numa_init(void) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index abffa0be80da..7f1d2034df1e 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -438,7 +438,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) goto no_emu; if (numa_cleanup_meminfo(&ei) < 0) { - pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); goto no_emu; } @@ -449,7 +449,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (!phys) { - pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } memblock_reserve(phys, phys_size); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d9fbd4f69920..2d758e19ef22 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -603,7 +603,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, spin_lock(&memtype_lock); - err = rbt_memtype_check_insert(new, new_type); + err = memtype_check_insert(new, new_type); if (err) { pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", start, end - 1, @@ -650,7 +650,7 @@ int free_memtype(u64 start, u64 end) } spin_lock(&memtype_lock); - entry = rbt_memtype_erase(start, end); + entry = memtype_erase(start, end); spin_unlock(&memtype_lock); if (IS_ERR(entry)) { @@ -693,7 +693,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr) spin_lock(&memtype_lock); - entry = rbt_memtype_lookup(paddr); + entry = memtype_lookup(paddr); if (entry != NULL) rettype = entry->type; else @@ -1109,7 +1109,7 @@ static struct memtype *memtype_get_idx(loff_t pos) return NULL; spin_lock(&memtype_lock); - ret = rbt_memtype_copy_nth_element(print_entry, pos); + ret = memtype_copy_nth_element(print_entry, pos); spin_unlock(&memtype_lock); if (!ret) { diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index eeb5caeb089b..79a06684349e 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -29,20 +29,20 @@ static inline char *cattr_name(enum page_cache_mode pcm) } #ifdef CONFIG_X86_PAT -extern int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type); -extern struct memtype *rbt_memtype_erase(u64 start, u64 end); -extern struct memtype *rbt_memtype_lookup(u64 addr); -extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); +extern int memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type); +extern struct memtype *memtype_erase(u64 start, u64 end); +extern struct memtype *memtype_lookup(u64 addr); +extern int memtype_copy_nth_element(struct memtype *out, loff_t pos); #else -static inline int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type) +static inline int memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type) { return 0; } -static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) +static inline struct memtype *memtype_erase(u64 start, u64 end) { return NULL; } -static inline struct memtype *rbt_memtype_lookup(u64 addr) +static inline struct memtype *memtype_lookup(u64 addr) { return NULL; } -static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos) { return 0; } #endif diff --git a/arch/x86/mm/pat_interval.c b/arch/x86/mm/pat_interval.c new file mode 100644 index 000000000000..47a1bf30748f --- /dev/null +++ b/arch/x86/mm/pat_interval.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Interval tree used to store the PAT memory type reservations. + */ + +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched.h> +#include <linux/gfp.h> + +#include <asm/pgtable.h> +#include <asm/pat.h> + +#include "pat_internal.h" + +/* + * The memtype tree keeps track of memory type for specific + * physical memory areas. Without proper tracking, conflicting memory + * types in different mappings can cause CPU cache corruption. + * + * The tree is an interval tree (augmented rbtree) with tree ordered + * on starting address. Tree can contain multiple entries for + * different regions which overlap. All the aliases have the same + * cache attributes of course. + * + * memtype_lock protects the rbtree. + */ +static inline u64 memtype_interval_start(struct memtype *memtype) +{ + return memtype->start; +} + +static inline u64 memtype_interval_end(struct memtype *memtype) +{ + return memtype->end - 1; +} +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end, + memtype_interval_start, memtype_interval_end, + static, memtype_interval) + +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED; + +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_match(u64 start, u64 end, int match_type) +{ + struct memtype *match; + + match = memtype_interval_iter_first(&memtype_rbroot, start, end); + while (match != NULL && match->start < end) { + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) + return match; + + match = memtype_interval_iter_next(match, start, end); + } + + return NULL; /* Returns NULL if there is no match */ +} + +static int memtype_check_conflict(u64 start, u64 end, + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) +{ + struct memtype *match; + enum page_cache_mode found_type = reqtype; + + match = memtype_interval_iter_first(&memtype_rbroot, start, end); + if (match == NULL) + goto success; + + if (match->type != found_type && newtype == NULL) + goto failure; + + dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); + found_type = match->type; + + match = memtype_interval_iter_next(match, start, end); + while (match) { + if (match->type != found_type) + goto failure; + + match = memtype_interval_iter_next(match, start, end); + } +success: + if (newtype) + *newtype = found_type; + + return 0; + +failure: + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); + return -EBUSY; +} + +int memtype_check_insert(struct memtype *new, + enum page_cache_mode *ret_type) +{ + int err = 0; + + err = memtype_check_conflict(new->start, new->end, new->type, ret_type); + if (err) + return err; + + if (ret_type) + new->type = *ret_type; + + memtype_interval_insert(new, &memtype_rbroot); + return 0; +} + +struct memtype *memtype_erase(u64 start, u64 end) +{ + struct memtype *data; + + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + data = memtype_match(start, end, MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_match(start, end, MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + memtype_interval_remove(data, &memtype_rbroot); + } else { + /* mremap: update the end value of this node */ + memtype_interval_remove(data, &memtype_rbroot); + data->end = start; + memtype_interval_insert(data, &memtype_rbroot); + return NULL; + } + + return data; +} + +struct memtype *memtype_lookup(u64 addr) +{ + return memtype_interval_iter_first(&memtype_rbroot, addr, + addr + PAGE_SIZE); +} + +#if defined(CONFIG_DEBUG_FS) +int memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ + struct memtype *match; + int i = 1; + + match = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX); + while (match && pos != i) { + match = memtype_interval_iter_next(match, 0, ULONG_MAX); + i++; + } + + if (match) { /* pos == i */ + *out = *match; + return 0; + } else { + return 1; + } +} +#endif diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c deleted file mode 100644 index 65ebe4b88f7c..000000000000 --- a/arch/x86/mm/pat_rbtree.c +++ /dev/null @@ -1,268 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Handle caching attributes in page tables (PAT) - * - * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * Suresh B Siddha <suresh.b.siddha@intel.com> - * - * Interval tree (augmented rbtree) used to store the PAT memory type - * reservations. - */ - -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/rbtree_augmented.h> -#include <linux/sched.h> -#include <linux/gfp.h> - -#include <asm/pgtable.h> -#include <asm/pat.h> - -#include "pat_internal.h" - -/* - * The memtype tree keeps track of memory type for specific - * physical memory areas. Without proper tracking, conflicting memory - * types in different mappings can cause CPU cache corruption. - * - * The tree is an interval tree (augmented rbtree) with tree ordered - * on starting address. Tree can contain multiple entries for - * different regions which overlap. All the aliases have the same - * cache attributes of course. - * - * memtype_lock protects the rbtree. - */ - -static struct rb_root memtype_rbroot = RB_ROOT; - -static int is_node_overlap(struct memtype *node, u64 start, u64 end) -{ - if (node->start >= end || node->end <= start) - return 0; - - return 1; -} - -static u64 get_subtree_max_end(struct rb_node *node) -{ - u64 ret = 0; - if (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - ret = data->subtree_max_end; - } - return ret; -} - -#define NODE_END(node) ((node)->end) - -RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, - struct memtype, rb, u64, subtree_max_end, NODE_END) - -/* Find the first (lowest start addr) overlapping range from rb tree */ -static struct memtype *memtype_rb_lowest_match(struct rb_root *root, - u64 start, u64 end) -{ - struct rb_node *node = root->rb_node; - struct memtype *last_lower = NULL; - - while (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - - if (get_subtree_max_end(node->rb_left) > start) { - /* Lowest overlap if any must be on left side */ - node = node->rb_left; - } else if (is_node_overlap(data, start, end)) { - last_lower = data; - break; - } else if (start >= data->start) { - /* Lowest overlap if any must be on right side */ - node = node->rb_right; - } else { - break; - } - } - return last_lower; /* Returns NULL if there is no overlap */ -} - -enum { - MEMTYPE_EXACT_MATCH = 0, - MEMTYPE_END_MATCH = 1 -}; - -static struct memtype *memtype_rb_match(struct rb_root *root, - u64 start, u64 end, int match_type) -{ - struct memtype *match; - - match = memtype_rb_lowest_match(root, start, end); - while (match != NULL && match->start < end) { - struct rb_node *node; - - if ((match_type == MEMTYPE_EXACT_MATCH) && - (match->start == start) && (match->end == end)) - return match; - - if ((match_type == MEMTYPE_END_MATCH) && - (match->start < start) && (match->end == end)) - return match; - - node = rb_next(&match->rb); - if (node) - match = rb_entry(node, struct memtype, rb); - else - match = NULL; - } - - return NULL; /* Returns NULL if there is no match */ -} - -static int memtype_rb_check_conflict(struct rb_root *root, - u64 start, u64 end, - enum page_cache_mode reqtype, - enum page_cache_mode *newtype) -{ - struct rb_node *node; - struct memtype *match; - enum page_cache_mode found_type = reqtype; - - match = memtype_rb_lowest_match(&memtype_rbroot, start, end); - if (match == NULL) - goto success; - - if (match->type != found_type && newtype == NULL) - goto failure; - - dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); - found_type = match->type; - - node = rb_next(&match->rb); - while (node) { - match = rb_entry(node, struct memtype, rb); - - if (match->start >= end) /* Checked all possible matches */ - goto success; - - if (is_node_overlap(match, start, end) && - match->type != found_type) { - goto failure; - } - - node = rb_next(&match->rb); - } -success: - if (newtype) - *newtype = found_type; - - return 0; - -failure: - pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, start, end, - cattr_name(found_type), cattr_name(match->type)); - return -EBUSY; -} - -static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) -{ - struct rb_node **node = &(root->rb_node); - struct rb_node *parent = NULL; - - while (*node) { - struct memtype *data = rb_entry(*node, struct memtype, rb); - - parent = *node; - if (data->subtree_max_end < newdata->end) - data->subtree_max_end = newdata->end; - if (newdata->start <= data->start) - node = &((*node)->rb_left); - else if (newdata->start > data->start) - node = &((*node)->rb_right); - } - - newdata->subtree_max_end = newdata->end; - rb_link_node(&newdata->rb, parent, node); - rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); -} - -int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *ret_type) -{ - int err = 0; - - err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end, - new->type, ret_type); - - if (!err) { - if (ret_type) - new->type = *ret_type; - - new->subtree_max_end = new->end; - memtype_rb_insert(&memtype_rbroot, new); - } - return err; -} - -struct memtype *rbt_memtype_erase(u64 start, u64 end) -{ - struct memtype *data; - - /* - * Since the memtype_rbroot tree allows overlapping ranges, - * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free - * a whole node for the munmap case. If no such entry is found, - * it then checks with END_MATCH, i.e. shrink the size of a node - * from the end for the mremap case. - */ - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_EXACT_MATCH); - if (!data) { - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_END_MATCH); - if (!data) - return ERR_PTR(-EINVAL); - } - - if (data->start == start) { - /* munmap: erase this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - } else { - /* mremap: update the end value of this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - data->end = start; - data->subtree_max_end = data->end; - memtype_rb_insert(&memtype_rbroot, data); - return NULL; - } - - return data; -} - -struct memtype *rbt_memtype_lookup(u64 addr) -{ - return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); -} - -#if defined(CONFIG_DEBUG_FS) -int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) -{ - struct rb_node *node; - int i = 1; - - node = rb_first(&memtype_rbroot); - while (node && pos != i) { - node = rb_next(node); - i++; - } - - if (node) { /* pos == i */ - struct memtype *this = rb_entry(node, struct memtype, rb); - *out = *this; - return 0; - } else { - return 1; - } -} -#endif diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3e4b9035bb9a..7bd2c3a52297 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -643,8 +643,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) fixmaps_set++; } -void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, - pgprot_t flags) +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 7f2140414440..44a9f068eee0 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -574,7 +574,7 @@ static void pti_clone_kernel_text(void) */ unsigned long start = PFN_ALIGN(_text); unsigned long end_clone = (unsigned long)__end_rodata_aligned; - unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table); + unsigned long end_global = PFN_ALIGN((unsigned long)_etext); if (!pti_kernel_image_global_ok()) return; diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index a8bd952e136d..92153d054d6c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -127,9 +127,9 @@ static int __init init(void) return -ENXIO; } - pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " - "and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); + pr_warn("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); do_test(size); do_test_bulk_ioremapping(); pr_info("All done.\n"); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 991549a1c5f3..b8be18427277 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -9,9 +9,11 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <linux/bpf.h> - +#include <linux/memory.h> +#include <asm/extable.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> +#include <asm/text-patching.h> static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -96,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) /* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) +#define X86_REG_R9 (MAX_BPF_JIT_REG + 2) /* * The following table maps BPF registers to x86-64 registers. @@ -104,8 +107,8 @@ static int bpf_size_to_x86_bytes(int bpf_size) * register in load/store instructions, it always needs an * extra byte of encoding and is callee saved. * - * Also x86-64 register R9 is unused. x86-64 register R10 is - * used for blinding (if enabled). + * x86-64 register R9 is not used by BPF programs, but can be used by BPF + * trampoline. x86-64 register R10 is used for blinding (if enabled). */ static const int reg2hex[] = { [BPF_REG_0] = 0, /* RAX */ @@ -121,6 +124,20 @@ static const int reg2hex[] = { [BPF_REG_FP] = 5, /* RBP readonly */ [BPF_REG_AX] = 2, /* R10 temp register */ [AUX_REG] = 3, /* R11 temp register */ + [X86_REG_R9] = 1, /* R9 register, 6th function argument */ +}; + +static const int reg2pt_regs[] = { + [BPF_REG_0] = offsetof(struct pt_regs, ax), + [BPF_REG_1] = offsetof(struct pt_regs, di), + [BPF_REG_2] = offsetof(struct pt_regs, si), + [BPF_REG_3] = offsetof(struct pt_regs, dx), + [BPF_REG_4] = offsetof(struct pt_regs, cx), + [BPF_REG_5] = offsetof(struct pt_regs, r8), + [BPF_REG_6] = offsetof(struct pt_regs, bx), + [BPF_REG_7] = offsetof(struct pt_regs, r13), + [BPF_REG_8] = offsetof(struct pt_regs, r14), + [BPF_REG_9] = offsetof(struct pt_regs, r15), }; /* @@ -135,6 +152,7 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_7) | BIT(BPF_REG_8) | BIT(BPF_REG_9) | + BIT(X86_REG_R9) | BIT(BPF_REG_AX)); } @@ -186,7 +204,10 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define PROLOGUE_SIZE 20 +/* Number of bytes emit_patch() needs to generate instructions */ +#define X86_PATCH_SIZE 5 + +#define PROLOGUE_SIZE 25 /* * Emit x86-64 prologue code for BPF program and check its size. @@ -195,8 +216,13 @@ struct jit_context { static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; - int cnt = 0; + int cnt = X86_PATCH_SIZE; + /* BPF trampoline can be made to work without these nops, + * but let's waste 5 bytes for now and optimize later + */ + memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); + prog += cnt; EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ /* sub rsp, rounded_stack_depth */ @@ -213,6 +239,89 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) *pprog = prog; } +static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) +{ + u8 *prog = *pprog; + int cnt = 0; + s64 offset; + + offset = func - (ip + X86_PATCH_SIZE); + if (!is_simm32(offset)) { + pr_err("Target call %p is out of range\n", func); + return -ERANGE; + } + EMIT1_off32(opcode, offset); + *pprog = prog; + return 0; +} + +static int emit_call(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE8); +} + +static int emit_jump(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE9); +} + +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr, + const bool text_live) +{ + const u8 *nop_insn = ideal_nops[NOP_ATOMIC5]; + u8 old_insn[X86_PATCH_SIZE]; + u8 new_insn[X86_PATCH_SIZE]; + u8 *prog; + int ret; + + memcpy(old_insn, nop_insn, X86_PATCH_SIZE); + if (old_addr) { + prog = old_insn; + ret = t == BPF_MOD_CALL ? + emit_call(&prog, old_addr, ip) : + emit_jump(&prog, old_addr, ip); + if (ret) + return ret; + } + + memcpy(new_insn, nop_insn, X86_PATCH_SIZE); + if (new_addr) { + prog = new_insn; + ret = t == BPF_MOD_CALL ? + emit_call(&prog, new_addr, ip) : + emit_jump(&prog, new_addr, ip); + if (ret) + return ret; + } + + ret = -EBUSY; + mutex_lock(&text_mutex); + if (memcmp(ip, old_insn, X86_PATCH_SIZE)) + goto out; + if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { + if (text_live) + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + else + memcpy(ip, new_insn, X86_PATCH_SIZE); + } + ret = 0; +out: + mutex_unlock(&text_mutex); + return ret; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + if (!is_kernel_text((long)ip) && + !is_bpf_text_address((long)ip)) + /* BPF poking in modules is not supported */ + return -EINVAL; + + return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); +} + /* * Generate the following code: * @@ -227,7 +336,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog) { u8 *prog = *pprog; int label1, label2, label3; @@ -294,6 +403,68 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } +static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, + u8 **pprog, int addr, u8 *image) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ + EMIT2(X86_JA, 14); /* ja out */ + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + + poke->ip = image + (addr - X86_PATCH_SIZE); + poke->adj_off = PROLOGUE_SIZE; + + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; + /* out: */ + + *pprog = prog; +} + +static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) +{ + struct bpf_jit_poke_descriptor *poke; + struct bpf_array *array; + struct bpf_prog *target; + int i, ret; + + for (i = 0; i < prog->aux->size_poke_tab; i++) { + poke = &prog->aux->poke_tab[i]; + WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + + array = container_of(poke->tail_call.map, struct bpf_array, map); + mutex_lock(&array->aux->poke_mutex); + target = array->ptrs[poke->tail_call.key]; + if (target) { + /* Plain memcpy is used when image is not live yet + * and still not locked as read-only. Once poke + * location is active (poke->ip_stable), any parallel + * bpf_arch_text_poke() might occur still on the + * read-write image until we finally locked it as + * read-only. Both modifications on the given image + * are under text_mutex to avoid interference. + */ + ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, + (u8 *)target->bpf_func + + poke->adj_off, false); + BUG_ON(ret < 0); + } + WRITE_ONCE(poke->ip_stable, true); + mutex_unlock(&array->aux->poke_mutex); + } +} + static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u32 dst_reg, const u32 imm32) { @@ -377,6 +548,96 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +/* LDX: dst_reg = *(u8*)(src_reg + off) */ +static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + switch (size) { + case BPF_B: + /* Emit 'movzx rax, byte ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); + break; + case BPF_H: + /* Emit 'movzx rax, word ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); + break; + case BPF_W: + /* Emit 'mov eax, dword ptr [rax+0x14]' */ + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); + else + EMIT1(0x8B); + break; + case BPF_DW: + /* Emit 'mov rax, qword ptr [rax+0x14]' */ + EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); + break; + } + /* + * If insn->off == 0 we can save one extra byte, but + * special case of x86 R13 which always needs an offset + * is not worth the hassle + */ + if (is_imm8(off)) + EMIT2(add_2reg(0x40, src_reg, dst_reg), off); + else + EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off); + *pprog = prog; +} + +/* STX: *(u8*)(dst_reg + off) = src_reg */ +static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + switch (size) { + case BPF_B: + /* Emit 'mov byte ptr [rax + off], al' */ + if (is_ereg(dst_reg) || is_ereg(src_reg) || + /* We have to add extra byte for x86 SIL, DIL regs */ + src_reg == BPF_REG_1 || src_reg == BPF_REG_2) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); + else + EMIT1(0x88); + break; + case BPF_H: + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89); + else + EMIT2(0x66, 0x89); + break; + case BPF_W: + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89); + else + EMIT1(0x89); + break; + case BPF_DW: + EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); + break; + } + if (is_imm8(off)) + EMIT2(add_2reg(0x40, dst_reg, src_reg), off); + else + EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off); + *pprog = prog; +} + +static bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr) +{ + u32 reg = x->fixup >> 8; + + /* jump over faulting load and clear dest register */ + *(unsigned long *)((void *)regs + reg) = 0; + regs->ip += x->fixup & 0xff; + return true; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -384,7 +645,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int insn_cnt = bpf_prog->len; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0; + int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; @@ -747,64 +1008,64 @@ st: if (is_imm8(insn->off)) /* STX: *(u8*)(dst_reg + off) = src_reg */ case BPF_STX | BPF_MEM | BPF_B: - /* Emit 'mov byte ptr [rax + off], al' */ - if (is_ereg(dst_reg) || is_ereg(src_reg) || - /* We have to add extra byte for x86 SIL, DIL regs */ - src_reg == BPF_REG_1 || src_reg == BPF_REG_2) - EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); - else - EMIT1(0x88); - goto stx; case BPF_STX | BPF_MEM | BPF_H: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89); - else - EMIT2(0x66, 0x89); - goto stx; case BPF_STX | BPF_MEM | BPF_W: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89); - else - EMIT1(0x89); - goto stx; case BPF_STX | BPF_MEM | BPF_DW: - EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); -stx: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), - insn->off); + emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); break; /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: - /* Emit 'movzx rax, byte ptr [rax + off]' */ - EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_H: - /* Emit 'movzx rax, word ptr [rax + off]' */ - EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_W: - /* Emit 'mov eax, dword ptr [rax+0x14]' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); - else - EMIT1(0x8B); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: - /* Emit 'mov rax, qword ptr [rax+0x14]' */ - EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); -ldx: /* - * If insn->off == 0 we can save one extra byte, but - * special case of x86 R13 which always needs an offset - * is not worth the hassle - */ - if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), - insn->off); + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + struct exception_table_entry *ex; + u8 *_insn = image + proglen; + s64 delta; + + if (!bpf_prog->aux->extable) + break; + + if (excnt >= bpf_prog->aux->num_exentries) { + pr_err("ex gen bug\n"); + return -EFAULT; + } + ex = &bpf_prog->aux->extable[excnt++]; + + delta = _insn - (u8 *)&ex->insn; + if (!is_simm32(delta)) { + pr_err("extable->insn doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->insn = delta; + + delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; + if (!is_simm32(delta)) { + pr_err("extable->handler doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->handler = delta; + + if (dst_reg > BPF_REG_9) { + pr_err("verifier error\n"); + return -EFAULT; + } + /* + * Compute size of x86 insn and its target dest x86 register. + * ex_handler_bpf() will use lower 8 bits to adjust + * pt_regs->ip to jump over this x86 instruction + * and upper bits to figure out which pt_regs to zero out. + * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" + * of 4 bytes will be ignored and rbx will be zero inited. + */ + ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8); + } break; /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ @@ -827,17 +1088,16 @@ xadd: if (is_imm8(insn->off)) /* call */ case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; - jmp_offset = func - (image + addrs[i]); - if (!imm32 || !is_simm32(jmp_offset)) { - pr_err("unsupported BPF func %d addr %p image %p\n", - imm32, func, image); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) return -EINVAL; - } - EMIT1_off32(0xE8, jmp_offset); break; case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + if (imm32) + emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], + &prog, addrs[i], image); + else + emit_bpf_tail_call_indirect(&prog); break; /* cond jump */ @@ -909,6 +1169,16 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: + /* test dst_reg, dst_reg to save one extra byte */ + if (imm32 == 0) { + if (BPF_CLASS(insn->code) == BPF_JMP) + EMIT1(add_2mod(0x48, dst_reg, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); + goto emit_cond_jmp; + } + /* cmp dst_reg, imm8/32 */ if (BPF_CLASS(insn->code) == BPF_JMP) EMIT1(add_1mod(0x48, dst_reg)); @@ -1048,9 +1318,218 @@ emit_jmp: addrs[i] = proglen; prog = temp; } + + if (image && excnt != bpf_prog->aux->num_exentries) { + pr_err("extable is not populated\n"); + return -EFAULT; + } return proglen; } +static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + /* Store function arguments to stack. + * For a function that accepts two pointers the sequence will be: + * mov QWORD PTR [rbp-0x10],rdi + * mov QWORD PTR [rbp-0x8],rsi + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]), + BPF_REG_FP, + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + -(stack_size - i * 8)); +} + +static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + + /* Restore function arguments from stack. + * For a function that accepts two pointers the sequence will be: + * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] + * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]), + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + BPF_REG_FP, + -(stack_size - i * 8)); +} + +static int invoke_bpf(struct btf_func_model *m, u8 **pprog, + struct bpf_prog **progs, int prog_cnt, int stack_size) +{ + u8 *prog = *pprog; + int cnt = 0, i; + + for (i = 0; i < prog_cnt; i++) { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* arg1: lea rdi, [rbp - stack_size] */ + EMIT4(0x48, 0x8D, 0x7D, -stack_size); + /* arg2: progs[i]->insnsi for interpreter */ + if (!progs[i]->jited) + emit_mov_imm64(&prog, BPF_REG_2, + (long) progs[i]->insnsi >> 32, + (u32) (long) progs[i]->insnsi); + /* call JITed bpf program or interpreter */ + if (emit_call(&prog, progs[i]->bpf_func, prog)) + return -EINVAL; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32, + (u32) (long) progs[i]); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +/* Example: + * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); + * its 'struct btf_func_model' will be nr_args=2 + * The assembly code when eth_type_trans is executing after trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 16 // space for skb and dev + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 16], rdi // save skb pointer to stack + * mov qword ptr [rbp - 8], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 16] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack + * pop rbx + * leave + * ret + * + * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be + * replaced with 'call generated_bpf_trampoline'. When it returns + * eth_type_trans will continue executing with original skb and dev pointers. + * + * The assembly code when eth_type_trans is called from trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 24 // space for skb, dev, return value + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 24], rdi // save skb pointer to stack + * mov qword ptr [rbp - 16], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time if bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack + * call eth_type_trans+5 // execute body of eth_type_trans + * mov qword ptr [rbp - 8], rax // save return value + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value + * pop rbx + * leave + * add rsp, 8 // skip eth_type_trans's frame + * ret // return to its caller + */ +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call) +{ + int cnt = 0, nr_args = m->nr_args; + int stack_size = nr_args * 8; + u8 *prog; + + /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ + if (nr_args > 6) + return -ENOTSUPP; + + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) + stack_size += 8; /* room for return value of orig_call */ + + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip patched call instruction and point orig_call to actual + * body of the kernel function. + */ + orig_call += X86_PATCH_SIZE; + + prog = image; + + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ + EMIT1(0x53); /* push rbx */ + + save_regs(m, &prog, nr_args, stack_size); + + if (fentry_cnt) + if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + if (fentry_cnt) + restore_regs(m, &prog, nr_args, stack_size); + + /* call original function */ + if (emit_call(&prog, orig_call, prog)) + return -EINVAL; + /* remember return value in a stack for bpf prog to access */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + } + + if (fexit_cnt) + if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_RESTORE_REGS) + restore_regs(m, &prog, nr_args, stack_size); + + if (flags & BPF_TRAMP_F_CALL_ORIG) + /* restore original return value back into RAX */ + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + + EMIT1(0x5B); /* pop rbx */ + EMIT1(0xC9); /* leave */ + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip our return address and return to parent */ + EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ + EMIT1(0xC3); /* ret */ + /* One half of the page has active running trampoline. + * Another half is an area for next trampoline. + * Make sure the trampoline generation logic doesn't overflow. + */ + if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY)) + return -EFAULT; + return 0; +} + struct x64_jit_data { struct bpf_binary_header *header; int *addrs; @@ -1148,12 +1627,24 @@ out_image: break; } if (proglen == oldproglen) { - header = bpf_jit_binary_alloc(proglen, &image, - 1, jit_fill_hole); + /* + * The number of entries in extable is the number of BPF_LDX + * insns that access kernel memory via "pointer to BTF type". + * The verifier changed their opcode from LDX|MEM|size + * to LDX|PROBE_MEM|size to make JITing easier. + */ + u32 align = __alignof__(struct exception_table_entry); + u32 extable_size = prog->aux->num_exentries * + sizeof(struct exception_table_entry); + + /* allocate module memory for x86 insns and extable */ + header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size, + &image, align, jit_fill_hole); if (!header) { prog = orig_prog; goto out_addrs; } + prog->aux->extable = (void *) image + roundup(proglen, align); } oldproglen = proglen; cond_resched(); @@ -1164,6 +1655,7 @@ out_image: if (image) { if (!prog->is_func || extra_pass) { + bpf_tail_call_direct_fixup(prog); bpf_jit_binary_lock_ro(header); } else { jit_data->addrs = addrs; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 71e8a67337e2..276cf79b5d24 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -67,13 +67,13 @@ static inline void op_x86_warn_in_use(int counter) * cannot be monitored by any other counter, contact your * hardware or BIOS vendor. */ - pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", - counter, smp_processor_id()); + pr_warn("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); } static inline void op_x86_warn_reserved(int counter) { - pr_warning("oprofile: counter #%d is already reserved\n", counter); + pr_warn("oprofile: counter #%d is already reserved\n", counter); } extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 425e025341db..38d44f36d5ed 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -128,6 +128,9 @@ void __init efi_find_mirror(void) efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -145,14 +148,18 @@ void __init efi_find_mirror(void) /* * Tell the kernel about the EFI memory map. This might include - * more than the max 128 entries that can fit in the e820 legacy - * (zeropage) memory map. + * more than the max 128 entries that can fit in the passed in e820 + * legacy (zeropage) memory map, but the kernel's e820 table can hold + * E820_MAX_ENTRIES. */ static void __init do_add_efi_memmap(void) { efi_memory_desc_t *md; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -164,7 +171,10 @@ static void __init do_add_efi_memmap(void) case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: - if (md->attribute & EFI_MEMORY_WB) + if (efi_soft_reserve_enabled() + && (md->attribute & EFI_MEMORY_SP)) + e820_type = E820_TYPE_SOFT_RESERVED; + else if (md->attribute & EFI_MEMORY_WB) e820_type = E820_TYPE_RAM; else e820_type = E820_TYPE_RESERVED; @@ -190,11 +200,36 @@ static void __init do_add_efi_memmap(void) e820_type = E820_TYPE_RESERVED; break; } + e820__range_add(start, size, e820_type); } e820__update_table(e820_table); } +/* + * Given add_efi_memmap defaults to 0 and there there is no alternative + * e820 mechanism for soft-reserved memory, import the full EFI memory + * map if soft reservations are present and enabled. Otherwise, the + * mechanism to disable the kernel's consideration of EFI_MEMORY_SP is + * the efi=nosoftreserve option. + */ +static bool do_efi_soft_reserve(void) +{ + efi_memory_desc_t *md; + + if (!efi_enabled(EFI_MEMMAP)) + return false; + + if (!efi_soft_reserve_enabled()) + return false; + + for_each_efi_memory_desc(md) + if (md->type == EFI_CONVENTIONAL_MEMORY && + (md->attribute & EFI_MEMORY_SP)) + return true; + return false; +} + int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; @@ -224,9 +259,11 @@ int __init efi_memblock_x86_reserve_range(void) if (rv) return rv; - if (add_efi_memmap) + if (add_efi_memmap || do_efi_soft_reserve()) do_add_efi_memmap(); + efi_fake_memmap_early(); + WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", efi.memmap.desc_version); @@ -779,6 +816,15 @@ static bool should_map_region(efi_memory_desc_t *md) return false; /* + * EFI specific purpose memory may be reserved by default + * depending on kernel config and boot options. + */ + if (md->type == EFI_CONVENTIONAL_MEMORY && + efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return false; + + /* * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. */ diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index ab2e91e76894..eed8b5b441f8 100644 --- a/arch/x86/platform/efi/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -22,7 +22,7 @@ */ .text -ENTRY(efi_call_phys) +SYM_FUNC_START(efi_call_phys) /* * 0. The function can only be called in Linux kernel. So CS has been * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found @@ -114,7 +114,7 @@ ENTRY(efi_call_phys) movl (%edx), %ecx pushl %ecx ret -ENDPROC(efi_call_phys) +SYM_FUNC_END(efi_call_phys) .previous .data diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 74628ec78f29..b1d2313fe3bf 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -39,7 +39,7 @@ mov %rsi, %cr0; \ mov (%rsp), %rsp -ENTRY(efi_call) +SYM_FUNC_START(efi_call) pushq %rbp movq %rsp, %rbp SAVE_XMM @@ -55,4 +55,4 @@ ENTRY(efi_call) RESTORE_XMM popq %rbp ret -ENDPROC(efi_call) +SYM_FUNC_END(efi_call) diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 46c58b08739c..3189f1394701 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -25,7 +25,7 @@ .text .code64 -ENTRY(efi64_thunk) +SYM_FUNC_START(efi64_thunk) push %rbp push %rbx @@ -60,14 +60,14 @@ ENTRY(efi64_thunk) pop %rbx pop %rbp retq -ENDPROC(efi64_thunk) +SYM_FUNC_END(efi64_thunk) /* * We run this function from the 1:1 mapping. * * This function must be invoked with a 1:1 mapped stack. */ -ENTRY(__efi64_thunk) +SYM_FUNC_START_LOCAL(__efi64_thunk) movl %ds, %eax push %rax movl %es, %eax @@ -114,14 +114,14 @@ ENTRY(__efi64_thunk) or %rcx, %rax 1: ret -ENDPROC(__efi64_thunk) +SYM_FUNC_END(__efi64_thunk) -ENTRY(efi_exit32) +SYM_FUNC_START_LOCAL(efi_exit32) movq func_rt_ptr(%rip), %rax push %rax mov %rdi, %rax ret -ENDPROC(efi_exit32) +SYM_FUNC_END(efi_exit32) .code32 /* @@ -129,7 +129,7 @@ ENDPROC(efi_exit32) * * The stack should represent the 32-bit calling convention. */ -ENTRY(efi_enter32) +SYM_FUNC_START_LOCAL(efi_enter32) movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %es @@ -145,7 +145,7 @@ ENTRY(efi_enter32) pushl %eax lret -ENDPROC(efi_enter32) +SYM_FUNC_END(efi_enter32) .data .balign 8 diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 3b9fd679cea9..7675cf754d90 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -320,6 +320,9 @@ void __init efi_reserve_boot_services(void) { efi_memory_desc_t *md; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 6d193bb36021..089413cd944e 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -39,7 +39,7 @@ static int set_lid_wake_behavior(bool wake_on_close) status = acpi_execute_simple_method(NULL, "\\_SB.PCI0.LID.LIDW", wake_on_close); if (ACPI_FAILURE(status)) { - pr_warning(PFX "failed to set lid behavior\n"); + pr_warn(PFX "failed to set lid behavior\n"); return 1; } diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S index 5fee3a2c2fd4..75f4faff8468 100644 --- a/arch/x86/platform/olpc/xo1-wakeup.S +++ b/arch/x86/platform/olpc/xo1-wakeup.S @@ -90,7 +90,7 @@ restore_registers: ret -ENTRY(do_olpc_suspend_lowlevel) +SYM_CODE_START(do_olpc_suspend_lowlevel) call save_processor_state call save_registers @@ -110,6 +110,7 @@ ret_point: call restore_registers call restore_processor_state ret +SYM_CODE_END(do_olpc_suspend_lowlevel) .data saved_gdt: .long 0,0 diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 1f8825bbaffb..43b4d864817e 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -50,7 +50,7 @@ #define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8) #define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8) -ENTRY(pvh_start_xen) +SYM_CODE_START_LOCAL(pvh_start_xen) cld lgdt (_pa(gdt)) @@ -146,15 +146,16 @@ ENTRY(pvh_start_xen) ljmp $PVH_CS_SEL, $_pa(startup_32) #endif -END(pvh_start_xen) +SYM_CODE_END(pvh_start_xen) .section ".init.data","aw" .balign 8 -gdt: +SYM_DATA_START_LOCAL(gdt) .word gdt_end - gdt_start .long _pa(gdt_start) .word 0 -gdt_start: +SYM_DATA_END(gdt) +SYM_DATA_START_LOCAL(gdt_start) .quad 0x0000000000000000 /* NULL descriptor */ #ifdef CONFIG_X86_64 .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */ @@ -163,15 +164,14 @@ gdt_start: #endif .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */ .quad GDT_ENTRY(0x4090, 0, 0x18) /* PVH_CANARY_SEL */ -gdt_end: +SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end) .balign 16 -canary: - .fill 48, 1, 0 +SYM_DATA_LOCAL(canary, .fill 48, 1, 0) -early_stack: +SYM_DATA_START_LOCAL(early_stack) .fill BOOT_STACK_SIZE, 1, 0 -early_stack_end: +SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end) ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, _ASM_PTR (pvh_start_xen - __START_KERNEL_map)) diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index bf6016f8db4e..6259563760f9 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -26,8 +26,7 @@ static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; static void __init mp_sfi_register_lapic(u8 id) { if (MAX_LOCAL_APIC - id <= 0) { - pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_LOCAL_APIC); + pr_warn("Processor #%d invalid (max %d)\n", id, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index c2ee31953372..ece9cb9c1189 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -184,20 +184,20 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) } EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); -void uv_bios_init(void) +int uv_bios_init(void) { uv_systab = NULL; if ((uv_systab_phys == EFI_INVALID_TABLE_ADDR) || !uv_systab_phys || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); - return; + return -EEXIST; } uv_systab = ioremap(uv_systab_phys, sizeof(struct uv_systab)); if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { pr_err("UV: UVsystab: bad signature!\n"); iounmap(uv_systab); - return; + return -EINVAL; } /* Starting with UV4 the UV systab size is variable */ @@ -208,8 +208,9 @@ void uv_bios_init(void) uv_systab = ioremap(uv_systab_phys, size); if (!uv_systab) { pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); - return; + return -EFAULT; } } pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); + return 0; } diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 6fe383002125..8786653ad3c0 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -16,7 +16,7 @@ .text -ENTRY(swsusp_arch_suspend) +SYM_FUNC_START(swsusp_arch_suspend) movl %esp, saved_context_esp movl %ebx, saved_context_ebx movl %ebp, saved_context_ebp @@ -33,9 +33,9 @@ ENTRY(swsusp_arch_suspend) call swsusp_save FRAME_END ret -ENDPROC(swsusp_arch_suspend) +SYM_FUNC_END(swsusp_arch_suspend) -ENTRY(restore_image) +SYM_CODE_START(restore_image) /* prepare to jump to the image kernel */ movl restore_jump_address, %ebx movl restore_cr3, %ebp @@ -45,9 +45,10 @@ ENTRY(restore_image) /* jump to relocated restore code */ movl relocated_restore_code, %eax jmpl *%eax +SYM_CODE_END(restore_image) /* code below has been relocated to a safe page */ -ENTRY(core_restore_code) +SYM_CODE_START(core_restore_code) movl temp_pgt, %eax movl %eax, %cr3 @@ -77,10 +78,11 @@ copy_loop: done: jmpl *%ebx +SYM_CODE_END(core_restore_code) /* code below belongs to the image kernel */ .align PAGE_SIZE -ENTRY(restore_registers) +SYM_FUNC_START(restore_registers) /* go back to the original page tables */ movl %ebp, %cr3 movl mmu_cr4_features, %ecx @@ -107,4 +109,4 @@ ENTRY(restore_registers) movl %eax, in_suspend ret -ENDPROC(restore_registers) +SYM_FUNC_END(restore_registers) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index a4d5eb0a7ece..7918b8415f13 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -22,7 +22,7 @@ #include <asm/processor-flags.h> #include <asm/frame.h> -ENTRY(swsusp_arch_suspend) +SYM_FUNC_START(swsusp_arch_suspend) movq $saved_context, %rax movq %rsp, pt_regs_sp(%rax) movq %rbp, pt_regs_bp(%rax) @@ -50,9 +50,9 @@ ENTRY(swsusp_arch_suspend) call swsusp_save FRAME_END ret -ENDPROC(swsusp_arch_suspend) +SYM_FUNC_END(swsusp_arch_suspend) -ENTRY(restore_image) +SYM_CODE_START(restore_image) /* prepare to jump to the image kernel */ movq restore_jump_address(%rip), %r8 movq restore_cr3(%rip), %r9 @@ -67,9 +67,10 @@ ENTRY(restore_image) /* jump to relocated restore code */ movq relocated_restore_code(%rip), %rcx jmpq *%rcx +SYM_CODE_END(restore_image) /* code below has been relocated to a safe page */ -ENTRY(core_restore_code) +SYM_CODE_START(core_restore_code) /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ @@ -97,10 +98,11 @@ ENTRY(core_restore_code) .Ldone: /* jump to the restore_registers address from the image header */ jmpq *%r8 +SYM_CODE_END(core_restore_code) /* code below belongs to the image kernel */ .align PAGE_SIZE -ENTRY(restore_registers) +SYM_FUNC_START(restore_registers) /* go back to the original page tables */ movq %r9, %cr3 @@ -142,4 +144,4 @@ ENTRY(restore_registers) movq %rax, in_suspend(%rip) ret -ENDPROC(restore_registers) +SYM_FUNC_END(restore_registers) diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S index 275a646d1048..0b4390ce586b 100644 --- a/arch/x86/purgatory/entry64.S +++ b/arch/x86/purgatory/entry64.S @@ -8,13 +8,13 @@ * This code has been taken from kexec-tools. */ +#include <linux/linkage.h> + .text .balign 16 .code64 - .globl entry64, entry64_regs - -entry64: +SYM_CODE_START(entry64) /* Setup a gdt that should be preserved */ lgdt gdt(%rip) @@ -54,10 +54,11 @@ new_cs_exit: /* Jump to the new code... */ jmpq *rip(%rip) +SYM_CODE_END(entry64) .section ".rodata" .balign 4 -entry64_regs: +SYM_DATA_START(entry64_regs) rax: .quad 0x0 rcx: .quad 0x0 rdx: .quad 0x0 @@ -75,13 +76,14 @@ r13: .quad 0x0 r14: .quad 0x0 r15: .quad 0x0 rip: .quad 0x0 - .size entry64_regs, . - entry64_regs +SYM_DATA_END(entry64_regs) /* GDT */ .section ".rodata" .balign 16 -gdt: - /* 0x00 unusable segment +SYM_DATA_START_LOCAL(gdt) + /* + * 0x00 unusable segment * 0x08 unused * so use them as gdt ptr */ @@ -94,6 +96,8 @@ gdt: /* 0x18 4GB flat data segment */ .word 0xFFFF, 0x0000, 0x9200, 0x00CF -gdt_end: -stack: .quad 0, 0 -stack_init: +SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) + +SYM_DATA_START_LOCAL(stack) + .quad 0, 0 +SYM_DATA_END_LABEL(stack, SYM_L_LOCAL, stack_init) diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 3b95410ff0f8..2961234d0795 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -14,28 +14,10 @@ #include "../boot/string.h" -unsigned long purgatory_backup_dest __section(.kexec-purgatory); -unsigned long purgatory_backup_src __section(.kexec-purgatory); -unsigned long purgatory_backup_sz __section(.kexec-purgatory); - u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory); struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory); -/* - * On x86, second kernel requries first 640K of memory to boot. Copy - * first 640K to a backup region in reserved memory range so that second - * kernel can use first 640K. - */ -static int copy_backup_region(void) -{ - if (purgatory_backup_dest) { - memcpy((void *)purgatory_backup_dest, - (void *)purgatory_backup_src, purgatory_backup_sz); - } - return 0; -} - static int verify_sha256_digest(void) { struct kexec_sha_region *ptr, *end; @@ -66,7 +48,6 @@ void purgatory(void) for (;;) ; } - copy_backup_region(); } /* diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S index 321146be741d..89d9e9e53fcd 100644 --- a/arch/x86/purgatory/setup-x86_64.S +++ b/arch/x86/purgatory/setup-x86_64.S @@ -7,14 +7,14 @@ * * This code has been taken from kexec-tools. */ +#include <linux/linkage.h> #include <asm/purgatory.h> .text - .globl purgatory_start .balign 16 -purgatory_start: .code64 +SYM_CODE_START(purgatory_start) /* Load a gdt so I know what the segment registers are */ lgdt gdt(%rip) @@ -32,10 +32,12 @@ purgatory_start: /* Call the C code */ call purgatory jmp entry64 +SYM_CODE_END(purgatory_start) .section ".rodata" .balign 16 -gdt: /* 0x00 unusable segment +SYM_DATA_START_LOCAL(gdt) + /* 0x00 unusable segment * 0x08 unused * so use them as the gdt ptr */ @@ -48,10 +50,10 @@ gdt: /* 0x00 unusable segment /* 0x18 4GB flat data segment */ .word 0xFFFF, 0x0000, 0x9200, 0x00CF -gdt_end: +SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) .bss .balign 4096 -lstack: +SYM_DATA_START_LOCAL(lstack) .skip 4096 -lstack_end: +SYM_DATA_END_LABEL(lstack, SYM_L_LOCAL, lstack_end) diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S index 8b1427422dfc..1ef507ca50a5 100644 --- a/arch/x86/purgatory/stack.S +++ b/arch/x86/purgatory/stack.S @@ -5,13 +5,14 @@ * Copyright (C) 2014 Red Hat Inc. */ +#include <linux/linkage.h> + /* A stack for the loaded kernel. * Separate and in the data section so it can be prepopulated. */ .data .balign 4096 - .globl stack, stack_end -stack: +SYM_DATA_START(stack) .skip 4096 -stack_end: +SYM_DATA_END_LABEL(stack, SYM_L_GLOBAL, stack_end) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 7dce39c8c034..262f83cad355 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -8,6 +8,7 @@ #include <asm/pgtable.h> #include <asm/realmode.h> #include <asm/tlbflush.h> +#include <asm/crash.h> struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -34,6 +35,7 @@ void __init reserve_real_mode(void) memblock_reserve(mem, size); set_real_mode_mem(mem); + crash_reserve_low_1M(); } static void __init setup_real_mode(void) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 6363761cc74c..af04512c02d9 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -14,7 +14,7 @@ .section ".header", "a" .balign 16 -GLOBAL(real_mode_header) +SYM_DATA_START(real_mode_header) .long pa_text_start .long pa_ro_end /* SMP trampoline */ @@ -33,11 +33,9 @@ GLOBAL(real_mode_header) #ifdef CONFIG_X86_64 .long __KERNEL32_CS #endif -END(real_mode_header) +SYM_DATA_END(real_mode_header) /* End signature, used to verify integrity */ .section ".signature","a" .balign 4 -GLOBAL(end_signature) - .long REALMODE_END_SIGNATURE -END(end_signature) +SYM_DATA(end_signature, .long REALMODE_END_SIGNATURE) diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 3bb980800c58..64d135d1ee63 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -11,6 +11,7 @@ OUTPUT_FORMAT("elf32-i386") OUTPUT_ARCH(i386) +ENTRY(pa_text_start) SECTIONS { diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index cd2f97b9623b..f10515b10e0a 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -19,7 +19,7 @@ */ .section ".text32", "ax" .code32 -ENTRY(machine_real_restart_asm) +SYM_CODE_START(machine_real_restart_asm) #ifdef CONFIG_X86_64 /* Switch to trampoline GDT as it is guaranteed < 4 GiB */ @@ -33,7 +33,7 @@ ENTRY(machine_real_restart_asm) movl %eax, %cr0 ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off -GLOBAL(machine_real_restart_paging_off) +SYM_INNER_LABEL(machine_real_restart_paging_off, SYM_L_GLOBAL) xorl %eax, %eax xorl %edx, %edx movl $MSR_EFER, %ecx @@ -63,6 +63,7 @@ GLOBAL(machine_real_restart_paging_off) movl %ecx, %gs movl %ecx, %ss ljmpw $8, $1f +SYM_CODE_END(machine_real_restart_asm) /* * This is 16-bit protected mode code to disable paging and the cache, @@ -127,13 +128,13 @@ bios: .section ".rodata", "a" .balign 16 -GLOBAL(machine_real_restart_idt) +SYM_DATA_START(machine_real_restart_idt) .word 0xffff /* Length - real mode default value */ .long 0 /* Base - real mode default value */ -END(machine_real_restart_idt) +SYM_DATA_END(machine_real_restart_idt) .balign 16 -GLOBAL(machine_real_restart_gdt) +SYM_DATA_START(machine_real_restart_gdt) /* Self-pointer */ .word 0xffff /* Length - real mode default value */ .long pa_machine_real_restart_gdt @@ -153,4 +154,4 @@ GLOBAL(machine_real_restart_gdt) * semantics we don't have to reload the segments once CR0.PE = 0. */ .quad GDT_ENTRY(0x0093, 0x100, 0xffff) -END(machine_real_restart_gdt) +SYM_DATA_END(machine_real_restart_gdt) diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S index 8d4cb64799ea..0fca64061ad2 100644 --- a/arch/x86/realmode/rm/stack.S +++ b/arch/x86/realmode/rm/stack.S @@ -6,15 +6,13 @@ #include <linux/linkage.h> .data -GLOBAL(HEAP) - .long rm_heap -GLOBAL(heap_end) - .long rm_stack +SYM_DATA(HEAP, .long rm_heap) +SYM_DATA(heap_end, .long rm_stack) .bss .balign 16 -GLOBAL(rm_heap) - .space 2048 -GLOBAL(rm_stack) +SYM_DATA(rm_heap, .space 2048) + +SYM_DATA_START(rm_stack) .space 2048 -GLOBAL(rm_stack_end) +SYM_DATA_END_LABEL(rm_stack, SYM_L_GLOBAL, rm_stack_end) diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 1868b158480d..3fad907a179f 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -29,7 +29,7 @@ .code16 .balign PAGE_SIZE -ENTRY(trampoline_start) +SYM_CODE_START(trampoline_start) wbinvd # Needed for NUMA-Q should be harmless for others LJMPW_RM(1f) @@ -54,18 +54,20 @@ ENTRY(trampoline_start) lmsw %dx # into protected mode ljmpl $__BOOT_CS, $pa_startup_32 +SYM_CODE_END(trampoline_start) .section ".text32","ax" .code32 -ENTRY(startup_32) # note: also used from wakeup_asm.S +SYM_CODE_START(startup_32) # note: also used from wakeup_asm.S jmp *%eax +SYM_CODE_END(startup_32) .bss .balign 8 -GLOBAL(trampoline_header) - tr_start: .space 4 - tr_gdt_pad: .space 2 - tr_gdt: .space 6 -END(trampoline_header) +SYM_DATA_START(trampoline_header) + SYM_DATA_LOCAL(tr_start, .space 4) + SYM_DATA_LOCAL(tr_gdt_pad, .space 2) + SYM_DATA_LOCAL(tr_gdt, .space 6) +SYM_DATA_END(trampoline_header) #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index aee2b45d83b8..251758ed7443 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -38,7 +38,7 @@ .code16 .balign PAGE_SIZE -ENTRY(trampoline_start) +SYM_CODE_START(trampoline_start) cli # We should be safe anyway wbinvd @@ -78,12 +78,14 @@ ENTRY(trampoline_start) no_longmode: hlt jmp no_longmode +SYM_CODE_END(trampoline_start) + #include "../kernel/verify_cpu.S" .section ".text32","ax" .code32 .balign 4 -ENTRY(startup_32) +SYM_CODE_START(startup_32) movl %edx, %ss addl $pa_real_mode_base, %esp movl %edx, %ds @@ -137,38 +139,39 @@ ENTRY(startup_32) * the new gdt/idt that has __KERNEL_CS with CS.L = 1. */ ljmpl $__KERNEL_CS, $pa_startup_64 +SYM_CODE_END(startup_32) .section ".text64","ax" .code64 .balign 4 -ENTRY(startup_64) +SYM_CODE_START(startup_64) # Now jump into the kernel using virtual addresses jmpq *tr_start(%rip) +SYM_CODE_END(startup_64) .section ".rodata","a" # Duplicate the global descriptor table # so the kernel can live anywhere .balign 16 - .globl tr_gdt -tr_gdt: +SYM_DATA_START(tr_gdt) .short tr_gdt_end - tr_gdt - 1 # gdt limit .long pa_tr_gdt .short 0 .quad 0x00cf9b000000ffff # __KERNEL32_CS .quad 0x00af9b000000ffff # __KERNEL_CS .quad 0x00cf93000000ffff # __KERNEL_DS -tr_gdt_end: +SYM_DATA_END_LABEL(tr_gdt, SYM_L_LOCAL, tr_gdt_end) .bss .balign PAGE_SIZE -GLOBAL(trampoline_pgd) .space PAGE_SIZE +SYM_DATA(trampoline_pgd, .space PAGE_SIZE) .balign 8 -GLOBAL(trampoline_header) - tr_start: .space 8 - GLOBAL(tr_efer) .space 8 - GLOBAL(tr_cr4) .space 4 - GLOBAL(tr_flags) .space 4 -END(trampoline_header) +SYM_DATA_START(trampoline_header) + SYM_DATA_LOCAL(tr_start, .space 8) + SYM_DATA(tr_efer, .space 8) + SYM_DATA(tr_cr4, .space 4) + SYM_DATA(tr_flags, .space 4) +SYM_DATA_END(trampoline_header) #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index 8d8208dcca24..5033e640f957 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,4 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ .section ".rodata","a" .balign 16 -tr_idt: .fill 1, 6, 0 +SYM_DATA_LOCAL(tr_idt, .fill 1, 6, 0) diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 05ac9c17c811..02d0ba16ae33 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -17,7 +17,7 @@ .section ".data", "aw" .balign 16 -GLOBAL(wakeup_header) +SYM_DATA_START(wakeup_header) video_mode: .short 0 /* Video mode number */ pmode_entry: .long 0 pmode_cs: .short __KERNEL_CS @@ -31,13 +31,13 @@ GLOBAL(wakeup_header) realmode_flags: .long 0 real_magic: .long 0 signature: .long WAKEUP_HEADER_SIGNATURE -END(wakeup_header) +SYM_DATA_END(wakeup_header) .text .code16 .balign 16 -ENTRY(wakeup_start) +SYM_CODE_START(wakeup_start) cli cld @@ -73,7 +73,7 @@ ENTRY(wakeup_start) movw %ax, %fs movw %ax, %gs - lidtl wakeup_idt + lidtl .Lwakeup_idt /* Clear the EFLAGS */ pushl $0 @@ -135,6 +135,7 @@ ENTRY(wakeup_start) #else jmp trampoline_start #endif +SYM_CODE_END(wakeup_start) bogus_real_magic: 1: @@ -152,7 +153,7 @@ bogus_real_magic: */ .balign 16 -GLOBAL(wakeup_gdt) +SYM_DATA_START(wakeup_gdt) .word 3*8-1 /* Self-descriptor */ .long pa_wakeup_gdt .word 0 @@ -164,15 +165,15 @@ GLOBAL(wakeup_gdt) .word 0xffff /* 16-bit data segment @ real_mode_base */ .long 0x93000000 + pa_real_mode_base .word 0x008f /* big real mode */ -END(wakeup_gdt) +SYM_DATA_END(wakeup_gdt) .section ".rodata","a" .balign 8 /* This is the standard real-mode IDT */ .balign 16 -GLOBAL(wakeup_idt) +SYM_DATA_START_LOCAL(.Lwakeup_idt) .word 0xffff /* limit */ .long 0 /* address */ .word 0 -END(wakeup_idt) +SYM_DATA_END(.Lwakeup_idt) diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S index c078dba40cef..c8fef76743f6 100644 --- a/arch/x86/realmode/rmpiggy.S +++ b/arch/x86/realmode/rmpiggy.S @@ -10,12 +10,10 @@ .balign PAGE_SIZE -GLOBAL(real_mode_blob) +SYM_DATA_START(real_mode_blob) .incbin "arch/x86/realmode/rm/realmode.bin" -END(real_mode_blob) +SYM_DATA_END_LABEL(real_mode_blob, SYM_L_GLOBAL, real_mode_blob_end) -GLOBAL(real_mode_blob_end); - -GLOBAL(real_mode_relocs) +SYM_DATA_START(real_mode_relocs) .incbin "arch/x86/realmode/rm/realmode.relocs" -END(real_mode_relocs) +SYM_DATA_END(real_mode_relocs) diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index b02a36b2c14f..a42015b305f4 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -69,7 +69,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" + lprefix3_expr = "\\((F2|!F3|66&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -257,7 +257,7 @@ function convert_operands(count,opnd, i,j,imm,mod) return add_flags(imm, mod) } -/^[0-9a-f]+\:/ { +/^[0-9a-f]+:/ { if (NR == 1) next # get index diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S index a4a3870dc059..a6eaf293a73b 100644 --- a/arch/x86/um/vdso/vdso.S +++ b/arch/x86/um/vdso/vdso.S @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> +#include <linux/linkage.h> __INITDATA - .globl vdso_start, vdso_end -vdso_start: +SYM_DATA_START(vdso_start) .incbin "arch/x86/um/vdso/vdso.so" -vdso_end: +SYM_DATA_END_LABEL(vdso_start, SYM_L_GLOBAL, vdso_end) __FINIT diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5bfea374a160..ae4a41ca19f6 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -837,15 +837,6 @@ static void xen_load_sp0(unsigned long sp0) this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } -void xen_set_iopl_mask(unsigned mask) -{ - struct physdev_set_iopl set_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; - HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - static void xen_io_delay(void) { } @@ -1055,7 +1046,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_idt_entry = xen_write_idt_entry, .load_sp0 = xen_load_sp0, - .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, /* Xen takes care of %gs when switching to usermode for us */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 548d1e0a5ba1..33b0e20df7fc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -412,7 +412,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { - pr_warning("Unable to find available pfn range, not remapping identity pages\n"); + pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, cur_pfn + left, nr_pages); break; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index be104eef80be..508fe204520b 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -19,7 +19,7 @@ * event status with one and operation. If there are pending events, * then enter the hypervisor to get them handled. */ -ENTRY(xen_irq_enable_direct) +SYM_FUNC_START(xen_irq_enable_direct) FRAME_BEGIN /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask @@ -38,17 +38,17 @@ ENTRY(xen_irq_enable_direct) 1: FRAME_END ret - ENDPROC(xen_irq_enable_direct) +SYM_FUNC_END(xen_irq_enable_direct) /* * Disabling events is simply a matter of making the event mask * non-zero. */ -ENTRY(xen_irq_disable_direct) +SYM_FUNC_START(xen_irq_disable_direct) movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask ret -ENDPROC(xen_irq_disable_direct) +SYM_FUNC_END(xen_irq_disable_direct) /* * (xen_)save_fl is used to get the current interrupt enable status. @@ -59,12 +59,12 @@ ENDPROC(xen_irq_disable_direct) * undefined. We need to toggle the state of the bit, because Xen and * x86 use opposite senses (mask vs enable). */ -ENTRY(xen_save_fl_direct) +SYM_FUNC_START(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah ret - ENDPROC(xen_save_fl_direct) +SYM_FUNC_END(xen_save_fl_direct) /* @@ -74,7 +74,7 @@ ENTRY(xen_save_fl_direct) * interrupt mask state, it checks for unmasked pending events and * enters the hypervisor to get them delivered if so. */ -ENTRY(xen_restore_fl_direct) +SYM_FUNC_START(xen_restore_fl_direct) FRAME_BEGIN #ifdef CONFIG_X86_64 testw $X86_EFLAGS_IF, %di @@ -95,14 +95,14 @@ ENTRY(xen_restore_fl_direct) 1: FRAME_END ret - ENDPROC(xen_restore_fl_direct) +SYM_FUNC_END(xen_restore_fl_direct) /* * Force an event check by making a hypercall, but preserve regs * before making the call. */ -ENTRY(check_events) +SYM_FUNC_START(check_events) FRAME_BEGIN #ifdef CONFIG_X86_32 push %eax @@ -135,19 +135,19 @@ ENTRY(check_events) #endif FRAME_END ret -ENDPROC(check_events) +SYM_FUNC_END(check_events) -ENTRY(xen_read_cr2) +SYM_FUNC_START(xen_read_cr2) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX _ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX FRAME_END ret - ENDPROC(xen_read_cr2); +SYM_FUNC_END(xen_read_cr2); -ENTRY(xen_read_cr2_direct) +SYM_FUNC_START(xen_read_cr2_direct) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX FRAME_END ret - ENDPROC(xen_read_cr2_direct); +SYM_FUNC_END(xen_read_cr2_direct); diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index c15db060a242..2712e9155306 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -56,7 +56,7 @@ _ASM_EXTABLE(1b,2b) .endm -ENTRY(xen_iret) +SYM_CODE_START(xen_iret) /* test eflags for special cases */ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) jnz hyper_iret @@ -122,14 +122,14 @@ xen_iret_end_crit: hyper_iret: /* put this out of line since its very rarely used */ jmp hypercall_page + __HYPERVISOR_iret * 32 +SYM_CODE_END(xen_iret) .globl xen_iret_start_crit, xen_iret_end_crit /* - * This is called by xen_hypervisor_callback in entry.S when it sees + * This is called by xen_hypervisor_callback in entry_32.S when it sees * that the EIP at the time of interrupt was between - * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in - * %eax so we can do a more refined determination of what to do. + * xen_iret_start_crit and xen_iret_end_crit. * * The stack format at this point is: * ---------------- @@ -138,70 +138,46 @@ hyper_iret: * eflags } outer exception info * cs } * eip } - * ---------------- <- edi (copy dest) - * eax : outer eax if it hasn't been restored * ---------------- - * eflags } nested exception info - * cs } (no ss/esp because we're nested - * eip } from the same ring) - * orig_eax }<- esi (copy src) - * - - - - - - - - - * fs } - * es } - * ds } SAVE_ALL state - * eax } - * : : - * ebx }<- esp + * eax : outer eax if it hasn't been restored * ---------------- + * eflags } + * cs } nested exception info + * eip } + * return address : (into xen_hypervisor_callback) * - * In order to deliver the nested exception properly, we need to shift - * everything from the return addr up to the error code so it sits - * just under the outer exception info. This means that when we - * handle the exception, we do it in the context of the outer - * exception rather than starting a new one. + * In order to deliver the nested exception properly, we need to discard the + * nested exception frame such that when we handle the exception, we do it + * in the context of the outer exception rather than starting a new one. * - * The only caveat is that if the outer eax hasn't been restored yet - * (ie, it's still on stack), we need to insert its value into the - * SAVE_ALL state before going on, since it's usermode state which we - * eventually need to restore. + * The only caveat is that if the outer eax hasn't been restored yet (i.e. + * it's still on stack), we need to restore its value here. */ -ENTRY(xen_iret_crit_fixup) +SYM_CODE_START(xen_iret_crit_fixup) /* * Paranoia: Make sure we're really coming from kernel space. * One could imagine a case where userspace jumps into the * critical range address, but just before the CPU delivers a - * GP, it decides to deliver an interrupt instead. Unlikely? - * Definitely. Easy to avoid? Yes. The Intel documents - * explicitly say that the reported EIP for a bad jump is the - * jump instruction itself, not the destination, but some - * virtual environments get this wrong. + * PF, it decides to deliver an interrupt instead. Unlikely? + * Definitely. Easy to avoid? Yes. */ - movl PT_CS(%esp), %ecx - andl $SEGMENT_RPL_MASK, %ecx - cmpl $USER_RPL, %ecx - je 2f - - lea PT_ORIG_EAX(%esp), %esi - lea PT_EFLAGS(%esp), %edi + testb $2, 2*4(%esp) /* nested CS */ + jnz 2f /* * If eip is before iret_restore_end then stack * hasn't been restored yet. */ - cmp $iret_restore_end, %eax + cmpl $iret_restore_end, 1*4(%esp) jae 1f - movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */ - movl %eax, PT_EAX(%esp) + movl 4*4(%esp), %eax /* load outer EAX */ + ret $4*4 /* discard nested EIP, CS, and EFLAGS as + * well as the just restored EAX */ - lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */ - - /* set up the copy */ -1: std - mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ - rep movsl - cld - - lea 4(%edi), %esp /* point esp to new frame */ -2: jmp xen_do_upcall +1: + ret $3*4 /* discard nested EIP, CS, and EFLAGS */ +2: + ret +SYM_CODE_END(xen_iret_crit_fixup) diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index ebf610b49c06..0a0fd168683a 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -20,11 +20,11 @@ #include <linux/linkage.h> .macro xen_pv_trap name -ENTRY(xen_\name) +SYM_CODE_START(xen_\name) pop %rcx pop %r11 jmp \name -END(xen_\name) +SYM_CODE_END(xen_\name) _ASM_NOKPROBE(xen_\name) .endm @@ -57,7 +57,7 @@ xen_pv_trap entry_INT80_compat xen_pv_trap hypervisor_callback __INIT -ENTRY(xen_early_idt_handler_array) +SYM_CODE_START(xen_early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS pop %rcx @@ -66,7 +66,7 @@ ENTRY(xen_early_idt_handler_array) i = i + 1 .fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -END(xen_early_idt_handler_array) +SYM_CODE_END(xen_early_idt_handler_array) __FINIT hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 @@ -85,11 +85,12 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * r11 }<-- pushed by hypercall page * rsp->rax } */ -ENTRY(xen_iret) +SYM_CODE_START(xen_iret) pushq $0 jmp hypercall_iret +SYM_CODE_END(xen_iret) -ENTRY(xen_sysret64) +SYM_CODE_START(xen_sysret64) /* * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back. @@ -107,6 +108,7 @@ ENTRY(xen_sysret64) pushq $VGCF_in_syscall jmp hypercall_iret +SYM_CODE_END(xen_sysret64) /* * Xen handles syscall callbacks much like ordinary exceptions, which @@ -124,7 +126,7 @@ ENTRY(xen_sysret64) */ /* Normal 64-bit system call target */ -ENTRY(xen_syscall_target) +SYM_FUNC_START(xen_syscall_target) popq %rcx popq %r11 @@ -137,12 +139,12 @@ ENTRY(xen_syscall_target) movq $__USER_CS, 1*8(%rsp) jmp entry_SYSCALL_64_after_hwframe -ENDPROC(xen_syscall_target) +SYM_FUNC_END(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ -ENTRY(xen_syscall32_target) +SYM_FUNC_START(xen_syscall32_target) popq %rcx popq %r11 @@ -155,25 +157,25 @@ ENTRY(xen_syscall32_target) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSCALL_compat_after_hwframe -ENDPROC(xen_syscall32_target) +SYM_FUNC_END(xen_syscall32_target) /* 32-bit compat sysenter target */ -ENTRY(xen_sysenter_target) +SYM_FUNC_START(xen_sysenter_target) mov 0*8(%rsp), %rcx mov 1*8(%rsp), %r11 mov 5*8(%rsp), %rsp jmp entry_SYSENTER_compat -ENDPROC(xen_sysenter_target) +SYM_FUNC_END(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ -ENTRY(xen_syscall32_target) -ENTRY(xen_sysenter_target) +SYM_FUNC_START_ALIAS(xen_syscall32_target) +SYM_FUNC_START(xen_sysenter_target) lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax pushq $0 jmp hypercall_iret -ENDPROC(xen_syscall32_target) -ENDPROC(xen_sysenter_target) +SYM_FUNC_END(xen_sysenter_target) +SYM_FUNC_END_ALIAS(xen_syscall32_target) #endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index c1d8b90aa4e2..1d0cee3163e4 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -22,7 +22,7 @@ #ifdef CONFIG_XEN_PV __INIT -ENTRY(startup_xen) +SYM_CODE_START(startup_xen) UNWIND_HINT_EMPTY cld @@ -52,13 +52,13 @@ ENTRY(startup_xen) #endif jmp xen_start_kernel -END(startup_xen) +SYM_CODE_END(startup_xen) __FINIT #endif .pushsection .text .balign PAGE_SIZE -ENTRY(hypercall_page) +SYM_CODE_START(hypercall_page) .rept (PAGE_SIZE / 32) UNWIND_HINT_EMPTY .skip 32 @@ -69,7 +69,7 @@ ENTRY(hypercall_page) .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 #include <asm/xen-hypercalls.h> #undef HYPERCALL -END(hypercall_page) +SYM_CODE_END(hypercall_page) .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index 943f10639a93..0043d5858f14 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -14,6 +14,8 @@ * Joe Taylor <joe@tensilica.com, joetylr@yahoo.com> */ +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> #include <asm/page.h> #include <asm/thread_info.h> @@ -124,18 +126,16 @@ SECTIONS . = ALIGN(16); - RODATA + RO_DATA(4096) /* Relocation table */ .fixup : { *(.fixup) } - EXCEPTION_TABLE(16) - NOTES /* Data section */ _sdata = .; - RW_DATA_SECTION(XCHAL_ICACHE_LINESIZE, PAGE_SIZE, THREAD_SIZE) + RW_DATA(XCHAL_ICACHE_LINESIZE, PAGE_SIZE, THREAD_SIZE) _edata = .; /* Initialization code and data: */ |