diff options
Diffstat (limited to 'arch/x86')
45 files changed, 3221 insertions, 187 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d3b9186e4c23..940e50ebfafa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 def_bool y select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_MIGHT_HAVE_PC_PARPORT + select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select ARCH_SUPPORTS_NUMA_BALANCING @@ -279,13 +280,13 @@ config SMP bool "Symmetric multi-processing support" ---help--- This enables support for systems with more than one CPU. If you have - a system with only one CPU, like most personal computers, say N. If - you have a system with more than one CPU, say Y. + a system with only one CPU, say N. If you have a system with more + than one CPU, say Y. - If you say N here, the kernel will run on single and multiprocessor + If you say N here, the kernel will run on uni- and multiprocessor machines, but will use only one CPU of a multiprocessor machine. If you say Y here, the kernel will run on many, but not all, - singleprocessor machines. On a singleprocessor machine, the kernel + uniprocessor machines. On a uniprocessor machine, the kernel will run faster if you say N here. Note that if you say Y here and choose architecture "586" or @@ -731,6 +732,7 @@ config APB_TIMER # The code disables itself when not needed. config DMI default y + select DMI_SCAN_MACHINE_NON_EFI_FALLBACK bool "Enable DMI scanning" if EXPERT ---help--- Enabled scanning of DMI to identify machine quirks. Say Y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 13b22e0f681d..eeda43abed6e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -11,6 +11,28 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +# How to compile the 16-bit code. Note we always compile for -march=i386; +# that way we can complain to the user if the CPU is insufficient. +# +# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For +# older versions of GCC, we need to play evil and unreliable tricks to +# attempt to ensure that our asm(".code16gcc") is first in the asm +# output. +CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \ + $(call cc-option, -fno-toplevel-reorder,\ + $(call cc-option, -fno-unit-at-a-time)) +M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS)) + +REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ + -DDISABLE_BRANCH_PROFILING \ + -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ + -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ + -mno-mmx -mno-sse \ + $(call cc-option, -ffreestanding) \ + $(call cc-option, -fno-stack-protector) \ + $(call cc-option, -mpreferred-stack-boundary=2) +export REALMODE_CFLAGS + # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles. # e.g.: obj-y += foo_$(BITS).o diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index de7066918005..878df7e88cd4 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -51,20 +51,7 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE # --------------------------------------------------------------------------- -# How to compile the 16-bit code. Note we always compile for -march=i386, -# that way we can complain to the user if the CPU is insufficient. -KBUILD_CFLAGS := $(USERINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \ - -DDISABLE_BRANCH_PROFILING \ - -Wall -Wstrict-prototypes \ - -march=i386 -mregparm=3 \ - -include $(srctree)/$(src)/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) +KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index a9fcb7cfb241..431fa5f84537 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -28,20 +28,35 @@ static int has_fpu(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } +/* + * For building the 16-bit code we want to explicitly specify 32-bit + * push/pop operations, rather than just saying 'pushf' or 'popf' and + * letting the compiler choose. But this is also included from the + * compressed/ directory where it may be 64-bit code, and thus needs + * to be 'pushfq' or 'popfq' in that case. + */ +#ifdef __x86_64__ +#define PUSHF "pushfq" +#define POPF "popfq" +#else +#define PUSHF "pushfl" +#define POPF "popfl" +#endif + int has_eflag(unsigned long mask) { unsigned long f0, f1; - asm volatile("pushf \n\t" - "pushf \n\t" + asm volatile(PUSHF " \n\t" + PUSHF " \n\t" "pop %0 \n\t" "mov %0,%1 \n\t" "xor %2,%1 \n\t" "push %1 \n\t" - "popf \n\t" - "pushf \n\t" + POPF " \n\t" + PUSHF " \n\t" "pop %1 \n\t" - "popf" + POPF : "=&r" (f0), "=&r" (f1) : "ri" (mask)); diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index ff339c5db311..0bb25491262d 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -80,7 +80,7 @@ struct card_info { u16 xmode_n; /* Size of unprobed mode range */ }; -#define __videocard struct card_info __attribute__((section(".videocards"))) +#define __videocard struct card_info __attribute__((used,section(".videocards"))) extern struct card_info video_cards[], video_cards_end[]; int mode_defined(u16 mode); /* video.c */ diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index e0fc24db234a..6ba54d640383 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -76,6 +76,7 @@ ifeq ($(avx2_supported),yes) endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o +aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o crc32c-intel-y := crc32c-intel_glue.o diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S new file mode 100644 index 000000000000..522ab68d1c88 --- /dev/null +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -0,0 +1,2811 @@ +######################################################################## +# Copyright (c) 2013, Intel Corporation +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR +# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +## +## Authors: +## Erdinc Ozturk <erdinc.ozturk@intel.com> +## Vinodh Gopal <vinodh.gopal@intel.com> +## James Guilford <james.guilford@intel.com> +## Tim Chen <tim.c.chen@linux.intel.com> +## +## References: +## This code was derived and highly optimized from the code described in paper: +## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation +## on Intel Architecture Processors. August, 2010 +## The details of the implementation is explained in: +## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode +## on Intel Architecture Processors. October, 2012. +## +## Assumptions: +## +## +## +## iv: +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | Salt (From the SA) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | Initialization Vector | +## | (This is the sequence number from IPSec header) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x1 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## +## +## AAD: +## AAD padded to 128 bits with 0 +## for example, assume AAD is a u32 vector +## +## if AAD is 8 bytes: +## AAD[3] = {A0, A1}# +## padded AAD in xmm register = {A1 A0 0 0} +## +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | SPI (A1) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 32-bit Sequence Number (A0) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x0 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## AAD Format with 32-bit Sequence Number +## +## if AAD is 12 bytes: +## AAD[3] = {A0, A1, A2}# +## padded AAD in xmm register = {A2 A1 A0 0} +## +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | SPI (A2) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 64-bit Extended Sequence Number {A1,A0} | +## | | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x0 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## AAD Format with 64-bit Extended Sequence Number +## +## +## aadLen: +## from the definition of the spec, aadLen can only be 8 or 12 bytes. +## The code additionally supports aadLen of length 16 bytes. +## +## TLen: +## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +## +## poly = x^128 + x^127 + x^126 + x^121 + 1 +## throughout the code, one tab and two tab indentations are used. one tab is +## for GHASH part, two tabs is for AES part. +## + +#include <linux/linkage.h> +#include <asm/inst.h> + +.data +.align 16 + +POLY: .octa 0xC2000000000000000000000000000001 +POLY2: .octa 0xC20000000000000000000001C2000000 +TWOONE: .octa 0x00000001000000000000000000000001 + +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F + +SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff +ZERO: .octa 0x00000000000000000000000000000000 +ONE: .octa 0x00000000000000000000000000000001 +ONEf: .octa 0x01000000000000000000000000000000 + +.text + + +##define the fields of the gcm aes context +#{ +# u8 expanded_keys[16*11] store expanded keys +# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here +# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here +# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here +# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here +# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here +# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here +# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here +# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here +# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) +#} gcm_ctx# + +HashKey = 16*11 # store HashKey <<1 mod poly here +HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here +HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here +HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here +HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here +HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here +HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here +HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here +HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) +HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx +#define arg4 %rcx +#define arg5 %r8 +#define arg6 %r9 +#define arg7 STACK_OFFSET+8*1(%r14) +#define arg8 STACK_OFFSET+8*2(%r14) +#define arg9 STACK_OFFSET+8*3(%r14) + +i = 0 +j = 0 + +out_order = 0 +in_order = 1 +DEC = 0 +ENC = 1 + +.macro define_reg r n +reg_\r = %xmm\n +.endm + +.macro setreg +.altmacro +define_reg i %i +define_reg j %j +.noaltmacro +.endm + +# need to push 4 registers into stack to maintain +STACK_OFFSET = 8*4 + +TMP1 = 16*0 # Temporary storage for AAD +TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) +TMP3 = 16*2 # Temporary storage for AES State 3 +TMP4 = 16*3 # Temporary storage for AES State 4 +TMP5 = 16*4 # Temporary storage for AES State 5 +TMP6 = 16*5 # Temporary storage for AES State 6 +TMP7 = 16*6 # Temporary storage for AES State 7 +TMP8 = 16*7 # Temporary storage for AES State 8 + +VARIABLE_OFFSET = 16*8 + +################################ +# Utility Macros +################################ + +# Encryption of a single block +.macro ENCRYPT_SINGLE_BLOCK XMM0 + vpxor (arg1), \XMM0, \XMM0 + i = 1 + setreg +.rep 9 + vaesenc 16*i(arg1), \XMM0, \XMM0 + i = (i+1) + setreg +.endr + vaesenclast 16*10(arg1), \XMM0, \XMM0 +.endm + +#ifdef CONFIG_AS_AVX +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 + + vpshufd $0b01001110, \GH, \T2 + vpshufd $0b01001110, \HK, \T3 + vpxor \GH , \T2, \T2 # T2 = (a1+a0) + vpxor \HK , \T3, \T3 # T3 = (b1+b0) + + vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 + vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 + vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) + vpxor \GH, \T2,\T2 + vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 + + vpslldq $8, \T2,\T3 # shift-L T3 2 DWs + vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs + vpxor \T3, \GH, \GH + vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK + + #first phase of the reduction + vpslld $31, \GH, \T2 # packed right shifting << 31 + vpslld $30, \GH, \T3 # packed right shifting shift << 30 + vpslld $25, \GH, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T5 # shift-R T5 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \GH, \GH # first phase of the reduction complete + + #second phase of the reduction + + vpsrld $1,\GH, \T2 # packed left shifting >> 1 + vpsrld $2,\GH, \T3 # packed left shifting >> 2 + vpsrld $7,\GH, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T5, \T2, \T2 + vpxor \T2, \GH, \GH + vpxor \T1, \GH, \GH # the result is in GH + + +.endm + +.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 + + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 + + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_2_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_3_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_4_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_5_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_6_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_7_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_8_k(arg1) + +.endm + +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified + +.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC + i = (8-\num_initial_blocks) + setreg + + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_i, reg_i, reg_i +_get_AAD_loop\@: + vmovd (%r10), \T1 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i + + add $4, %r10 + sub $4, %r12 + jg _get_AAD_loop\@ + + + cmp $16, %r11 + je _get_AAD_loop2_done\@ + mov $16, %r12 + +_get_AAD_loop2\@: + vpsrldq $4, reg_i, reg_i + sub $4, %r12 + cmp %r11, %r12 + jg _get_AAD_loop2\@ + +_get_AAD_loop2_done\@: + + #byte-reflect the AAD data + vpshufb SHUF_MASK(%rip), reg_i, reg_i + + # initialize the data pointer offset as zero + xor %r11, %r11 + + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR + + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr + + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = 1 + setreg +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = (j+1) + setreg +.endr + + + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr + + + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here + + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 + + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants + +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 + + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr + + + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 + + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif + + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif + + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif + + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif + + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif + + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif + + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif + + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif + + add $128, %r11 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + +############################################################################### + +_initial_blocks_done\@: + +.endm + +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) + +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif + + + ####################################################################### + + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 + + ####################################################################### + + + + + + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + ####################################################################### + + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + + vpshufd $0b01001110, \T2, \T6 + vpxor \T2, \T6, \T6 + + vmovdqa HashKey_8_k(arg1), \T5 + vpclmulqdq $0x00, \T5, \T6, \T6 + + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_7_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + ####################################################################### + + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_6_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_5_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_4_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_3_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_2_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + ####################################################################### + + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vpxor \T4, \T6, \T6 + vpxor \T7, \T6, \T6 + + vmovdqu 16*10(arg1), \T5 + + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### + + + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 + + + + ####################################################################### + #first phase of the reduction + ####################################################################### + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif + + ####################################################################### + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + ####################################################################### + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + + vpxor \T6, \XMM1, \XMM1 + + + +.endm + + +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + + ## Karatsuba Method + + + vpshufd $0b01001110, \XMM1, \T2 + vpxor \XMM1, \T2, \T2 + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + + vmovdqa HashKey_8_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM2, \T2 + vpxor \XMM2, \T2, \T2 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_7_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM3, \T2 + vpxor \XMM3, \T2, \T2 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_6_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM4, \T2 + vpxor \XMM4, \T2, \T2 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_5_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM5, \T2 + vpxor \XMM5, \T2, \T2 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_4_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM6, \T2 + vpxor \XMM6, \T2, \T2 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_3_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM7, \T2 + vpxor \XMM7, \T2, \T2 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_2_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM8, \T2 + vpxor \XMM8, \T2, \T2 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 + + + + + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of + # the accumulated carry-less multiplications + + ####################################################################### + #first phase of the reduction + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + + + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + +.endm + + +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC_AVX ENC_DEC + + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ + + jmp _initial_num_blocks_is_1\@ + +_initial_num_blocks_is_7\@: + INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_6\@: + INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_5\@: + INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_4\@: + INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_3\@: + INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_2\@: + INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_1\@: + INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_0\@: + INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + + +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ + + sub $128, %r13 + je _eight_cipher_left\@ + + + + + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ + + + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ + +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + + + +_eight_cipher_left\@: + GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ + + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ + +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ + + vmovdqu TMP1(%rsp), %xmm1 + + sub $16, %r11 + +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif + + + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ + + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# + +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 + + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + + vpxor %xmm15, %xmm14, %xmm14 + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + + vpxor %xmm14, %xmm9, %xmm9 + + + +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len + + cmp $16, %r11 + je _T_16\@ + + cmp $12, %r11 + je _T_12\@ + +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done\@ +_T_12\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + vpsrldq $8, %xmm9, %xmm9 + vmovd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done\@ + +_T_16\@: + vmovdqu %xmm9, (%r10) + +_return_T_done\@: + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + + +############################################################# +#void aesni_gcm_precomp_avx_gen2 +# (gcm_data *my_ctx_data, +# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +############################################################# +ENTRY(aesni_gcm_precomp_avx_gen2) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + vmovdqu (arg2), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + + + PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_precomp_avx_gen2) + +############################################################################### +#void aesni_gcm_enc_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +# const u8 *in, /* Plaintext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_enc_avx_gen2) + GCM_ENC_DEC_AVX ENC + ret +ENDPROC(aesni_gcm_enc_avx_gen2) + +############################################################################### +#void aesni_gcm_dec_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +# const u8 *in, /* Ciphertext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_dec_avx_gen2) + GCM_ENC_DEC_AVX DEC + ret +ENDPROC(aesni_gcm_dec_avx_gen2) +#endif /* CONFIG_AS_AVX */ + +#ifdef CONFIG_AS_AVX2 +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 + + vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 + vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 + vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 + vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 + vpxor \T3, \GH, \GH + + + vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs + vpslldq $8 , \GH, \GH # shift-L GH 2 DWs + + vpxor \T3, \T1, \T1 + vpxor \T2, \GH, \GH + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \GH, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L T2 2 DWs + + vpxor \T2, \GH, \GH # first phase of the reduction complete + ####################################################################### + #second phase of the reduction + vpclmulqdq $0x00, \GH, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \GH, \T3, \GH + vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \GH, \GH # second phase of the reduction complete + ####################################################################### + vpxor \T1, \GH, \GH # the result is in GH + + +.endm + +.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 + + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + +.endm + + +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified + +.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER + i = (8-\num_initial_blocks) + setreg + + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_i, reg_i, reg_i +_get_AAD_loop\@: + vmovd (%r10), \T1 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i + + add $4, %r10 + sub $4, %r12 + jg _get_AAD_loop\@ + + + cmp $16, %r11 + je _get_AAD_loop2_done\@ + mov $16, %r12 + +_get_AAD_loop2\@: + vpsrldq $4, reg_i, reg_i + sub $4, %r12 + cmp %r11, %r12 + jg _get_AAD_loop2\@ + +_get_AAD_loop2_done\@: + + #byte-reflect the AAD data + vpshufb SHUF_MASK(%rip), reg_i, reg_i + + # initialize the data pointer offset as zero + xor %r11, %r11 + + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR + + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr + + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = 1 + setreg +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = (j+1) + setreg +.endr + + + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for + # num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr + + + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg + GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here + + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 + + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants + +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 + + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr + + + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 + + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif + + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif + + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif + + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif + + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif + + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif + + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif + + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif + + add $128, %r11 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with + # the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + +############################################################################### + +_initial_blocks_done\@: + + +.endm + + + +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) + +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif + + + ####################################################################### + + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 + + ####################################################################### + + + + + + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + ####################################################################### + + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 + vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 + vpxor \T5, \T6, \T6 + + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + ####################################################################### + + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + + ####################################################################### + + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T1 + + + vmovdqu 16*10(arg1), \T5 + + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### + + + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 + + + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif + + ####################################################################### + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T1, \T1 # the result is in T1 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + + vpxor \T1, \XMM1, \XMM1 + + + +.endm + + +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + + ## Karatsuba Method + + vmovdqa HashKey_8(arg1), \T5 + + vpshufd $0b01001110, \XMM1, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM1, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + + vpclmulqdq $0x00, \T3, \T2, \XMM1 + + ###################### + + vmovdqa HashKey_7(arg1), \T5 + vpshufd $0b01001110, \XMM2, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM2, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_6(arg1), \T5 + vpshufd $0b01001110, \XMM3, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM3, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_5(arg1), \T5 + vpshufd $0b01001110, \XMM4, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM4, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_4(arg1), \T5 + vpshufd $0b01001110, \XMM5, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM5, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_3(arg1), \T5 + vpshufd $0b01001110, \XMM6, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM6, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_2(arg1), \T5 + vpshufd $0b01001110, \XMM7, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM7, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey(arg1), \T5 + vpshufd $0b01001110, \XMM8, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM8, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 + + + + + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the + # accumulated carry-less multiplications + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + + + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T6, \T6 # the result is in T6 +.endm + + + +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC_AVX2 ENC_DEC + + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ + + jmp _initial_num_blocks_is_1\@ + +_initial_num_blocks_is_7\@: + INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_6\@: + INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_5\@: + INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_4\@: + INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_3\@: + INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_2\@: + INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_1\@: + INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_0\@: + INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + + +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ + + sub $128, %r13 + je _eight_cipher_left\@ + + + + + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ + + + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ + +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + + + +_eight_cipher_left\@: + GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ + + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer + # to be able to shift 16-r13 bytes + # (r13 is the number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ + +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ + + vmovdqu TMP1(%rsp), %xmm1 + + sub $16, %r11 + +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif + + + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ + + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# + +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 + + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + + vpxor %xmm15, %xmm14, %xmm14 + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + + vpxor %xmm14, %xmm9, %xmm9 + + + +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len + + cmp $16, %r11 + je _T_16\@ + + cmp $12, %r11 + je _T_12\@ + +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done\@ +_T_12\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + vpsrldq $8, %xmm9, %xmm9 + vmovd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done\@ + +_T_16\@: + vmovdqu %xmm9, (%r10) + +_return_T_done\@: + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + + +############################################################# +#void aesni_gcm_precomp_avx_gen4 +# (gcm_data *my_ctx_data, +# u8 *hash_subkey)# /* H, the Hash sub key input. +# Data starts on a 16-byte boundary. */ +############################################################# +ENTRY(aesni_gcm_precomp_avx_gen4) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + vmovdqu (arg2), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + + + PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_precomp_avx_gen4) + + +############################################################################### +#void aesni_gcm_enc_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +# const u8 *in, /* Plaintext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_enc_avx_gen4) + GCM_ENC_DEC_AVX2 ENC + ret +ENDPROC(aesni_gcm_enc_avx_gen4) + +############################################################################### +#void aesni_gcm_dec_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +# const u8 *in, /* Ciphertext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_dec_avx_gen4) + GCM_ENC_DEC_AVX2 DEC + ret +ENDPROC(aesni_gcm_dec_avx_gen4) + +#endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 835488b745ee..948ad0e77741 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, int crypto_fpu_init(void); void crypto_fpu_exit(void); +#define AVX_GEN2_OPTSIZE 640 +#define AVX_GEN4_OPTSIZE 4096 + #ifdef CONFIG_X86_64 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); @@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); + +#ifdef CONFIG_AS_AVX +/* + * asmlinkage void aesni_gcm_precomp_avx_gen2() + * gcm_data *my_ctx_data, context data + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + */ +asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); + +asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void aesni_gcm_enc_avx(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (plaintext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} + +static void aesni_gcm_dec_avx(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (ciphertext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} +#endif + +#ifdef CONFIG_AS_AVX2 +/* + * asmlinkage void aesni_gcm_precomp_avx_gen4() + * gcm_data *my_ctx_data, context data + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + */ +asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); + +asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void aesni_gcm_enc_avx2(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (plaintext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else if (plaintext_len < AVX_GEN4_OPTSIZE) { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); + aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} + +static void aesni_gcm_dec_avx2(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (ciphertext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, + aad, aad_len, auth_tag, auth_tag_len); + } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); + aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} +#endif + +static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { @@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) dst = src; } - aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, + aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst + ((unsigned long)req->cryptlen), auth_tag_len); @@ -996,12 +1116,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) dst = src; } - aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, + aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, authTag, auth_tag_len); /* Compare generated tag with passed in tag. */ - retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? + retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? -EBADMSG : 0; if (one_entry_in_sg) { @@ -1353,6 +1473,27 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; +#ifdef CONFIG_X86_64 +#ifdef CONFIG_AS_AVX2 + if (boot_cpu_has(X86_FEATURE_AVX2)) { + pr_info("AVX2 version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc_avx2; + aesni_gcm_dec_tfm = aesni_gcm_dec_avx2; + } else +#endif +#ifdef CONFIG_AS_AVX + if (boot_cpu_has(X86_FEATURE_AVX)) { + pr_info("AVX version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc_avx; + aesni_gcm_dec_tfm = aesni_gcm_dec_avx; + } else +#endif + { + pr_info("SSE version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc; + aesni_gcm_dec_tfm = aesni_gcm_dec; + } +#endif err = crypto_fpu_init(); if (err) diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index fd8f9e2ca35f..535192f6bfad 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -13,7 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_ioremap early_ioremap -#define dmi_iounmap early_iounmap +#define dmi_early_remap early_ioremap +#define dmi_early_unmap early_iounmap +#define dmi_remap ioremap +#define dmi_unmap iounmap #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index e846225265ed..7252cd339175 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -175,64 +175,7 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) - -/* - * Some hardware wants to get fixmapped without caching. - */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) - -#define clear_fixmap(idx) \ - __set_fixmap(idx, 0, __pgprot(0)) - -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) - -extern void __this_fixmap_does_not_exist(void); - -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without translation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -static __always_inline unsigned long fix_to_virt(const unsigned int idx) -{ - /* - * this branch gets completely eliminated after inlining, - * except when someone tries to use fixaddr indices in an - * illegal way. (such as mixing up address types or using - * out-of-range indices). - * - * If it doesn't get removed, the linker will complain - * loudly with a reasonably clear error message.. - */ - if (idx >= __end_of_fixed_addresses) - __this_fixmap_does_not_exist(); - - return __fix_to_virt(idx); -} - -static inline unsigned long virt_to_fix(const unsigned long vaddr) -{ - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); - return __virt_to_fix(vaddr); -} - -/* Return an pointer with offset calculated */ -static __always_inline unsigned long -__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) -{ - __set_fixmap(idx, phys, flags); - return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); -} - -#define set_fixmap_offset(idx, phys) \ - __set_fixmap_offset(idx, phys, PAGE_KERNEL) - -#define set_fixmap_offset_nocache(idx, phys) \ - __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) +#include <asm-generic/fixmap.h> #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h new file mode 100644 index 000000000000..e8c58f88b1d4 --- /dev/null +++ b/arch/x86/include/asm/hash.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_HASH_H +#define _ASM_X86_HASH_H + +struct fast_hash_ops; +extern void setup_arch_fast_hash(struct fast_hash_ops *ops); + +#endif /* _ASM_X86_HASH_H */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 2f59cce3b38a..f97fbe3abb67 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,9 +51,9 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -static inline phys_addr_t get_max_low_mapped(void) +static inline phys_addr_t get_max_mapped(void) { - return (phys_addr_t)max_low_pfn_mapped << PAGE_SHIFT; + return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 401f350ef71b..cd6e1610e29e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -781,9 +781,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, */ #define PV_CALLEE_SAVE_REGS_THUNK(func) \ extern typeof(func) __raw_callee_save_##func; \ - static void *__##func##__ __used = func; \ \ asm(".pushsection .text;" \ + ".globl __raw_callee_save_" #func " ; " \ "__raw_callee_save_" #func ": " \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index aab8f671b523..7549b8b369e4 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -388,10 +388,11 @@ extern struct pv_lock_ops pv_lock_ops; _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") /* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[] __visible, \ - end_##ops##_##name[] __visible; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") +#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" + +#define DEF_NATIVE(ops, name, code) \ + __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name)) unsigned paravirt_patch_nop(void); unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a83aa44bb1fb..1aa9ccd43223 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -121,7 +121,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ + _PAGE_SOFT_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 3ba3de457d05..e1940c06ed02 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -163,9 +163,11 @@ struct thread_info { */ #ifndef __ASSEMBLY__ - -/* how to get the current stack pointer from C */ -register unsigned long current_stack_pointer asm("esp") __used; +#define current_stack_pointer ({ \ + unsigned long sp; \ + asm("mov %%esp,%0" : "=g" (sp)); \ + sp; \ +}) /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h index ee50c801f7b7..cc2d6a3aeae7 100644 --- a/arch/x86/include/uapi/asm/sembuf.h +++ b/arch/x86/include/uapi/asm/sembuf.h @@ -13,12 +13,12 @@ struct semid64_ds { struct ipc64_perm sem_perm; /* permissions .. see ipc.h */ __kernel_time_t sem_otime; /* last semop time */ - unsigned long __unused1; + __kernel_ulong_t __unused1; __kernel_time_t sem_ctime; /* last change time */ - unsigned long __unused2; - unsigned long sem_nsems; /* no. of semaphores in array */ - unsigned long __unused3; - unsigned long __unused4; + __kernel_ulong_t __unused2; + __kernel_ulong_t sem_nsems; /* no. of semaphores in array */ + __kernel_ulong_t __unused3; + __kernel_ulong_t __unused4; }; #endif /* _ASM_X86_SEMBUF_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d359d0fffa50..1dac94265b59 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -46,7 +46,6 @@ #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; -u32 acpi_rsdt_forced; int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); @@ -1562,7 +1561,7 @@ static int __init parse_acpi(char *arg) } /* acpi=rsdt use RSDT instead of XSDT */ else if (strcmp(arg, "rsdt") == 0) { - acpi_rsdt_forced = 1; + acpi_gbl_do_not_use_xsdt = TRUE; } /* "acpi=noirq" disables ACPI interrupt routing */ else if (strcmp(arg, "noirq") == 0) { diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5d5b9eb2b7a4..2c621a6b901a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -20,9 +20,7 @@ #include <asm/apic.h> #include <asm/ipi.h> -#ifdef CONFIG_ACPI -#include <acpi/acpi_bus.h> -#endif +#include <linux/acpi.h> static struct apic apic_physflat; static struct apic apic_flat; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a43f068ebec1..6ad4658de705 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -37,9 +37,6 @@ #include <linux/kthread.h> #include <linux/jiffies.h> /* time_after() */ #include <linux/slab.h> -#ifdef CONFIG_ACPI -#include <acpi/acpi_bus.h> -#endif #include <linux/bootmem.h> #include <linux/dmar.h> #include <linux/hpet.h> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f81cadebcc0e..713f1b3bad52 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -705,7 +705,7 @@ static cpumask_t waiting_cpus; /* Track spinlock on which a cpu is waiting */ static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); -static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { struct kvm_lock_waiting *w; int cpu; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c9675594d7ca..06853e670354 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - memblock_set_current_limit(get_max_low_mapped()); + memblock_set_current_limit(get_max_mapped()); dma_contiguous_reserve(0); /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a3acbac2ee72..19e5adb49a27 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -180,7 +180,7 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) static void cyc2ns_data_init(struct cyc2ns_data *data) { - data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; + data->cyc2ns_mul = 0; data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; data->cyc2ns_offset = 0; data->__count = 0; diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 992f890283e9..f6584a90aba3 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -33,7 +33,7 @@ * and vice versa. */ -static unsigned long vsmp_save_fl(void) +asmlinkage unsigned long vsmp_save_fl(void) { unsigned long flags = native_save_fl(); @@ -43,7 +43,7 @@ static unsigned long vsmp_save_fl(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl); -static void vsmp_restore_fl(unsigned long flags) +__visible void vsmp_restore_fl(unsigned long flags) { if (flags & X86_EFLAGS_IF) flags &= ~X86_EFLAGS_AC; @@ -53,7 +53,7 @@ static void vsmp_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); -static void vsmp_irq_disable(void) +asmlinkage void vsmp_irq_disable(void) { unsigned long flags = native_save_fl(); @@ -61,7 +61,7 @@ static void vsmp_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); -static void vsmp_irq_enable(void) +asmlinkage void vsmp_irq_enable(void) { unsigned long flags = native_save_fl(); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index bdf8532494fe..ad1fb5f53925 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -233,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next) * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ -static unsigned long save_fl(void) +asmlinkage unsigned long lguest_save_fl(void) { return lguest_data.irq_enabled; } /* Interrupts go off... */ -static void irq_disable(void) +asmlinkage void lguest_irq_disable(void) { lguest_data.irq_enabled = 0; } @@ -253,8 +253,8 @@ static void irq_disable(void) * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the * C function, then restores it. */ -PV_CALLEE_SAVE_REGS_THUNK(save_fl); -PV_CALLEE_SAVE_REGS_THUNK(irq_disable); +PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); +PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); /*:*/ /* These are in i386_head.S */ @@ -1291,9 +1291,9 @@ __init void lguest_init(void) */ /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); + pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); + pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 992d63bb154f..eabcb6e6a900 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -24,7 +24,7 @@ lib-$(CONFIG_SMP) += rwlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o -obj-y += msr.o msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o hash.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c new file mode 100644 index 000000000000..3056702e81fb --- /dev/null +++ b/arch/x86/lib/hash.c @@ -0,0 +1,88 @@ +/* + * Some portions derived from code covered by the following notice: + * + * Copyright (c) 2010-2013 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/hash.h> + +#include <asm/processor.h> +#include <asm/cpufeature.h> +#include <asm/hash.h> + +static inline u32 crc32_u32(u32 crc, u32 val) +{ + asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val)); + return crc; +} + +static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed) +{ + const u32 *p32 = (const u32 *) data; + u32 i, tmp = 0; + + for (i = 0; i < len / 4; i++) + seed = crc32_u32(*p32++, seed); + + switch (3 - (len & 0x03)) { + case 0: + tmp |= *((const u8 *) p32 + 2) << 16; + /* fallthrough */ + case 1: + tmp |= *((const u8 *) p32 + 1) << 8; + /* fallthrough */ + case 2: + tmp |= *((const u8 *) p32); + seed = crc32_u32(tmp, seed); + default: + break; + } + + return seed; +} + +static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed) +{ + const u32 *p32 = (const u32 *) data; + u32 i; + + for (i = 0; i < len; i++) + seed = crc32_u32(*p32++, seed); + + return seed; +} + +void setup_arch_fast_hash(struct fast_hash_ops *ops) +{ + if (cpu_has_xmm4_2) { + ops->hash = intel_crc4_2_hash; + ops->hash2 = intel_crc4_2_hash2; + } +} diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 59d353d2c599..a5449089cd9f 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -330,11 +330,6 @@ asmlinkage void FPU_exception(int n) RE_ENTRANT_CHECK_OFF; if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) { -#ifdef PRINT_MESSAGES - /* My message from the sponsor */ - printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n"); -#endif /* PRINT_MESSAGES */ - /* Get a name string for error reporting */ for (i = 0; exception_names[i].type; i++) if ((exception_names[i].type & n) == diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 0596e8e0cc19..207d9aef662d 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -108,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, static inline void get_head_page_multiple(struct page *page, int nr) { - VM_BUG_ON(page != compound_head(page)); - VM_BUG_ON(page_count(page) == 0); + VM_BUG_ON_PAGE(page != compound_head(page), page); + VM_BUG_ON_PAGE(page_count(page) == 0, page); atomic_add(nr, &page->_count); SetPageReferenced(page); } @@ -135,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, head = pte_page(pte); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON(compound_head(page) != head); + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; if (PageTail(page)) get_huge_page_tail(page); @@ -212,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, head = pte_page(pte); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON(compound_head(page) != head); + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; if (PageTail(page)) get_huge_page_tail(page); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 082e88129712..248642f4bab7 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -12,7 +12,6 @@ #include <linux/pci.h> #include <linux/init.h> -#include <linux/acpi.h> #include <linux/sfi_acpi.h> #include <linux/bitmap.h> #include <linux/dmi.h> diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 5c90975cdf0f..43984bc1665a 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -14,7 +14,6 @@ #include <linux/rcupdate.h> #include <asm/e820.h> #include <asm/pci_x86.h> -#include <acpi/acpi.h> /* Assume systems with more busses have correct MCFG */ #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h index 8f568dd79605..79bb09d4f718 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h +++ b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h @@ -12,6 +12,7 @@ #ifndef _PLATFORM_IPC_H_ #define _PLATFORM_IPC_H_ -extern void __init ipc_device_handler(struct sfi_device_table_entry *pentry, - struct devs_id *dev) __attribute__((weak)); +void __init +ipc_device_handler(struct sfi_device_table_entry *pentry, struct devs_id *dev); + #endif diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h index 917eb56d77da..b7be1d041da2 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic.h +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.h @@ -14,6 +14,6 @@ extern struct intel_msic_platform_data msic_pdata; -extern void *msic_generic_platform_data(void *info, - enum intel_msic_block block) __attribute__((weak)); +void *msic_generic_platform_data(void *info, enum intel_msic_block block); + #endif diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h index a537ffc16299..46aa25c8ce06 100644 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h @@ -14,6 +14,6 @@ /* For every CPU addition a new get_<cpuname>_ops interface needs * to be added. */ -extern void * __cpuinit get_penwell_ops(void) __attribute__((weak)); -extern void * __cpuinit get_cloverview_ops(void) __attribute__((weak)); -extern void * __init get_tangier_ops(void) __attribute__((weak)); +extern void *get_penwell_ops(void) __attribute__((weak)); +extern void *get_cloverview_ops(void) __attribute__((weak)); +extern void *get_tangier_ops(void) __attribute__((weak)); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 4f7884eebc14..23381d2174ae 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -58,18 +58,18 @@ static unsigned long __init mfld_calibrate_tsc(void) return 0; } -static void __init penwell_arch_setup() +static void __init penwell_arch_setup(void) { x86_platform.calibrate_tsc = mfld_calibrate_tsc; pm_power_off = mfld_power_off; } -void * __cpuinit get_penwell_ops() +void *get_penwell_ops(void) { return &penwell_ops; } -void * __cpuinit get_cloverview_ops() +void *get_cloverview_ops(void) { return &penwell_ops; } diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c index 09d10159e7b7..aaca91753d32 100644 --- a/arch/x86/platform/intel-mid/mrfl.c +++ b/arch/x86/platform/intel-mid/mrfl.c @@ -97,7 +97,7 @@ static struct intel_mid_ops tangier_ops = { .arch_setup = tangier_arch_setup, }; -void * __cpuinit get_tangier_ops() +void *get_tangier_ops(void) { return &tangier_ops; } diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 649a12befba9..08e350e757dc 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -15,8 +15,7 @@ #include <linux/power_supply.h> #include <linux/olpc-ec.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> +#include <linux/acpi.h> #include <asm/olpc.h> #define DRV_NAME "olpc-xo15-sci" diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 9cac82588cbc..3497f14e4dea 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -64,20 +64,7 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE # --------------------------------------------------------------------------- -# How to compile the 16-bit code. Note we always compile for -march=i386, -# that way we can complain to the user if the CPU is insufficient. -KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \ - -I$(srctree)/arch/x86/boot \ - -DDISABLE_BRANCH_PROFILING \ - -Wall -Wstrict-prototypes \ - -march=i386 -mregparm=3 \ - -include $(srctree)/$(src)/../../boot/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) +KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ + -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 11f9285a2ff6..cfbdbdb4e173 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -1025,6 +1025,29 @@ static void emit_relocs(int as_text, int use_real_mode) } } +/* + * As an aid to debugging problems with different linkers + * print summary information about the relocs. + * Since different linkers tend to emit the sections in + * different orders we use the section names in the output. + */ +static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, + const char *symname) +{ + printf("%s\t%s\t%s\t%s\n", + sec_name(sec->shdr.sh_info), + rel_type(ELF_R_TYPE(rel->r_info)), + symname, + sec_name(sym->st_shndx)); + return 0; +} + +static void print_reloc_info(void) +{ + printf("reloc section\treloc type\tsymbol\tsymbol section\n"); + walk_relocs(do_reloc_info); +} + #if ELF_BITS == 64 # define process process_64 #else @@ -1032,7 +1055,8 @@ static void emit_relocs(int as_text, int use_real_mode) #endif void process(FILE *fp, int use_real_mode, int as_text, - int show_absolute_syms, int show_absolute_relocs) + int show_absolute_syms, int show_absolute_relocs, + int show_reloc_info) { regex_init(use_real_mode); read_ehdr(fp); @@ -1050,5 +1074,9 @@ void process(FILE *fp, int use_real_mode, int as_text, print_absolute_relocs(); return; } + if (show_reloc_info) { + print_reloc_info(); + return; + } emit_relocs(as_text, use_real_mode); } diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h index 07cdb1eca4fa..f59590645b68 100644 --- a/arch/x86/tools/relocs.h +++ b/arch/x86/tools/relocs.h @@ -29,8 +29,9 @@ enum symtype { }; void process_32(FILE *fp, int use_real_mode, int as_text, - int show_absolute_syms, int show_absolute_relocs); + int show_absolute_syms, int show_absolute_relocs, + int show_reloc_info); void process_64(FILE *fp, int use_real_mode, int as_text, - int show_absolute_syms, int show_absolute_relocs); - + int show_absolute_syms, int show_absolute_relocs, + int show_reloc_info); #endif /* RELOCS_H */ diff --git a/arch/x86/tools/relocs_common.c b/arch/x86/tools/relocs_common.c index 44d396823a53..acab636bcb34 100644 --- a/arch/x86/tools/relocs_common.c +++ b/arch/x86/tools/relocs_common.c @@ -11,12 +11,13 @@ void die(char *fmt, ...) static void usage(void) { - die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n"); + die("relocs [--abs-syms|--abs-relocs|--reloc-info|--text|--realmode]" \ + " vmlinux\n"); } int main(int argc, char **argv) { - int show_absolute_syms, show_absolute_relocs; + int show_absolute_syms, show_absolute_relocs, show_reloc_info; int as_text, use_real_mode; const char *fname; FILE *fp; @@ -25,6 +26,7 @@ int main(int argc, char **argv) show_absolute_syms = 0; show_absolute_relocs = 0; + show_reloc_info = 0; as_text = 0; use_real_mode = 0; fname = NULL; @@ -39,6 +41,10 @@ int main(int argc, char **argv) show_absolute_relocs = 1; continue; } + if (strcmp(arg, "--reloc-info") == 0) { + show_reloc_info = 1; + continue; + } if (strcmp(arg, "--text") == 0) { as_text = 1; continue; @@ -67,10 +73,12 @@ int main(int argc, char **argv) rewind(fp); if (e_ident[EI_CLASS] == ELFCLASS64) process_64(fp, use_real_mode, as_text, - show_absolute_syms, show_absolute_relocs); + show_absolute_syms, show_absolute_relocs, + show_reloc_info); else process_32(fp, use_real_mode, as_text, - show_absolute_syms, show_absolute_relocs); + show_absolute_syms, show_absolute_relocs, + show_reloc_info); fclose(fp); return 0; } diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 76ca326105f7..08f763de26fe 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -23,7 +23,7 @@ void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -static unsigned long xen_save_fl(void) +asmlinkage unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -41,7 +41,7 @@ static unsigned long xen_save_fl(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); -static void xen_restore_fl(unsigned long flags) +__visible void xen_restore_fl(unsigned long flags) { struct vcpu_info *vcpu; @@ -63,7 +63,7 @@ static void xen_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); -static void xen_irq_disable(void) +asmlinkage void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -74,7 +74,7 @@ static void xen_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -static void xen_irq_enable(void) +asmlinkage void xen_irq_enable(void) { struct vcpu_info *vcpu; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c1d406f35523..2423ef04ffea 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -431,7 +431,7 @@ static pteval_t iomap_pte(pteval_t val) return val; } -static pteval_t xen_pte_val(pte_t pte) +__visible pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; #if 0 @@ -448,7 +448,7 @@ static pteval_t xen_pte_val(pte_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); -static pgdval_t xen_pgd_val(pgd_t pgd) +__visible pgdval_t xen_pgd_val(pgd_t pgd) { return pte_mfn_to_pfn(pgd.pgd); } @@ -479,7 +479,7 @@ void xen_set_pat(u64 pat) WARN_ON(pat != 0x0007010600070106ull); } -static pte_t xen_make_pte(pteval_t pte) +__visible pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); #if 0 @@ -514,14 +514,14 @@ static pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); -static pgd_t xen_make_pgd(pgdval_t pgd) +__visible pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); return native_make_pgd(pgd); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); -static pmdval_t xen_pmd_val(pmd_t pmd) +__visible pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } @@ -580,7 +580,7 @@ static void xen_pmd_clear(pmd_t *pmdp) } #endif /* CONFIG_X86_PAE */ -static pmd_t xen_make_pmd(pmdval_t pmd) +__visible pmd_t xen_make_pmd(pmdval_t pmd) { pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); @@ -588,13 +588,13 @@ static pmd_t xen_make_pmd(pmdval_t pmd) PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); #if PAGETABLE_LEVELS == 4 -static pudval_t xen_pud_val(pud_t pud) +__visible pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); } PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); -static pud_t xen_make_pud(pudval_t pud) +__visible pud_t xen_make_pud(pudval_t pud) { pud = pte_pfn_to_mfn(pud); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dd5f905e33d5..0982233b9b84 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -35,7 +35,7 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; #ifdef CONFIG_X86_64 -extern const char nmi[]; +extern asmlinkage void nmi(void); #endif extern void xen_sysenter_target(void); extern void xen_syscall_target(void); @@ -577,7 +577,7 @@ void xen_enable_syscall(void) void xen_enable_nmi(void) { #ifdef CONFIG_X86_64 - if (register_callback(CALLBACKTYPE_nmi, nmi)) + if (register_callback(CALLBACKTYPE_nmi, (char *)nmi)) BUG(); #endif } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0e36cde12f7e..581521c843a5 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -106,7 +106,7 @@ static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; static bool xen_pvspin = true; -static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); |