diff options
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r-- | arch/x86/crypto/aesni-intel_asm.S | 896 |
1 files changed, 896 insertions, 0 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S new file mode 100644 index 000000000000..caba99601703 --- /dev/null +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -0,0 +1,896 @@ +/* + * Implement AES algorithm in Intel AES-NI instructions. + * + * The white paper of AES-NI instructions can be downloaded from: + * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf + * + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Vinodh Gopal <vinodh.gopal@intel.com> + * Kahraman Akdemir + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.text + +#define STATE1 %xmm0 +#define STATE2 %xmm4 +#define STATE3 %xmm5 +#define STATE4 %xmm6 +#define STATE STATE1 +#define IN1 %xmm1 +#define IN2 %xmm7 +#define IN3 %xmm8 +#define IN4 %xmm9 +#define IN IN1 +#define KEY %xmm2 +#define IV %xmm3 + +#define KEYP %rdi +#define OUTP %rsi +#define INP %rdx +#define LEN %rcx +#define IVP %r8 +#define KLEN %r9d +#define T1 %r10 +#define TKEYP T1 +#define T2 %r11 + +_key_expansion_128: +_key_expansion_256a: + pshufd $0b11111111, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + movaps %xmm0, (%rcx) + add $0x10, %rcx + ret + +_key_expansion_192a: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movaps %xmm2, %xmm5 + movaps %xmm2, %xmm6 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movaps %xmm0, %xmm1 + shufps $0b01000100, %xmm0, %xmm6 + movaps %xmm6, (%rcx) + shufps $0b01001110, %xmm2, %xmm1 + movaps %xmm1, 16(%rcx) + add $0x20, %rcx + ret + +_key_expansion_192b: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movaps %xmm2, %xmm5 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movaps %xmm0, (%rcx) + add $0x10, %rcx + ret + +_key_expansion_256b: + pshufd $0b10101010, %xmm1, %xmm1 + shufps $0b00010000, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + shufps $0b10001100, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + pxor %xmm1, %xmm2 + movaps %xmm2, (%rcx) + add $0x10, %rcx + ret + +/* + * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + * unsigned int key_len) + */ +ENTRY(aesni_set_key) + movups (%rsi), %xmm0 # user key (first 16 bytes) + movaps %xmm0, (%rdi) + lea 0x10(%rdi), %rcx # key addr + movl %edx, 480(%rdi) + pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x + cmp $24, %dl + jb .Lenc_key128 + je .Lenc_key192 + movups 0x10(%rsi), %xmm2 # other user key + movaps %xmm2, (%rcx) + add $0x10, %rcx + # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + call _key_expansion_256a + # aeskeygenassist $0x1, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + call _key_expansion_256b + # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + call _key_expansion_256a + # aeskeygenassist $0x2, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + call _key_expansion_256b + # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + call _key_expansion_256a + # aeskeygenassist $0x4, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + call _key_expansion_256b + # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + call _key_expansion_256a + # aeskeygenassist $0x8, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + call _key_expansion_256b + # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + call _key_expansion_256a + # aeskeygenassist $0x10, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + call _key_expansion_256b + # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + call _key_expansion_256a + # aeskeygenassist $0x20, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + call _key_expansion_256b + # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + call _key_expansion_256a + jmp .Ldec_key +.Lenc_key192: + movq 0x10(%rsi), %xmm2 # other user key + # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + call _key_expansion_192a + # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + call _key_expansion_192b + # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + call _key_expansion_192a + # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + call _key_expansion_192b + # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + call _key_expansion_192a + # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + call _key_expansion_192b + # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + call _key_expansion_192a + # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80 + call _key_expansion_192b + jmp .Ldec_key +.Lenc_key128: + # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + call _key_expansion_128 + # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + call _key_expansion_128 + # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + call _key_expansion_128 + # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + call _key_expansion_128 + # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + call _key_expansion_128 + # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + call _key_expansion_128 + # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40 + call _key_expansion_128 + # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80 + call _key_expansion_128 + # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b + call _key_expansion_128 + # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36 + call _key_expansion_128 +.Ldec_key: + sub $0x10, %rcx + movaps (%rdi), %xmm0 + movaps (%rcx), %xmm1 + movaps %xmm0, 240(%rcx) + movaps %xmm1, 240(%rdi) + add $0x10, %rdi + lea 240-16(%rcx), %rsi +.align 4 +.Ldec_key_loop: + movaps (%rdi), %xmm0 + # aesimc %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8 + movaps %xmm1, (%rsi) + add $0x10, %rdi + sub $0x10, %rsi + cmp %rcx, %rdi + jb .Ldec_key_loop + xor %rax, %rax + ret + +/* + * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) + */ +ENTRY(aesni_enc) + movl 480(KEYP), KLEN # key length + movups (INP), STATE # input + call _aesni_enc1 + movups STATE, (OUTP) # output + ret + +/* + * _aesni_enc1: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: round count + * STATE: initial state (input) + * output: + * STATE: finial state (output) + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_enc1: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE # round 0 + add $0x30, TKEYP + cmp $24, KLEN + jb .Lenc128 + lea 0x20(TKEYP), TKEYP + je .Lenc192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps -0x50(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 +.align 4 +.Lenc192: + movaps -0x40(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps -0x30(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 +.align 4 +.Lenc128: + movaps -0x20(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps -0x10(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps (TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x10(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x20(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x30(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x40(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x50(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x60(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x70(TKEYP), KEY + # aesenclast KEY, STATE # last round + .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 + ret + +/* + * _aesni_enc4: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: round count + * STATE1: initial state (input) + * STATE2 + * STATE3 + * STATE4 + * output: + * STATE1: finial state (output) + * STATE2 + * STATE3 + * STATE4 + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_enc4: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE1 # round 0 + pxor KEY, STATE2 + pxor KEY, STATE3 + pxor KEY, STATE4 + add $0x30, TKEYP + cmp $24, KLEN + jb .L4enc128 + lea 0x20(TKEYP), TKEYP + je .L4enc192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps -0x50(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 +#.align 4 +.L4enc192: + movaps -0x40(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps -0x30(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 +#.align 4 +.L4enc128: + movaps -0x20(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps -0x10(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps (TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x10(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x20(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x30(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x40(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x50(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x60(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x70(TKEYP), KEY + # aesenclast KEY, STATE1 # last round + .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 + # aesenclast KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 + # aesenclast KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdd, 0xea + # aesenclast KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2 + ret + +/* + * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) + */ +ENTRY(aesni_dec) + mov 480(KEYP), KLEN # key length + add $240, KEYP + movups (INP), STATE # input + call _aesni_dec1 + movups STATE, (OUTP) #output + ret + +/* + * _aesni_dec1: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: key length + * STATE: initial state (input) + * output: + * STATE: finial state (output) + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_dec1: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE # round 0 + add $0x30, TKEYP + cmp $24, KLEN + jb .Ldec128 + lea 0x20(TKEYP), TKEYP + je .Ldec192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps -0x50(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 +.align 4 +.Ldec192: + movaps -0x40(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps -0x30(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 +.align 4 +.Ldec128: + movaps -0x20(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps -0x10(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps (TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x10(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x20(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x30(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x40(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x50(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x60(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x70(TKEYP), KEY + # aesdeclast KEY, STATE # last round + .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 + ret + +/* + * _aesni_dec4: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: key length + * STATE1: initial state (input) + * STATE2 + * STATE3 + * STATE4 + * output: + * STATE1: finial state (output) + * STATE2 + * STATE3 + * STATE4 + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_dec4: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE1 # round 0 + pxor KEY, STATE2 + pxor KEY, STATE3 + pxor KEY, STATE4 + add $0x30, TKEYP + cmp $24, KLEN + jb .L4dec128 + lea 0x20(TKEYP), TKEYP + je .L4dec192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps -0x50(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 +.align 4 +.L4dec192: + movaps -0x40(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps -0x30(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 +.align 4 +.L4dec128: + movaps -0x20(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps -0x10(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps (TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x10(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x20(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x30(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x40(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x50(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x60(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x70(TKEYP), KEY + # aesdeclast KEY, STATE1 # last round + .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 + # aesdeclast KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 + # aesdeclast KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdf, 0xea + # aesdeclast KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2 + ret + +/* + * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len) + */ +ENTRY(aesni_ecb_enc) + test LEN, LEN # check length + jz .Lecb_enc_ret + mov 480(KEYP), KLEN + cmp $16, LEN + jb .Lecb_enc_ret + cmp $64, LEN + jb .Lecb_enc_loop1 +.align 4 +.Lecb_enc_loop4: + movups (INP), STATE1 + movups 0x10(INP), STATE2 + movups 0x20(INP), STATE3 + movups 0x30(INP), STATE4 + call _aesni_enc4 + movups STATE1, (OUTP) + movups STATE2, 0x10(OUTP) + movups STATE3, 0x20(OUTP) + movups STATE4, 0x30(OUTP) + sub $64, LEN + add $64, INP + add $64, OUTP + cmp $64, LEN + jge .Lecb_enc_loop4 + cmp $16, LEN + jb .Lecb_enc_ret +.align 4 +.Lecb_enc_loop1: + movups (INP), STATE1 + call _aesni_enc1 + movups STATE1, (OUTP) + sub $16, LEN + add $16, INP + add $16, OUTP + cmp $16, LEN + jge .Lecb_enc_loop1 +.Lecb_enc_ret: + ret + +/* + * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len); + */ +ENTRY(aesni_ecb_dec) + test LEN, LEN + jz .Lecb_dec_ret + mov 480(KEYP), KLEN + add $240, KEYP + cmp $16, LEN + jb .Lecb_dec_ret + cmp $64, LEN + jb .Lecb_dec_loop1 +.align 4 +.Lecb_dec_loop4: + movups (INP), STATE1 + movups 0x10(INP), STATE2 + movups 0x20(INP), STATE3 + movups 0x30(INP), STATE4 + call _aesni_dec4 + movups STATE1, (OUTP) + movups STATE2, 0x10(OUTP) + movups STATE3, 0x20(OUTP) + movups STATE4, 0x30(OUTP) + sub $64, LEN + add $64, INP + add $64, OUTP + cmp $64, LEN + jge .Lecb_dec_loop4 + cmp $16, LEN + jb .Lecb_dec_ret +.align 4 +.Lecb_dec_loop1: + movups (INP), STATE1 + call _aesni_dec1 + movups STATE1, (OUTP) + sub $16, LEN + add $16, INP + add $16, OUTP + cmp $16, LEN + jge .Lecb_dec_loop1 +.Lecb_dec_ret: + ret + +/* + * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +ENTRY(aesni_cbc_enc) + cmp $16, LEN + jb .Lcbc_enc_ret + mov 480(KEYP), KLEN + movups (IVP), STATE # load iv as initial state +.align 4 +.Lcbc_enc_loop: + movups (INP), IN # load input + pxor IN, STATE + call _aesni_enc1 + movups STATE, (OUTP) # store output + sub $16, LEN + add $16, INP + add $16, OUTP + cmp $16, LEN + jge .Lcbc_enc_loop + movups STATE, (IVP) +.Lcbc_enc_ret: + ret + +/* + * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +ENTRY(aesni_cbc_dec) + cmp $16, LEN + jb .Lcbc_dec_ret + mov 480(KEYP), KLEN + add $240, KEYP + movups (IVP), IV + cmp $64, LEN + jb .Lcbc_dec_loop1 +.align 4 +.Lcbc_dec_loop4: + movups (INP), IN1 + movaps IN1, STATE1 + movups 0x10(INP), IN2 + movaps IN2, STATE2 + movups 0x20(INP), IN3 + movaps IN3, STATE3 + movups 0x30(INP), IN4 + movaps IN4, STATE4 + call _aesni_dec4 + pxor IV, STATE1 + pxor IN1, STATE2 + pxor IN2, STATE3 + pxor IN3, STATE4 + movaps IN4, IV + movups STATE1, (OUTP) + movups STATE2, 0x10(OUTP) + movups STATE3, 0x20(OUTP) + movups STATE4, 0x30(OUTP) + sub $64, LEN + add $64, INP + add $64, OUTP + cmp $64, LEN + jge .Lcbc_dec_loop4 + cmp $16, LEN + jb .Lcbc_dec_ret +.align 4 +.Lcbc_dec_loop1: + movups (INP), IN + movaps IN, STATE + call _aesni_dec1 + pxor IV, STATE + movups STATE, (OUTP) + movaps IN, IV + sub $16, LEN + add $16, INP + add $16, OUTP + cmp $16, LEN + jge .Lcbc_dec_loop1 + movups IV, (IVP) +.Lcbc_dec_ret: + ret |