summaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto/ghash-ce-core.S
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2018-07-29 16:52:30 +0200
committerHerbert Xu <herbert@gondor.apana.org.au>2018-08-07 17:26:23 +0800
commitf10dc56c64bb662822475304508c1ce99f194e70 (patch)
tree56cea69b3ba97105e84a7d28ac782000c745b8d6 /arch/arm64/crypto/ghash-ce-core.S
parent46d8c4b28652d35dc6cfb5adf7f54e102fc04384 (diff)
downloadlinux-f10dc56c64bb662822475304508c1ce99f194e70.tar.bz2
crypto: arm64 - revert NEON yield for fast AEAD implementations
As it turns out, checking the TIF_NEED_RESCHED flag after each iteration results in a significant performance regression (~10%) when running fast algorithms (i.e., ones that use special instructions and operate in the < 4 cycles per byte range) on in-order cores with comparatively slow memory accesses such as the Cortex-A53. Given the speed of these ciphers, and the fact that the page based nature of the AEAD scatterwalk API guarantees that the core NEON transform is never invoked with more than a single page's worth of input, we can estimate the worst case duration of any resulting scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages, processing a page's worth of input at 4 cycles per byte results in a delay of ~250 us, which is a reasonable upper bound. So let's remove the yield checks from the fused AES-CCM and AES-GCM routines entirely. This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3. Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...") Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...") Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r--arch/arm64/crypto/ghash-ce-core.S76
1 files changed, 25 insertions, 51 deletions
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index dcffb9e77589..c723647b37db 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
.endm
.macro pmull_gcm_do_crypt, enc
- frame_push 10
+ ld1 {SHASH.2d}, [x4]
+ ld1 {XL.2d}, [x1]
+ ldr x8, [x5, #8] // load lower counter
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
- mov x25, x6
- mov x26, x7
- .if \enc == 1
- ldr x27, [sp, #96] // first stacked arg
- .endif
-
- ldr x28, [x24, #8] // load lower counter
-CPU_LE( rev x28, x28 )
-
-0: mov x0, x25
- load_round_keys w26, x0
- ld1 {SHASH.2d}, [x23]
- ld1 {XL.2d}, [x20]
+ load_round_keys w7, x6
movi MASK.16b, #0xe1
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE( rev x8, x8 )
shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, SHASH.16b
.if \enc == 1
- ld1 {KS.16b}, [x27]
+ ldr x10, [sp]
+ ld1 {KS.16b}, [x10]
.endif
-1: ld1 {CTR.8b}, [x24] // load upper counter
- ld1 {INP.16b}, [x22], #16
- rev x9, x28
- add x28, x28, #1
- sub w19, w19, #1
+0: ld1 {CTR.8b}, [x5] // load upper counter
+ ld1 {INP.16b}, [x3], #16
+ rev x9, x8
+ add x8, x8, #1
+ sub w0, w0, #1
ins CTR.d[1], x9 // set lower counter
.if \enc == 1
eor INP.16b, INP.16b, KS.16b // encrypt input
- st1 {INP.16b}, [x21], #16
+ st1 {INP.16b}, [x2], #16
.endif
rev64 T1.16b, INP.16b
- cmp w26, #12
- b.ge 4f // AES-192/256?
+ cmp w7, #12
+ b.ge 2f // AES-192/256?
-2: enc_round CTR, v21
+1: enc_round CTR, v21
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE( rev x28, x28 )
.if \enc == 0
eor INP.16b, INP.16b, KS.16b
- st1 {INP.16b}, [x21], #16
+ st1 {INP.16b}, [x2], #16
.endif
- cbz w19, 3f
+ cbnz w0, 0b
- if_will_cond_yield_neon
- st1 {XL.2d}, [x20]
- .if \enc == 1
- st1 {KS.16b}, [x27]
- .endif
- do_cond_yield_neon
- b 0b
- endif_yield_neon
+CPU_LE( rev x8, x8 )
+ st1 {XL.2d}, [x1]
+ str x8, [x5, #8] // store lower counter
- b 1b
-
-3: st1 {XL.2d}, [x20]
.if \enc == 1
- st1 {KS.16b}, [x27]
+ st1 {KS.16b}, [x10]
.endif
-CPU_LE( rev x28, x28 )
- str x28, [x24, #8] // store lower counter
-
- frame_pop
ret
-4: b.eq 5f // AES-192?
+2: b.eq 3f // AES-192?
enc_round CTR, v17
enc_round CTR, v18
-5: enc_round CTR, v19
+3: enc_round CTR, v19
enc_round CTR, v20
- b 2b
+ b 1b
.endm
/*