// draconisplusplus/subprojects/openssl-3.0.8/generated-config/archs/linux-aarch64/asm/crypto/chacha/chacha-armv8.S
//
// 2035 lines
// 41 KiB
// ArmAsm

// Outside kernel builds the NEON dispatch in ChaCha20_ctr32 reads the
// capability word OPENSSL_armcap_P; arm_arch.h presumably supplies the
// capability bit definitions it tests -- TODO confirm against that header.
#ifndef __KERNEL__
# include "arm_arch.h"
.hidden OPENSSL_armcap_P
#endif
.text
.align 5
// Constant tables. They are laid out back to back and the code below
// addresses .Lone and .Lrot24 by stepping a pointer forward from .Lsigma,
// so their order and sizes must not change.
//
// .Lsigma: first four state words of every block. Stored as little-endian
// bytes the two quads spell the ASCII text: expand 32-byte k
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
// .Lone: per-lane block counter offsets. ChaCha20_neon loads all four words
// into v8; the 512-byte path loads only the first word (the value 1).
.Lone:
.long 1,2,3,4
// .Lrot24: byte-index table for tbl. Within each 32-bit lane the output
// bytes are taken from input bytes 3,0,1,2, i.e. a rotate right by 24
// (rotate left by 8).
.Lrot24:
.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII identification string:
// ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align 2
//----------------------------------------------------------------------
// ChaCha20_ctr32 -- XOR a buffer with ChaCha20 keystream.
//
// Arguments, as consumed by the code below:
//   x0 = output pointer
//   x1 = input pointer
//   x2 = length in bytes (zero returns immediately)
//   x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter/nonce block (state words 12-15)
// NOTE(review): upstream OpenSSL declares this routine as
//   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
//                       size_t len, const unsigned int key[8],
//                       const unsigned int counter[4]);
// confirm against the C header in this tree.
//
// Dispatch: lengths below 192 bytes always use the scalar code here; longer
// inputs branch to .LChaCha20_neon when the NEON capability bit is set
// (non-kernel builds only).
//
// Scalar register allocation:
//   x22-x28,x30  packed input state, two 32-bit words per register
//                (x22,x23 = sigma; x24-x27 = key; x28,x30 = counter block)
//   w5-w17,w19-w21  the 16 working state words (x18 is not used)
//   x4           double-round counter inside .Loop
//   x2           bytes remaining
// Stack: 96 bytes for x29,x30,x19-x28 plus 64 bytes of scratch at sp that
// the tail code uses to hold one keystream block.
//----------------------------------------------------------------------
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
// Nothing to do for an empty buffer; short inputs skip the NEON check.
cbz x2,.Labort
cmp x2,#192
b.lo .Lshort
#ifndef __KERNEL__
// Take the NEON path when the run-time capability word has the NEON bit set.
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
tst w17,#ARMV7_NEON
b.ne .LChaCha20_neon
#endif
.Lshort:
// Prologue. The pointer-authentication instruction is emitted as a raw
// opcode (presumably so assemblers without the mnemonic accept it).
// x30 is saved here because it is reused as a data register below.
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
// 64 bytes of scratch for the partial-block tail.
sub sp,sp,#64
ldp x22,x23,[x5] // load sigma
ldp x24,x25,[x3] // load key
ldp x26,x27,[x3,#16]
ldp x28,x30,[x4] // load counter
#ifdef __AARCH64EB__
// Big-endian only: swap the 32-bit halves of each loaded pair so that the
// unpacking below (low half first) sees the words in the same order as a
// little-endian build. Sigma needs no swap (see the endian-neutral note).
ror x24,x24,#32
ror x25,x25,#32
ror x26,x26,#32
ror x27,x27,#32
ror x28,x28,#32
ror x30,x30,#32
#endif
// One iteration of .Loop_outer produces one 64-byte block.
.Loop_outer:
mov w5,w22 // unpack key block
lsr x6,x22,#32
mov w7,w23
lsr x8,x23,#32
mov w9,w24
lsr x10,x24,#32
mov w11,w25
lsr x12,x25,#32
mov w13,w26
lsr x14,x26,#32
mov w15,w27
lsr x16,x27,#32
mov w17,w28
lsr x19,x28,#32
mov w20,w30
lsr x21,x30,#32
// 10 double rounds (column round + diagonal round) = 20 rounds.
mov x4,#10
// The flags set here are consumed much later by b.lo .Ltail and
// b.hi .Loop_outer; nothing in between may modify the condition flags.
subs x2,x2,#64
.Loop:
sub x4,x4,#1
// ---- column round: quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
// (w7,w11,w15,w20), (w8,w12,w16,w21), four in parallel ----
// a += b
add w5,w5,w9
add w6,w6,w10
add w7,w7,w11
add w8,w8,w12
// d ^= a
eor w17,w17,w5
eor w19,w19,w6
eor w20,w20,w7
eor w21,w21,w8
// d = rotate left 16 (ror 16)
ror w17,w17,#16
ror w19,w19,#16
ror w20,w20,#16
ror w21,w21,#16
// c += d
add w13,w13,w17
add w14,w14,w19
add w15,w15,w20
add w16,w16,w21
// b ^= c
eor w9,w9,w13
eor w10,w10,w14
eor w11,w11,w15
eor w12,w12,w16
// b = rotate left 12 (ror 20)
ror w9,w9,#20
ror w10,w10,#20
ror w11,w11,#20
ror w12,w12,#20
// a += b
add w5,w5,w9
add w6,w6,w10
add w7,w7,w11
add w8,w8,w12
// d ^= a
eor w17,w17,w5
eor w19,w19,w6
eor w20,w20,w7
eor w21,w21,w8
// d = rotate left 8 (ror 24)
ror w17,w17,#24
ror w19,w19,#24
ror w20,w20,#24
ror w21,w21,#24
// c += d
add w13,w13,w17
add w14,w14,w19
add w15,w15,w20
add w16,w16,w21
// b ^= c
eor w9,w9,w13
eor w10,w10,w14
eor w11,w11,w15
eor w12,w12,w16
// b = rotate left 7 (ror 25)
ror w9,w9,#25
ror w10,w10,#25
ror w11,w11,#25
ror w12,w12,#25
// ---- diagonal round: quarter-rounds on (w5,w10,w15,w21),
// (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20) ----
// a += b
add w5,w5,w10
add w6,w6,w11
add w7,w7,w12
add w8,w8,w9
// d ^= a
eor w21,w21,w5
eor w17,w17,w6
eor w19,w19,w7
eor w20,w20,w8
// d = rotate left 16
ror w21,w21,#16
ror w17,w17,#16
ror w19,w19,#16
ror w20,w20,#16
// c += d
add w15,w15,w21
add w16,w16,w17
add w13,w13,w19
add w14,w14,w20
// b ^= c
eor w10,w10,w15
eor w11,w11,w16
eor w12,w12,w13
eor w9,w9,w14
// b = rotate left 12
ror w10,w10,#20
ror w11,w11,#20
ror w12,w12,#20
ror w9,w9,#20
// a += b
add w5,w5,w10
add w6,w6,w11
add w7,w7,w12
add w8,w8,w9
// d ^= a
eor w21,w21,w5
eor w17,w17,w6
eor w19,w19,w7
eor w20,w20,w8
// d = rotate left 8
ror w21,w21,#24
ror w17,w17,#24
ror w19,w19,#24
ror w20,w20,#24
// c += d
add w15,w15,w21
add w16,w16,w17
add w13,w13,w19
add w14,w14,w20
// b ^= c
eor w10,w10,w15
eor w11,w11,w16
eor w12,w12,w13
eor w9,w9,w14
// b = rotate left 7
ror w10,w10,#25
ror w11,w11,#25
ror w12,w12,#25
ror w9,w9,#25
cbnz x4,.Loop
// Add the input state back in. Odd-numbered words use a 64-bit add of the
// upper half; any carry into bit 32 is discarded by the lsl 32 when the
// words are packed.
add w5,w5,w22 // accumulate key block
add x6,x6,x22,lsr#32
add w7,w7,w23
add x8,x8,x23,lsr#32
add w9,w9,w24
add x10,x10,x24,lsr#32
add w11,w11,w25
add x12,x12,x25,lsr#32
add w13,w13,w26
add x14,x14,x26,lsr#32
add w15,w15,w27
add x16,x16,x27,lsr#32
add w17,w17,w28
add x19,x19,x28,lsr#32
add w20,w20,w30
add x21,x21,x30,lsr#32
// Flags still come from the subs above: fewer than 64 bytes were left, so
// finish byte-by-byte.
b.lo .Ltail
// Full block: pack word pairs into 64-bit registers while loading 64 bytes
// of input into the registers just freed.
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
add x1,x1,#64
#ifdef __AARCH64EB__
// Big-endian only: byte-reverse the packed keystream before it is XORed
// with the 64-bit input loads.
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
// output = keystream XOR input
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor x15,x15,x16
eor x17,x17,x19
eor x20,x20,x21
stp x5,x7,[x0,#0] // store output
// 64-bit add on the packed pair: a carry out of the low counter word would
// propagate into the word held in the upper half of x28.
add x28,x28,#1 // increment counter
stp x9,x11,[x0,#16]
stp x13,x15,[x0,#32]
stp x17,x20,[x0,#48]
add x0,x0,#64
// Flags from the subs at the top of the block: loop while bytes remain.
b.hi .Loop_outer
// Epilogue: restore callee-saved registers through the frame pointer,
// release the scratch area and the frame.
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.inst 0xd50323bf // autiasp
.Labort:
ret
.align 4
// Partial final block. Undo the subs so x2 again holds the number of bytes
// left (less than 64).
.Ltail:
add x2,x2,#64
// Also entered from ChaCha20_neon with x2 below 64 and the scalar block
// accumulated (but not yet packed) in w5-w21.
.Less_than_64:
// Set up end-relative addressing: x1, x4 point one past the end of the
// input and of the stacked keystream, x2 becomes a negative index that
// counts up to zero. x0 is biased by -1 because the loop increments x2
// before the store.
sub x0,x0,#1
add x1,x1,x2
add x0,x0,x2
add x4,sp,x2
neg x2,x2
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
// Spill the keystream block to the stack scratch so it can be read per byte.
stp x5,x7,[sp,#0]
stp x9,x11,[sp,#16]
stp x13,x15,[sp,#32]
stp x17,x20,[sp,#48]
// out[i] = in[i] XOR keystream[i] for each remaining byte.
.Loop_tail:
ldrb w10,[x1,x2]
ldrb w11,[x4,x2]
add x2,x2,#1
eor w10,w10,w11
strb w10,[x0,x2]
cbnz x2,.Loop_tail
// Overwrite the spilled keystream with zeros before releasing the stack.
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
stp xzr,xzr,[sp,#32]
stp xzr,xzr,[sp,#48]
// Epilogue (same frame layout as above).
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.inst 0xd50323bf // autiasp
ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
//----------------------------------------------------------------------
// ChaCha20_neon -- NEON-assisted ChaCha20, 320 bytes per outer iteration.
//
// Entered from ChaCha20_ctr32 through .LChaCha20_neon with the same x0-x4
// arguments (x0 = out, x1 = in, x2 = length, x3 = key, x4 = counter block).
// The symbol is only made global in kernel builds.
//
// Each outer iteration computes five blocks at once:
//   * one block in general-purpose registers, using the same allocation as
//     the scalar routine (x22-x28,x30 packed input, w5-w21 working state);
//   * four blocks in v16-v31, held transposed during the rounds: every
//     vector holds one state word, one lane per block
//       v16,v20,v24,v28 = words 0-3     v17,v21,v25,v29 = words 4-7
//       v18,v22,v26,v30 = words 8-11    v19,v23,v27,v31 = words 12-15
// Other vector registers:
//   v0-v3  input state rows (sigma, key low, key high, counter block)
//   v4-v7  temporaries
//   v8     per-lane counter offsets, starts as .Lone and grows by 5 per pass
//   v9     .Lrot24 byte-permutation table for tbl
// Lengths of 512 bytes or more are handed to .L512_or_more_neon.
// Stack: same 96-byte frame as ChaCha20_ctr32 plus 64 bytes at sp, used to
// save d8,d9 and later as scratch for a partial block.
//----------------------------------------------------------------------
#ifdef __KERNEL__
.globl ChaCha20_neon
#endif
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
.LChaCha20_neon:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
// Large inputs continue in the 512-byte code, which shares this frame.
cmp x2,#512
b.hs .L512_or_more_neon
sub sp,sp,#64
// Load the input state twice: packed into x-registers for the scalar block
// and as rows v0-v3 for the vector blocks. The post-increment leaves x5
// pointing at .Lone.
ldp x22,x23,[x5] // load sigma
ld1 {v0.4s},[x5],#16
ldp x24,x25,[x3] // load key
ldp x26,x27,[x3,#16]
ld1 {v1.4s,v2.4s},[x3]
ldp x28,x30,[x4] // load counter
ld1 {v3.4s},[x4]
// The low halves of v8 and v9 are callee-saved, so keep them on the stack.
stp d8,d9,[sp] // meet ABI requirements
// v8 = .Lone (1,2,3,4), v9 = .Lrot24
ld1 {v8.4s,v9.4s},[x5]
#ifdef __AARCH64EB__
rev64 v0.4s,v0.4s
ror x24,x24,#32
ror x25,x25,#32
ror x26,x26,#32
ror x27,x27,#32
ror x28,x28,#32
ror x30,x30,#32
#endif
.Loop_outer_neon:
// Broadcast every state word across the four lanes of its vector while
// unpacking the scalar block into w5-w21.
dup v16.4s,v0.s[0] // unpack key block
mov w5,w22
dup v20.4s,v0.s[1]
lsr x6,x22,#32
dup v24.4s,v0.s[2]
mov w7,w23
dup v28.4s,v0.s[3]
lsr x8,x23,#32
dup v17.4s,v1.s[0]
mov w9,w24
dup v21.4s,v1.s[1]
lsr x10,x24,#32
dup v25.4s,v1.s[2]
mov w11,w25
dup v29.4s,v1.s[3]
lsr x12,x25,#32
dup v19.4s,v3.s[0]
mov w13,w26
dup v23.4s,v3.s[1]
lsr x14,x26,#32
dup v27.4s,v3.s[2]
mov w15,w27
dup v31.4s,v3.s[3]
lsr x16,x27,#32
// Give each vector lane its own block counter: word 12 + v8. The scalar
// block uses the counter in x28 unmodified, so it is the first block.
add v19.4s,v19.4s,v8.4s
mov w17,w28
dup v18.4s,v2.s[0]
lsr x19,x28,#32
dup v22.4s,v2.s[1]
mov w20,w30
dup v26.4s,v2.s[2]
lsr x21,x30,#32
dup v30.4s,v2.s[3]
// 10 double rounds.
mov x4,#10
// Flags set here are consumed by b.lo .Ltail_neon and b.hi .Loop_outer_neon
// far below; nothing in between may modify the condition flags.
subs x2,x2,#320
.Loop_neon:
sub x4,x4,#1
// ---- column round; vector and scalar instructions alternate ----
// a += b (vector and scalar)
add v16.4s,v16.4s,v17.4s
add w5,w5,w9
add v20.4s,v20.4s,v21.4s
add w6,w6,w10
add v24.4s,v24.4s,v25.4s
add w7,w7,w11
add v28.4s,v28.4s,v29.4s
add w8,w8,w12
// d ^= a
eor v19.16b,v19.16b,v16.16b
eor w17,w17,w5
eor v23.16b,v23.16b,v20.16b
eor w19,w19,w6
eor v27.16b,v27.16b,v24.16b
eor w20,w20,w7
eor v31.16b,v31.16b,v28.16b
eor w21,w21,w8
// d = rotate by 16: rev32 on halfwords for vectors, ror 16 for scalars
rev32 v19.8h,v19.8h
ror w17,w17,#16
rev32 v23.8h,v23.8h
ror w19,w19,#16
rev32 v27.8h,v27.8h
ror w20,w20,#16
rev32 v31.8h,v31.8h
ror w21,w21,#16
// c += d
add v18.4s,v18.4s,v19.4s
add w13,w13,w17
add v22.4s,v22.4s,v23.4s
add w14,w14,w19
add v26.4s,v26.4s,v27.4s
add w15,w15,w20
add v30.4s,v30.4s,v31.4s
add w16,w16,w21
// vector: t = b ^ c into v4-v7; scalar: b ^= c
eor v4.16b,v17.16b,v18.16b
eor w9,w9,w13
eor v5.16b,v21.16b,v22.16b
eor w10,w10,w14
eor v6.16b,v25.16b,v26.16b
eor w11,w11,w15
eor v7.16b,v29.16b,v30.16b
eor w12,w12,w16
// vector: b = t >> 20 (first half of rotate left 12); scalar: rotate left 12
ushr v17.4s,v4.4s,#20
ror w9,w9,#20
ushr v21.4s,v5.4s,#20
ror w10,w10,#20
ushr v25.4s,v6.4s,#20
ror w11,w11,#20
ushr v29.4s,v7.4s,#20
ror w12,w12,#20
// vector: sli inserts t << 12, completing the rotate; scalar: a += b
// (from here the scalar stream runs one step ahead of the vector stream)
sli v17.4s,v4.4s,#12
add w5,w5,w9
sli v21.4s,v5.4s,#12
add w6,w6,w10
sli v25.4s,v6.4s,#12
add w7,w7,w11
sli v29.4s,v7.4s,#12
add w8,w8,w12
// vector: a += b; scalar: d ^= a
add v16.4s,v16.4s,v17.4s
eor w17,w17,w5
add v20.4s,v20.4s,v21.4s
eor w19,w19,w6
add v24.4s,v24.4s,v25.4s
eor w20,w20,w7
add v28.4s,v28.4s,v29.4s
eor w21,w21,w8
// vector: t = d ^ a; scalar: d = rotate left 8
eor v4.16b,v19.16b,v16.16b
ror w17,w17,#24
eor v5.16b,v23.16b,v20.16b
ror w19,w19,#24
eor v6.16b,v27.16b,v24.16b
ror w20,w20,#24
eor v7.16b,v31.16b,v28.16b
ror w21,w21,#24
// vector: d = rotate left 8 of t via byte shuffle with v9; scalar: c += d
tbl v19.16b,{v4.16b},v9.16b
add w13,w13,w17
tbl v23.16b,{v5.16b},v9.16b
add w14,w14,w19
tbl v27.16b,{v6.16b},v9.16b
add w15,w15,w20
tbl v31.16b,{v7.16b},v9.16b
add w16,w16,w21
// vector: c += d; scalar: b ^= c
add v18.4s,v18.4s,v19.4s
eor w9,w9,w13
add v22.4s,v22.4s,v23.4s
eor w10,w10,w14
add v26.4s,v26.4s,v27.4s
eor w11,w11,w15
add v30.4s,v30.4s,v31.4s
eor w12,w12,w16
// vector: t = b ^ c; scalar: b = rotate left 7
eor v4.16b,v17.16b,v18.16b
ror w9,w9,#25
eor v5.16b,v21.16b,v22.16b
ror w10,w10,#25
eor v6.16b,v25.16b,v26.16b
ror w11,w11,#25
eor v7.16b,v29.16b,v30.16b
ror w12,w12,#25
// vector only: b = rotate left 7 of t (ushr 25 then sli 7)
ushr v17.4s,v4.4s,#25
ushr v21.4s,v5.4s,#25
ushr v25.4s,v6.4s,#25
ushr v29.4s,v7.4s,#25
sli v17.4s,v4.4s,#7
sli v21.4s,v5.4s,#7
sli v25.4s,v6.4s,#7
sli v29.4s,v7.4s,#7
// ---- diagonal round: the same steps with b, c, d taken one, two and
// three positions further along their rows ----
// a += b
add v16.4s,v16.4s,v21.4s
add w5,w5,w10
add v20.4s,v20.4s,v25.4s
add w6,w6,w11
add v24.4s,v24.4s,v29.4s
add w7,w7,w12
add v28.4s,v28.4s,v17.4s
add w8,w8,w9
// d ^= a
eor v31.16b,v31.16b,v16.16b
eor w21,w21,w5
eor v19.16b,v19.16b,v20.16b
eor w17,w17,w6
eor v23.16b,v23.16b,v24.16b
eor w19,w19,w7
eor v27.16b,v27.16b,v28.16b
eor w20,w20,w8
// d = rotate by 16
rev32 v31.8h,v31.8h
ror w21,w21,#16
rev32 v19.8h,v19.8h
ror w17,w17,#16
rev32 v23.8h,v23.8h
ror w19,w19,#16
rev32 v27.8h,v27.8h
ror w20,w20,#16
// c += d
add v26.4s,v26.4s,v31.4s
add w15,w15,w21
add v30.4s,v30.4s,v19.4s
add w16,w16,w17
add v18.4s,v18.4s,v23.4s
add w13,w13,w19
add v22.4s,v22.4s,v27.4s
add w14,w14,w20
// vector: t = b ^ c; scalar: b ^= c
eor v4.16b,v21.16b,v26.16b
eor w10,w10,w15
eor v5.16b,v25.16b,v30.16b
eor w11,w11,w16
eor v6.16b,v29.16b,v18.16b
eor w12,w12,w13
eor v7.16b,v17.16b,v22.16b
eor w9,w9,w14
// vector: b = t >> 20; scalar: b = rotate left 12
ushr v21.4s,v4.4s,#20
ror w10,w10,#20
ushr v25.4s,v5.4s,#20
ror w11,w11,#20
ushr v29.4s,v6.4s,#20
ror w12,w12,#20
ushr v17.4s,v7.4s,#20
ror w9,w9,#20
// vector: sli completes rotate left 12; scalar: a += b
sli v21.4s,v4.4s,#12
add w5,w5,w10
sli v25.4s,v5.4s,#12
add w6,w6,w11
sli v29.4s,v6.4s,#12
add w7,w7,w12
sli v17.4s,v7.4s,#12
add w8,w8,w9
// vector: a += b; scalar: d ^= a
add v16.4s,v16.4s,v21.4s
eor w21,w21,w5
add v20.4s,v20.4s,v25.4s
eor w17,w17,w6
add v24.4s,v24.4s,v29.4s
eor w19,w19,w7
add v28.4s,v28.4s,v17.4s
eor w20,w20,w8
// vector: t = d ^ a; scalar: d = rotate left 8
eor v4.16b,v31.16b,v16.16b
ror w21,w21,#24
eor v5.16b,v19.16b,v20.16b
ror w17,w17,#24
eor v6.16b,v23.16b,v24.16b
ror w19,w19,#24
eor v7.16b,v27.16b,v28.16b
ror w20,w20,#24
// vector: d = rotate left 8 of t via tbl; scalar: c += d
tbl v31.16b,{v4.16b},v9.16b
add w15,w15,w21
tbl v19.16b,{v5.16b},v9.16b
add w16,w16,w17
tbl v23.16b,{v6.16b},v9.16b
add w13,w13,w19
tbl v27.16b,{v7.16b},v9.16b
add w14,w14,w20
// vector: c += d; scalar: b ^= c
add v26.4s,v26.4s,v31.4s
eor w10,w10,w15
add v30.4s,v30.4s,v19.4s
eor w11,w11,w16
add v18.4s,v18.4s,v23.4s
eor w12,w12,w13
add v22.4s,v22.4s,v27.4s
eor w9,w9,w14
// vector: t = b ^ c; scalar: b = rotate left 7
eor v4.16b,v21.16b,v26.16b
ror w10,w10,#25
eor v5.16b,v25.16b,v30.16b
ror w11,w11,#25
eor v6.16b,v29.16b,v18.16b
ror w12,w12,#25
eor v7.16b,v17.16b,v22.16b
ror w9,w9,#25
// vector only: b = rotate left 7 of t
ushr v21.4s,v4.4s,#25
ushr v25.4s,v5.4s,#25
ushr v29.4s,v6.4s,#25
ushr v17.4s,v7.4s,#25
sli v21.4s,v4.4s,#7
sli v25.4s,v5.4s,#7
sli v29.4s,v6.4s,#7
sli v17.4s,v7.4s,#7
cbnz x4,.Loop_neon
// Add the per-lane counter offsets to word 12 while the data is still in
// word-per-vector form; the base counter row v3 is added after the
// transpose together with the rest of the input state.
add v19.4s,v19.4s,v8.4s
// 4x4 transposes: afterwards v16-v19 hold rows 0-3 of the first vector
// block, v20-v23 the second, v24-v27 the third and v28-v31 the fourth.
zip1 v4.4s,v16.4s,v20.4s // transpose data
zip1 v5.4s,v24.4s,v28.4s
zip2 v6.4s,v16.4s,v20.4s
zip2 v7.4s,v24.4s,v28.4s
zip1 v16.2d,v4.2d,v5.2d
zip2 v20.2d,v4.2d,v5.2d
zip1 v24.2d,v6.2d,v7.2d
zip2 v28.2d,v6.2d,v7.2d
zip1 v4.4s,v17.4s,v21.4s
zip1 v5.4s,v25.4s,v29.4s
zip2 v6.4s,v17.4s,v21.4s
zip2 v7.4s,v25.4s,v29.4s
zip1 v17.2d,v4.2d,v5.2d
zip2 v21.2d,v4.2d,v5.2d
zip1 v25.2d,v6.2d,v7.2d
zip2 v29.2d,v6.2d,v7.2d
// The remaining transposes are interleaved with adding the input state to
// the scalar block.
zip1 v4.4s,v18.4s,v22.4s
add w5,w5,w22 // accumulate key block
zip1 v5.4s,v26.4s,v30.4s
add x6,x6,x22,lsr#32
zip2 v6.4s,v18.4s,v22.4s
add w7,w7,w23
zip2 v7.4s,v26.4s,v30.4s
add x8,x8,x23,lsr#32
zip1 v18.2d,v4.2d,v5.2d
add w9,w9,w24
zip2 v22.2d,v4.2d,v5.2d
add x10,x10,x24,lsr#32
zip1 v26.2d,v6.2d,v7.2d
add w11,w11,w25
zip2 v30.2d,v6.2d,v7.2d
add x12,x12,x25,lsr#32
zip1 v4.4s,v19.4s,v23.4s
add w13,w13,w26
zip1 v5.4s,v27.4s,v31.4s
add x14,x14,x26,lsr#32
zip2 v6.4s,v19.4s,v23.4s
add w15,w15,w27
zip2 v7.4s,v27.4s,v31.4s
add x16,x16,x27,lsr#32
zip1 v19.2d,v4.2d,v5.2d
add w17,w17,w28
zip2 v23.2d,v4.2d,v5.2d
add x19,x19,x28,lsr#32
zip1 v27.2d,v6.2d,v7.2d
add w20,w20,w30
zip2 v31.2d,v6.2d,v7.2d
add x21,x21,x30,lsr#32
// Flags from the subs at the top: fewer than 320 bytes were left.
b.lo .Ltail_neon
// Full 320-byte pass. The scalar block covers the first 64 bytes; its
// packing and input loads are interleaved with adding rows v0-v3 to the
// vector blocks.
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add v16.4s,v16.4s,v0.4s // accumulate key block
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add v17.4s,v17.4s,v1.4s
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add v18.4s,v18.4s,v2.4s
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
add v19.4s,v19.4s,v3.4s
add x1,x1,#64
#ifdef __AARCH64EB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
// Input for the first vector block goes into the temporaries v4-v7.
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
eor x5,x5,x6
add v20.4s,v20.4s,v0.4s
eor x7,x7,x8
add v21.4s,v21.4s,v1.4s
eor x9,x9,x10
add v22.4s,v22.4s,v2.4s
eor x11,x11,x12
add v23.4s,v23.4s,v3.4s
eor x13,x13,x14
eor v16.16b,v16.16b,v4.16b
// v4 has been consumed, so reuse it for the constant 5.
movi v4.4s,#5
eor x15,x15,x16
eor v17.16b,v17.16b,v5.16b
eor x17,x17,x19
eor v18.16b,v18.16b,v6.16b
eor x20,x20,x21
eor v19.16b,v19.16b,v7.16b
// Five blocks were produced, so advance both counters (v8 and x28) by 5.
add v8.4s,v8.4s,v4.4s // += 5
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
stp x5,x7,[x0,#0] // store output
add x28,x28,#5 // increment counter
stp x9,x11,[x0,#16]
stp x13,x15,[x0,#32]
stp x17,x20,[x0,#48]
add x0,x0,#64
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
// Remaining vector blocks; registers already stored are reused as input
// buffers for the following block.
add v24.4s,v24.4s,v0.4s
add v25.4s,v25.4s,v1.4s
add v26.4s,v26.4s,v2.4s
add v27.4s,v27.4s,v3.4s
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
eor v20.16b,v20.16b,v4.16b
eor v21.16b,v21.16b,v5.16b
eor v22.16b,v22.16b,v6.16b
eor v23.16b,v23.16b,v7.16b
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
add v28.4s,v28.4s,v0.4s
add v29.4s,v29.4s,v1.4s
add v30.4s,v30.4s,v2.4s
add v31.4s,v31.4s,v3.4s
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
eor v24.16b,v24.16b,v16.16b
eor v25.16b,v25.16b,v17.16b
eor v26.16b,v26.16b,v18.16b
eor v27.16b,v27.16b,v19.16b
st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
eor v28.16b,v28.16b,v20.16b
eor v29.16b,v29.16b,v21.16b
eor v30.16b,v30.16b,v22.16b
eor v31.16b,v31.16b,v23.16b
st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
// Flags still from the subs at the top of the outer loop.
b.hi .Loop_outer_neon
// Length was an exact multiple of 320 bytes: restore and return.
ldp d8,d9,[sp] // meet ABI requirements
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.inst 0xd50323bf // autiasp
ret
.align 4
// Fewer than 320 bytes remain. Undo the subs so x2 is the byte count again.
// d8,d9 are restored right away: v8/v9 are not needed past this point, the
// stack slot is reused as scratch below, and the .Less_than_64 exit lives in
// ChaCha20_ctr32, whose epilogue does not restore them.
.Ltail_neon:
add x2,x2,#320
ldp d8,d9,[sp] // meet ABI requirements
cmp x2,#64
b.lo .Less_than_64
// At least 64 bytes: emit the scalar block first.
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
add x1,x1,#64
#ifdef __AARCH64EB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor x15,x15,x16
eor x17,x17,x19
eor x20,x20,x21
// Store the scalar block while finishing the first vector block in v16-v19.
stp x5,x7,[x0,#0] // store output
add v16.4s,v16.4s,v0.4s // accumulate key block
stp x9,x11,[x0,#16]
add v17.4s,v17.4s,v1.4s
stp x13,x15,[x0,#32]
add v18.4s,v18.4s,v2.4s
stp x17,x20,[x0,#48]
add v19.4s,v19.4s,v3.4s
add x0,x0,#64
// Flags still from the cmp above: exactly 64 bytes were left.
b.eq .Ldone_neon
// Ladder over the vector blocks. At each step v16-v19 hold the next
// keystream block; a full 64 bytes is XORed directly, a shorter remainder
// goes to .Last_neon. Each b.eq uses the flags of the preceding cmp.
sub x2,x2,#64
cmp x2,#64
b.lo .Last_neon
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
eor v16.16b,v16.16b,v4.16b
eor v17.16b,v17.16b,v5.16b
eor v18.16b,v18.16b,v6.16b
eor v19.16b,v19.16b,v7.16b
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
b.eq .Ldone_neon
// Second vector block: finish it into v16-v19.
add v16.4s,v20.4s,v0.4s
add v17.4s,v21.4s,v1.4s
sub x2,x2,#64
add v18.4s,v22.4s,v2.4s
cmp x2,#64
add v19.4s,v23.4s,v3.4s
b.lo .Last_neon
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
eor v20.16b,v16.16b,v4.16b
eor v21.16b,v17.16b,v5.16b
eor v22.16b,v18.16b,v6.16b
eor v23.16b,v19.16b,v7.16b
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
b.eq .Ldone_neon
// Third vector block.
add v16.4s,v24.4s,v0.4s
add v17.4s,v25.4s,v1.4s
sub x2,x2,#64
add v18.4s,v26.4s,v2.4s
cmp x2,#64
add v19.4s,v27.4s,v3.4s
b.lo .Last_neon
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
eor v24.16b,v16.16b,v4.16b
eor v25.16b,v17.16b,v5.16b
eor v26.16b,v18.16b,v6.16b
eor v27.16b,v19.16b,v7.16b
st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
b.eq .Ldone_neon
// Fourth vector block: whatever is left (fewer than 64 bytes) uses it.
add v16.4s,v28.4s,v0.4s
add v17.4s,v29.4s,v1.4s
add v18.4s,v30.4s,v2.4s
add v19.4s,v31.4s,v3.4s
sub x2,x2,#64
// Partial block: spill the keystream in v16-v19 to the stack scratch and
// XOR byte-by-byte with the same end-relative indexing as the scalar tail
// (x2 counts up from -len to zero, x0 is biased by -1).
.Last_neon:
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
sub x0,x0,#1
add x1,x1,x2
add x0,x0,x2
add x4,sp,x2
neg x2,x2
.Loop_tail_neon:
ldrb w10,[x1,x2]
ldrb w11,[x4,x2]
add x2,x2,#1
eor w10,w10,w11
strb w10,[x0,x2]
cbnz x2,.Loop_tail_neon
// Overwrite the spilled keystream with zeros before releasing the stack.
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
stp xzr,xzr,[sp,#32]
stp xzr,xzr,[sp,#48]
// Common exit for the tail paths; d8,d9 were already restored at .Ltail_neon.
.Ldone_neon:
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.inst 0xd50323bf // autiasp
ret
.size ChaCha20_neon,.-ChaCha20_neon
.type ChaCha20_512_neon,%function
.align 5
ChaCha20_512_neon:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
.L512_or_more_neon:
sub sp,sp,#128+64
eor v7.16b,v7.16b,v7.16b
ldp x22,x23,[x5] // load sigma
ld1 {v0.4s},[x5],#16
ldp x24,x25,[x3] // load key
ldp x26,x27,[x3,#16]
ld1 {v1.4s,v2.4s},[x3]
ldp x28,x30,[x4] // load counter
ld1 {v3.4s},[x4]
ld1 {v7.s}[0],[x5]
add x3,x5,#16 // .Lrot24
#ifdef __AARCH64EB__
rev64 v0.4s,v0.4s
ror x24,x24,#32
ror x25,x25,#32
ror x26,x26,#32
ror x27,x27,#32
ror x28,x28,#32
ror x30,x30,#32
#endif
add v3.4s,v3.4s,v7.4s // += 1
stp q0,q1,[sp,#0] // off-load key block, invariant part
add v3.4s,v3.4s,v7.4s // not typo
str q2,[sp,#32]
add v4.4s,v3.4s,v7.4s
add v5.4s,v4.4s,v7.4s
add v6.4s,v5.4s,v7.4s
shl v7.4s,v7.4s,#2 // 1 -> 4
stp d8,d9,[sp,#128+0] // meet ABI requirements
stp d10,d11,[sp,#128+16]
stp d12,d13,[sp,#128+32]
stp d14,d15,[sp,#128+48]
sub x2,x2,#512 // not typo
.Loop_outer_512_neon:
mov v8.16b,v0.16b
mov v12.16b,v0.16b
mov v16.16b,v0.16b
mov v20.16b,v0.16b
mov v24.16b,v0.16b
mov v28.16b,v0.16b
mov v9.16b,v1.16b
mov w5,w22 // unpack key block
mov v13.16b,v1.16b
lsr x6,x22,#32
mov v17.16b,v1.16b
mov w7,w23
mov v21.16b,v1.16b
lsr x8,x23,#32
mov v25.16b,v1.16b
mov w9,w24
mov v29.16b,v1.16b
lsr x10,x24,#32
mov v11.16b,v3.16b
mov w11,w25
mov v15.16b,v4.16b
lsr x12,x25,#32
mov v19.16b,v5.16b
mov w13,w26
mov v23.16b,v6.16b
lsr x14,x26,#32
mov v10.16b,v2.16b
mov w15,w27
mov v14.16b,v2.16b
lsr x16,x27,#32
add v27.4s,v11.4s,v7.4s // +4
mov w17,w28
add v31.4s,v15.4s,v7.4s // +4
lsr x19,x28,#32
mov v18.16b,v2.16b
mov w20,w30
mov v22.16b,v2.16b
lsr x21,x30,#32
mov v26.16b,v2.16b
stp q3,q4,[sp,#48] // off-load key block, variable part
mov v30.16b,v2.16b
stp q5,q6,[sp,#80]
mov x4,#5
ld1 {v6.4s},[x3]
subs x2,x2,#512
.Loop_upper_neon:
sub x4,x4,#1
add v8.4s,v8.4s,v9.4s
add w5,w5,w9
add v12.4s,v12.4s,v13.4s
add w6,w6,w10
add v16.4s,v16.4s,v17.4s
add w7,w7,w11
add v20.4s,v20.4s,v21.4s
add w8,w8,w12
add v24.4s,v24.4s,v25.4s
eor w17,w17,w5
add v28.4s,v28.4s,v29.4s
eor w19,w19,w6
eor v11.16b,v11.16b,v8.16b
eor w20,w20,w7
eor v15.16b,v15.16b,v12.16b
eor w21,w21,w8
eor v19.16b,v19.16b,v16.16b
ror w17,w17,#16
eor v23.16b,v23.16b,v20.16b
ror w19,w19,#16
eor v27.16b,v27.16b,v24.16b
ror w20,w20,#16
eor v31.16b,v31.16b,v28.16b
ror w21,w21,#16
rev32 v11.8h,v11.8h
add w13,w13,w17
rev32 v15.8h,v15.8h
add w14,w14,w19
rev32 v19.8h,v19.8h
add w15,w15,w20
rev32 v23.8h,v23.8h
add w16,w16,w21
rev32 v27.8h,v27.8h
eor w9,w9,w13
rev32 v31.8h,v31.8h
eor w10,w10,w14
add v10.4s,v10.4s,v11.4s
eor w11,w11,w15
add v14.4s,v14.4s,v15.4s
eor w12,w12,w16
add v18.4s,v18.4s,v19.4s
ror w9,w9,#20
add v22.4s,v22.4s,v23.4s
ror w10,w10,#20
add v26.4s,v26.4s,v27.4s
ror w11,w11,#20
add v30.4s,v30.4s,v31.4s
ror w12,w12,#20
eor v0.16b,v9.16b,v10.16b
add w5,w5,w9
eor v1.16b,v13.16b,v14.16b
add w6,w6,w10
eor v2.16b,v17.16b,v18.16b
add w7,w7,w11
eor v3.16b,v21.16b,v22.16b
add w8,w8,w12
eor v4.16b,v25.16b,v26.16b
eor w17,w17,w5
eor v5.16b,v29.16b,v30.16b
eor w19,w19,w6
ushr v9.4s,v0.4s,#20
eor w20,w20,w7
ushr v13.4s,v1.4s,#20
eor w21,w21,w8
ushr v17.4s,v2.4s,#20
ror w17,w17,#24
ushr v21.4s,v3.4s,#20
ror w19,w19,#24
ushr v25.4s,v4.4s,#20
ror w20,w20,#24
ushr v29.4s,v5.4s,#20
ror w21,w21,#24
sli v9.4s,v0.4s,#12
add w13,w13,w17
sli v13.4s,v1.4s,#12
add w14,w14,w19
sli v17.4s,v2.4s,#12
add w15,w15,w20
sli v21.4s,v3.4s,#12
add w16,w16,w21
sli v25.4s,v4.4s,#12
eor w9,w9,w13
sli v29.4s,v5.4s,#12
eor w10,w10,w14
add v8.4s,v8.4s,v9.4s
eor w11,w11,w15
add v12.4s,v12.4s,v13.4s
eor w12,w12,w16
add v16.4s,v16.4s,v17.4s
ror w9,w9,#25
add v20.4s,v20.4s,v21.4s
ror w10,w10,#25
add v24.4s,v24.4s,v25.4s
ror w11,w11,#25
add v28.4s,v28.4s,v29.4s
ror w12,w12,#25
eor v11.16b,v11.16b,v8.16b
add w5,w5,w10
eor v15.16b,v15.16b,v12.16b
add w6,w6,w11
eor v19.16b,v19.16b,v16.16b
add w7,w7,w12
eor v23.16b,v23.16b,v20.16b
add w8,w8,w9
eor v27.16b,v27.16b,v24.16b
eor w21,w21,w5
eor v31.16b,v31.16b,v28.16b
eor w17,w17,w6
tbl v11.16b,{v11.16b},v6.16b
eor w19,w19,w7
tbl v15.16b,{v15.16b},v6.16b
eor w20,w20,w8
tbl v19.16b,{v19.16b},v6.16b
ror w21,w21,#16
tbl v23.16b,{v23.16b},v6.16b
ror w17,w17,#16
tbl v27.16b,{v27.16b},v6.16b
ror w19,w19,#16
tbl v31.16b,{v31.16b},v6.16b
ror w20,w20,#16
add v10.4s,v10.4s,v11.4s
add w15,w15,w21
add v14.4s,v14.4s,v15.4s
add w16,w16,w17
add v18.4s,v18.4s,v19.4s
add w13,w13,w19
add v22.4s,v22.4s,v23.4s
add w14,w14,w20
add v26.4s,v26.4s,v27.4s
eor w10,w10,w15
add v30.4s,v30.4s,v31.4s
eor w11,w11,w16
eor v0.16b,v9.16b,v10.16b
eor w12,w12,w13
eor v1.16b,v13.16b,v14.16b
eor w9,w9,w14
eor v2.16b,v17.16b,v18.16b
ror w10,w10,#20
eor v3.16b,v21.16b,v22.16b
ror w11,w11,#20
eor v4.16b,v25.16b,v26.16b
ror w12,w12,#20
eor v5.16b,v29.16b,v30.16b
ror w9,w9,#20
ushr v9.4s,v0.4s,#25
add w5,w5,w10
ushr v13.4s,v1.4s,#25
add w6,w6,w11
ushr v17.4s,v2.4s,#25
add w7,w7,w12
ushr v21.4s,v3.4s,#25
add w8,w8,w9
ushr v25.4s,v4.4s,#25
eor w21,w21,w5
ushr v29.4s,v5.4s,#25
eor w17,w17,w6
sli v9.4s,v0.4s,#7
eor w19,w19,w7
sli v13.4s,v1.4s,#7
eor w20,w20,w8
sli v17.4s,v2.4s,#7
ror w21,w21,#24
sli v21.4s,v3.4s,#7
ror w17,w17,#24
sli v25.4s,v4.4s,#7
ror w19,w19,#24
sli v29.4s,v5.4s,#7
ror w20,w20,#24
ext v10.16b,v10.16b,v10.16b,#8
add w15,w15,w21
ext v14.16b,v14.16b,v14.16b,#8
add w16,w16,w17
ext v18.16b,v18.16b,v18.16b,#8
add w13,w13,w19
ext v22.16b,v22.16b,v22.16b,#8
add w14,w14,w20
ext v26.16b,v26.16b,v26.16b,#8
eor w10,w10,w15
ext v30.16b,v30.16b,v30.16b,#8
eor w11,w11,w16
ext v11.16b,v11.16b,v11.16b,#12
eor w12,w12,w13
ext v15.16b,v15.16b,v15.16b,#12
eor w9,w9,w14
ext v19.16b,v19.16b,v19.16b,#12
ror w10,w10,#25
ext v23.16b,v23.16b,v23.16b,#12
ror w11,w11,#25
ext v27.16b,v27.16b,v27.16b,#12
ror w12,w12,#25
ext v31.16b,v31.16b,v31.16b,#12
ror w9,w9,#25
ext v9.16b,v9.16b,v9.16b,#4
ext v13.16b,v13.16b,v13.16b,#4
ext v17.16b,v17.16b,v17.16b,#4
ext v21.16b,v21.16b,v21.16b,#4
ext v25.16b,v25.16b,v25.16b,#4
ext v29.16b,v29.16b,v29.16b,#4
add v8.4s,v8.4s,v9.4s
add w5,w5,w9
add v12.4s,v12.4s,v13.4s
add w6,w6,w10
add v16.4s,v16.4s,v17.4s
add w7,w7,w11
add v20.4s,v20.4s,v21.4s
add w8,w8,w12
add v24.4s,v24.4s,v25.4s
eor w17,w17,w5
add v28.4s,v28.4s,v29.4s
eor w19,w19,w6
eor v11.16b,v11.16b,v8.16b
eor w20,w20,w7
eor v15.16b,v15.16b,v12.16b
eor w21,w21,w8
eor v19.16b,v19.16b,v16.16b
ror w17,w17,#16
eor v23.16b,v23.16b,v20.16b
ror w19,w19,#16
eor v27.16b,v27.16b,v24.16b
ror w20,w20,#16
eor v31.16b,v31.16b,v28.16b
ror w21,w21,#16
rev32 v11.8h,v11.8h
add w13,w13,w17
rev32 v15.8h,v15.8h
add w14,w14,w19
rev32 v19.8h,v19.8h
add w15,w15,w20
rev32 v23.8h,v23.8h
add w16,w16,w21
rev32 v27.8h,v27.8h
eor w9,w9,w13
rev32 v31.8h,v31.8h
eor w10,w10,w14
add v10.4s,v10.4s,v11.4s
eor w11,w11,w15
add v14.4s,v14.4s,v15.4s
eor w12,w12,w16
add v18.4s,v18.4s,v19.4s
ror w9,w9,#20
add v22.4s,v22.4s,v23.4s
ror w10,w10,#20
add v26.4s,v26.4s,v27.4s
ror w11,w11,#20
add v30.4s,v30.4s,v31.4s
ror w12,w12,#20
eor v0.16b,v9.16b,v10.16b
add w5,w5,w9
eor v1.16b,v13.16b,v14.16b
add w6,w6,w10
eor v2.16b,v17.16b,v18.16b
add w7,w7,w11
eor v3.16b,v21.16b,v22.16b
add w8,w8,w12
eor v4.16b,v25.16b,v26.16b
eor w17,w17,w5
eor v5.16b,v29.16b,v30.16b
eor w19,w19,w6
ushr v9.4s,v0.4s,#20
eor w20,w20,w7
ushr v13.4s,v1.4s,#20
eor w21,w21,w8
ushr v17.4s,v2.4s,#20
ror w17,w17,#24
ushr v21.4s,v3.4s,#20
ror w19,w19,#24
ushr v25.4s,v4.4s,#20
ror w20,w20,#24
ushr v29.4s,v5.4s,#20
ror w21,w21,#24
sli v9.4s,v0.4s,#12
add w13,w13,w17
sli v13.4s,v1.4s,#12
add w14,w14,w19
sli v17.4s,v2.4s,#12
add w15,w15,w20
sli v21.4s,v3.4s,#12
add w16,w16,w21
sli v25.4s,v4.4s,#12
eor w9,w9,w13
sli v29.4s,v5.4s,#12
eor w10,w10,w14
add v8.4s,v8.4s,v9.4s
eor w11,w11,w15
add v12.4s,v12.4s,v13.4s
eor w12,w12,w16
add v16.4s,v16.4s,v17.4s
ror w9,w9,#25
add v20.4s,v20.4s,v21.4s
ror w10,w10,#25
add v24.4s,v24.4s,v25.4s
ror w11,w11,#25
add v28.4s,v28.4s,v29.4s
ror w12,w12,#25
eor v11.16b,v11.16b,v8.16b
add w5,w5,w10
eor v15.16b,v15.16b,v12.16b
add w6,w6,w11
eor v19.16b,v19.16b,v16.16b
add w7,w7,w12
eor v23.16b,v23.16b,v20.16b
add w8,w8,w9
eor v27.16b,v27.16b,v24.16b
eor w21,w21,w5
eor v31.16b,v31.16b,v28.16b
eor w17,w17,w6
tbl v11.16b,{v11.16b},v6.16b
eor w19,w19,w7
tbl v15.16b,{v15.16b},v6.16b
eor w20,w20,w8
tbl v19.16b,{v19.16b},v6.16b
ror w21,w21,#16
tbl v23.16b,{v23.16b},v6.16b
ror w17,w17,#16
tbl v27.16b,{v27.16b},v6.16b
ror w19,w19,#16
tbl v31.16b,{v31.16b},v6.16b
ror w20,w20,#16
add v10.4s,v10.4s,v11.4s
add w15,w15,w21
add v14.4s,v14.4s,v15.4s
add w16,w16,w17
add v18.4s,v18.4s,v19.4s
add w13,w13,w19
add v22.4s,v22.4s,v23.4s
add w14,w14,w20
add v26.4s,v26.4s,v27.4s
eor w10,w10,w15
add v30.4s,v30.4s,v31.4s
eor w11,w11,w16
eor v0.16b,v9.16b,v10.16b
eor w12,w12,w13
eor v1.16b,v13.16b,v14.16b
eor w9,w9,w14
eor v2.16b,v17.16b,v18.16b
ror w10,w10,#20
eor v3.16b,v21.16b,v22.16b
ror w11,w11,#20
eor v4.16b,v25.16b,v26.16b
ror w12,w12,#20
eor v5.16b,v29.16b,v30.16b
ror w9,w9,#20
ushr v9.4s,v0.4s,#25
add w5,w5,w10
ushr v13.4s,v1.4s,#25
add w6,w6,w11
ushr v17.4s,v2.4s,#25
add w7,w7,w12
ushr v21.4s,v3.4s,#25
add w8,w8,w9
ushr v25.4s,v4.4s,#25
eor w21,w21,w5
ushr v29.4s,v5.4s,#25
eor w17,w17,w6
sli v9.4s,v0.4s,#7
eor w19,w19,w7
sli v13.4s,v1.4s,#7
eor w20,w20,w8
sli v17.4s,v2.4s,#7
ror w21,w21,#24
sli v21.4s,v3.4s,#7
ror w17,w17,#24
sli v25.4s,v4.4s,#7
ror w19,w19,#24
sli v29.4s,v5.4s,#7
ror w20,w20,#24
ext v10.16b,v10.16b,v10.16b,#8
add w15,w15,w21
ext v14.16b,v14.16b,v14.16b,#8
add w16,w16,w17
ext v18.16b,v18.16b,v18.16b,#8
add w13,w13,w19
ext v22.16b,v22.16b,v22.16b,#8
add w14,w14,w20
ext v26.16b,v26.16b,v26.16b,#8
eor w10,w10,w15
ext v30.16b,v30.16b,v30.16b,#8
eor w11,w11,w16
ext v11.16b,v11.16b,v11.16b,#4
eor w12,w12,w13
ext v15.16b,v15.16b,v15.16b,#4
eor w9,w9,w14
ext v19.16b,v19.16b,v19.16b,#4
ror w10,w10,#25
ext v23.16b,v23.16b,v23.16b,#4
ror w11,w11,#25
ext v27.16b,v27.16b,v27.16b,#4
ror w12,w12,#25
ext v31.16b,v31.16b,v31.16b,#4
ror w9,w9,#25
ext v9.16b,v9.16b,v9.16b,#12
ext v13.16b,v13.16b,v13.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
ext v21.16b,v21.16b,v21.16b,#12
ext v25.16b,v25.16b,v25.16b,#12
ext v29.16b,v29.16b,v29.16b,#12
cbnz x4,.Loop_upper_neon
// ---- Scalar lane: emit one 64-byte block after the upper round loop ----
// w5-w17 and w19-w21 hold the sixteen 32-bit working words. x22-x28 and x30
// hold the input state packed two words per register (even word in the low
// half, odd word in the high half).
// Even words are added with 32-bit adds. Odd words use a 64-bit add of the
// shifted-down high half; any carry above bit 31 is dropped by the lsl#32
// in the pack step below.
add w5,w5,w22 // accumulate key block
add x6,x6,x22,lsr#32
add w7,w7,w23
add x8,x8,x23,lsr#32
add w9,w9,w24
add x10,x10,x24,lsr#32
add w11,w11,w25
add x12,x12,x25,lsr#32
add w13,w13,w26
add x14,x14,x26,lsr#32
add w15,w15,w27
add x16,x16,x27,lsr#32
add w17,w17,w28
add x19,x19,x28,lsr#32
add w20,w20,w30
add x21,x21,x30,lsr#32
// Merge each even/odd word pair into one 64-bit register (x5,x7,...,x20).
// Each odd-word register is reused for 16 bytes of input from x1 as soon as
// it has been merged.
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
// Input pointer moves past the 64 bytes just loaded.
add x1,x1,#64
// Big-endian builds byte-reverse each packed 64-bit keystream register
// before it is combined with the input.
#ifdef __AARCH64EB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
// output = keystream XOR input
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor x15,x15,x16
eor x17,x17,x19
eor x20,x20,x21
// Store the 64 output bytes to x0. Interleaved with the stores, the packed
// input state is split back into w5-w21 for the next scalar block; each
// working register is overwritten only after its packed value was stored.
stp x5,x7,[x0,#0] // store output
// 64-bit add on the packed register: the low word is the one unpacked into
// w17 below. NOTE(review): a wrap of that low word would carry into the high
// word of x28; presumably callers never let the 32-bit counter wrap within
// one call - confirm.
add x28,x28,#1 // increment counter
mov w5,w22 // unpack key block
lsr x6,x22,#32
stp x9,x11,[x0,#16]
mov w7,w23
lsr x8,x23,#32
stp x13,x15,[x0,#32]
mov w9,w24
lsr x10,x24,#32
stp x17,x20,[x0,#48]
// Output pointer moves past the 64 bytes just stored.
add x0,x0,#64
mov w11,w25
lsr x12,x25,#32
mov w13,w26
lsr x14,x26,#32
mov w15,w27
lsr x16,x27,#32
mov w17,w28
lsr x19,x28,#32
mov w20,w30
lsr x21,x30,#32
// Pass count for the lower round loop that follows.
mov x4,#5
// .Loop_lower_neon: second round loop of one 512-byte outer iteration.
// The fall-through entry sets x4 to 5 just before this label; every pass
// decrements x4 and the loop repeats while x4 is non-zero.
//
// Two independent instruction streams are interleaved line by line:
//   NEON   - one double round per pass on six 4x4 word states, each kept as
//            four row registers a/b/c/d: v8-v11, v12-v15, v16-v19, v20-v23,
//            v24-v27, v28-v31. v0-v5 are scratch. v6 is the tbl index.
//   scalar - two double rounds per pass on one state in w5-w8 (row a),
//            w9-w12 (row b), w13-w16 (row c), w17,w19,w20,w21 (row d).
//
// Rotations: scalar ror by 16/20/24/25 is a left-rotate by 16/12/8/7.
// NEON uses rev32 on halfwords (16), ushr+sli pairs (12 and 7) and tbl.
// NOTE(review): v6 is presumably the .Lrot24 byte permutation (left-rotate
// by 8); it is loaded before this loop - confirm.
//
// No instruction in this loop writes the condition flags.
.Loop_lower_neon:
sub x4,x4,#1
// ---- NEON double round, column step. Scalar double round 1, column step ----
// NEON: a += b
add v8.4s,v8.4s,v9.4s
add w5,w5,w9
add v12.4s,v12.4s,v13.4s
add w6,w6,w10
add v16.4s,v16.4s,v17.4s
add w7,w7,w11
add v20.4s,v20.4s,v21.4s
add w8,w8,w12
add v24.4s,v24.4s,v25.4s
eor w17,w17,w5
add v28.4s,v28.4s,v29.4s
eor w19,w19,w6
// NEON: d ^= a
eor v11.16b,v11.16b,v8.16b
eor w20,w20,w7
eor v15.16b,v15.16b,v12.16b
eor w21,w21,w8
eor v19.16b,v19.16b,v16.16b
ror w17,w17,#16
eor v23.16b,v23.16b,v20.16b
ror w19,w19,#16
eor v27.16b,v27.16b,v24.16b
ror w20,w20,#16
eor v31.16b,v31.16b,v28.16b
ror w21,w21,#16
// NEON: d = rotl(d,16), done by swapping the 16-bit halves of each word
rev32 v11.8h,v11.8h
add w13,w13,w17
rev32 v15.8h,v15.8h
add w14,w14,w19
rev32 v19.8h,v19.8h
add w15,w15,w20
rev32 v23.8h,v23.8h
add w16,w16,w21
rev32 v27.8h,v27.8h
eor w9,w9,w13
rev32 v31.8h,v31.8h
eor w10,w10,w14
// NEON: c += d
add v10.4s,v10.4s,v11.4s
eor w11,w11,w15
add v14.4s,v14.4s,v15.4s
eor w12,w12,w16
add v18.4s,v18.4s,v19.4s
ror w9,w9,#20
add v22.4s,v22.4s,v23.4s
ror w10,w10,#20
add v26.4s,v26.4s,v27.4s
ror w11,w11,#20
add v30.4s,v30.4s,v31.4s
ror w12,w12,#20
// NEON: t = b ^ c, with t in scratch v0-v5
eor v0.16b,v9.16b,v10.16b
add w5,w5,w9
eor v1.16b,v13.16b,v14.16b
add w6,w6,w10
eor v2.16b,v17.16b,v18.16b
add w7,w7,w11
eor v3.16b,v21.16b,v22.16b
add w8,w8,w12
eor v4.16b,v25.16b,v26.16b
eor w17,w17,w5
eor v5.16b,v29.16b,v30.16b
eor w19,w19,w6
// NEON: b = rotl(t,12), built as ushr by 20 followed by sli by 12
ushr v9.4s,v0.4s,#20
eor w20,w20,w7
ushr v13.4s,v1.4s,#20
eor w21,w21,w8
ushr v17.4s,v2.4s,#20
ror w17,w17,#24
ushr v21.4s,v3.4s,#20
ror w19,w19,#24
ushr v25.4s,v4.4s,#20
ror w20,w20,#24
ushr v29.4s,v5.4s,#20
ror w21,w21,#24
sli v9.4s,v0.4s,#12
add w13,w13,w17
sli v13.4s,v1.4s,#12
add w14,w14,w19
sli v17.4s,v2.4s,#12
add w15,w15,w20
sli v21.4s,v3.4s,#12
add w16,w16,w21
sli v25.4s,v4.4s,#12
eor w9,w9,w13
sli v29.4s,v5.4s,#12
eor w10,w10,w14
// NEON: a += b
add v8.4s,v8.4s,v9.4s
eor w11,w11,w15
add v12.4s,v12.4s,v13.4s
eor w12,w12,w16
add v16.4s,v16.4s,v17.4s
ror w9,w9,#25
add v20.4s,v20.4s,v21.4s
ror w10,w10,#25
add v24.4s,v24.4s,v25.4s
ror w11,w11,#25
add v28.4s,v28.4s,v29.4s
ror w12,w12,#25
// NEON: d ^= a
eor v11.16b,v11.16b,v8.16b
// scalar: diagonal step of double round 1 starts with the next w-register add
add w5,w5,w10
eor v15.16b,v15.16b,v12.16b
add w6,w6,w11
eor v19.16b,v19.16b,v16.16b
add w7,w7,w12
eor v23.16b,v23.16b,v20.16b
add w8,w8,w9
eor v27.16b,v27.16b,v24.16b
eor w21,w21,w5
eor v31.16b,v31.16b,v28.16b
eor w17,w17,w6
// NEON: d = byte permutation of d selected by v6
tbl v11.16b,{v11.16b},v6.16b
eor w19,w19,w7
tbl v15.16b,{v15.16b},v6.16b
eor w20,w20,w8
tbl v19.16b,{v19.16b},v6.16b
ror w21,w21,#16
tbl v23.16b,{v23.16b},v6.16b
ror w17,w17,#16
tbl v27.16b,{v27.16b},v6.16b
ror w19,w19,#16
tbl v31.16b,{v31.16b},v6.16b
ror w20,w20,#16
// NEON: c += d
add v10.4s,v10.4s,v11.4s
add w15,w15,w21
add v14.4s,v14.4s,v15.4s
add w16,w16,w17
add v18.4s,v18.4s,v19.4s
add w13,w13,w19
add v22.4s,v22.4s,v23.4s
add w14,w14,w20
add v26.4s,v26.4s,v27.4s
eor w10,w10,w15
add v30.4s,v30.4s,v31.4s
eor w11,w11,w16
// NEON: t = b ^ c
eor v0.16b,v9.16b,v10.16b
eor w12,w12,w13
eor v1.16b,v13.16b,v14.16b
eor w9,w9,w14
eor v2.16b,v17.16b,v18.16b
ror w10,w10,#20
eor v3.16b,v21.16b,v22.16b
ror w11,w11,#20
eor v4.16b,v25.16b,v26.16b
ror w12,w12,#20
eor v5.16b,v29.16b,v30.16b
ror w9,w9,#20
// NEON: b = rotl(t,7), built as ushr by 25 followed by sli by 7
ushr v9.4s,v0.4s,#25
add w5,w5,w10
ushr v13.4s,v1.4s,#25
add w6,w6,w11
ushr v17.4s,v2.4s,#25
add w7,w7,w12
ushr v21.4s,v3.4s,#25
add w8,w8,w9
ushr v25.4s,v4.4s,#25
eor w21,w21,w5
ushr v29.4s,v5.4s,#25
eor w17,w17,w6
sli v9.4s,v0.4s,#7
eor w19,w19,w7
sli v13.4s,v1.4s,#7
eor w20,w20,w8
sli v17.4s,v2.4s,#7
ror w21,w21,#24
sli v21.4s,v3.4s,#7
ror w17,w17,#24
sli v25.4s,v4.4s,#7
ror w19,w19,#24
sli v29.4s,v5.4s,#7
ror w20,w20,#24
// NEON: rotate the rows so that diagonals line up as columns:
// c by 8 bytes, d by 12 bytes, b by 4 bytes
ext v10.16b,v10.16b,v10.16b,#8
add w15,w15,w21
ext v14.16b,v14.16b,v14.16b,#8
add w16,w16,w17
ext v18.16b,v18.16b,v18.16b,#8
add w13,w13,w19
ext v22.16b,v22.16b,v22.16b,#8
add w14,w14,w20
ext v26.16b,v26.16b,v26.16b,#8
eor w10,w10,w15
ext v30.16b,v30.16b,v30.16b,#8
eor w11,w11,w16
ext v11.16b,v11.16b,v11.16b,#12
eor w12,w12,w13
ext v15.16b,v15.16b,v15.16b,#12
eor w9,w9,w14
ext v19.16b,v19.16b,v19.16b,#12
ror w10,w10,#25
ext v23.16b,v23.16b,v23.16b,#12
ror w11,w11,#25
ext v27.16b,v27.16b,v27.16b,#12
ror w12,w12,#25
ext v31.16b,v31.16b,v31.16b,#12
ror w9,w9,#25
ext v9.16b,v9.16b,v9.16b,#4
ext v13.16b,v13.16b,v13.16b,#4
ext v17.16b,v17.16b,v17.16b,#4
ext v21.16b,v21.16b,v21.16b,#4
ext v25.16b,v25.16b,v25.16b,#4
ext v29.16b,v29.16b,v29.16b,#4
// ---- NEON double round, diagonal step. Scalar double round 2, column step ----
// Same instruction sequence as the first half; only the final row rotation
// amounts differ.
// NEON: a += b
add v8.4s,v8.4s,v9.4s
add w5,w5,w9
add v12.4s,v12.4s,v13.4s
add w6,w6,w10
add v16.4s,v16.4s,v17.4s
add w7,w7,w11
add v20.4s,v20.4s,v21.4s
add w8,w8,w12
add v24.4s,v24.4s,v25.4s
eor w17,w17,w5
add v28.4s,v28.4s,v29.4s
eor w19,w19,w6
// NEON: d ^= a
eor v11.16b,v11.16b,v8.16b
eor w20,w20,w7
eor v15.16b,v15.16b,v12.16b
eor w21,w21,w8
eor v19.16b,v19.16b,v16.16b
ror w17,w17,#16
eor v23.16b,v23.16b,v20.16b
ror w19,w19,#16
eor v27.16b,v27.16b,v24.16b
ror w20,w20,#16
eor v31.16b,v31.16b,v28.16b
ror w21,w21,#16
// NEON: d = rotl(d,16)
rev32 v11.8h,v11.8h
add w13,w13,w17
rev32 v15.8h,v15.8h
add w14,w14,w19
rev32 v19.8h,v19.8h
add w15,w15,w20
rev32 v23.8h,v23.8h
add w16,w16,w21
rev32 v27.8h,v27.8h
eor w9,w9,w13
rev32 v31.8h,v31.8h
eor w10,w10,w14
// NEON: c += d
add v10.4s,v10.4s,v11.4s
eor w11,w11,w15
add v14.4s,v14.4s,v15.4s
eor w12,w12,w16
add v18.4s,v18.4s,v19.4s
ror w9,w9,#20
add v22.4s,v22.4s,v23.4s
ror w10,w10,#20
add v26.4s,v26.4s,v27.4s
ror w11,w11,#20
add v30.4s,v30.4s,v31.4s
ror w12,w12,#20
// NEON: t = b ^ c
eor v0.16b,v9.16b,v10.16b
add w5,w5,w9
eor v1.16b,v13.16b,v14.16b
add w6,w6,w10
eor v2.16b,v17.16b,v18.16b
add w7,w7,w11
eor v3.16b,v21.16b,v22.16b
add w8,w8,w12
eor v4.16b,v25.16b,v26.16b
eor w17,w17,w5
eor v5.16b,v29.16b,v30.16b
eor w19,w19,w6
// NEON: b = rotl(t,12)
ushr v9.4s,v0.4s,#20
eor w20,w20,w7
ushr v13.4s,v1.4s,#20
eor w21,w21,w8
ushr v17.4s,v2.4s,#20
ror w17,w17,#24
ushr v21.4s,v3.4s,#20
ror w19,w19,#24
ushr v25.4s,v4.4s,#20
ror w20,w20,#24
ushr v29.4s,v5.4s,#20
ror w21,w21,#24
sli v9.4s,v0.4s,#12
add w13,w13,w17
sli v13.4s,v1.4s,#12
add w14,w14,w19
sli v17.4s,v2.4s,#12
add w15,w15,w20
sli v21.4s,v3.4s,#12
add w16,w16,w21
sli v25.4s,v4.4s,#12
eor w9,w9,w13
sli v29.4s,v5.4s,#12
eor w10,w10,w14
// NEON: a += b
add v8.4s,v8.4s,v9.4s
eor w11,w11,w15
add v12.4s,v12.4s,v13.4s
eor w12,w12,w16
add v16.4s,v16.4s,v17.4s
ror w9,w9,#25
add v20.4s,v20.4s,v21.4s
ror w10,w10,#25
add v24.4s,v24.4s,v25.4s
ror w11,w11,#25
add v28.4s,v28.4s,v29.4s
ror w12,w12,#25
// NEON: d ^= a
eor v11.16b,v11.16b,v8.16b
// scalar: diagonal step of double round 2 starts with the next w-register add
add w5,w5,w10
eor v15.16b,v15.16b,v12.16b
add w6,w6,w11
eor v19.16b,v19.16b,v16.16b
add w7,w7,w12
eor v23.16b,v23.16b,v20.16b
add w8,w8,w9
eor v27.16b,v27.16b,v24.16b
eor w21,w21,w5
eor v31.16b,v31.16b,v28.16b
eor w17,w17,w6
// NEON: d = byte permutation of d selected by v6
tbl v11.16b,{v11.16b},v6.16b
eor w19,w19,w7
tbl v15.16b,{v15.16b},v6.16b
eor w20,w20,w8
tbl v19.16b,{v19.16b},v6.16b
ror w21,w21,#16
tbl v23.16b,{v23.16b},v6.16b
ror w17,w17,#16
tbl v27.16b,{v27.16b},v6.16b
ror w19,w19,#16
tbl v31.16b,{v31.16b},v6.16b
ror w20,w20,#16
// NEON: c += d
add v10.4s,v10.4s,v11.4s
add w15,w15,w21
add v14.4s,v14.4s,v15.4s
add w16,w16,w17
add v18.4s,v18.4s,v19.4s
add w13,w13,w19
add v22.4s,v22.4s,v23.4s
add w14,w14,w20
add v26.4s,v26.4s,v27.4s
eor w10,w10,w15
add v30.4s,v30.4s,v31.4s
eor w11,w11,w16
// NEON: t = b ^ c
eor v0.16b,v9.16b,v10.16b
eor w12,w12,w13
eor v1.16b,v13.16b,v14.16b
eor w9,w9,w14
eor v2.16b,v17.16b,v18.16b
ror w10,w10,#20
eor v3.16b,v21.16b,v22.16b
ror w11,w11,#20
eor v4.16b,v25.16b,v26.16b
ror w12,w12,#20
eor v5.16b,v29.16b,v30.16b
ror w9,w9,#20
// NEON: b = rotl(t,7)
ushr v9.4s,v0.4s,#25
add w5,w5,w10
ushr v13.4s,v1.4s,#25
add w6,w6,w11
ushr v17.4s,v2.4s,#25
add w7,w7,w12
ushr v21.4s,v3.4s,#25
add w8,w8,w9
ushr v25.4s,v4.4s,#25
eor w21,w21,w5
ushr v29.4s,v5.4s,#25
eor w17,w17,w6
sli v9.4s,v0.4s,#7
eor w19,w19,w7
sli v13.4s,v1.4s,#7
eor w20,w20,w8
sli v17.4s,v2.4s,#7
ror w21,w21,#24
sli v21.4s,v3.4s,#7
ror w17,w17,#24
sli v25.4s,v4.4s,#7
ror w19,w19,#24
sli v29.4s,v5.4s,#7
ror w20,w20,#24
// NEON: rotate the rows back to column order:
// c by 8 bytes, d by 4 bytes, b by 12 bytes
ext v10.16b,v10.16b,v10.16b,#8
add w15,w15,w21
ext v14.16b,v14.16b,v14.16b,#8
add w16,w16,w17
ext v18.16b,v18.16b,v18.16b,#8
add w13,w13,w19
ext v22.16b,v22.16b,v22.16b,#8
add w14,w14,w20
ext v26.16b,v26.16b,v26.16b,#8
eor w10,w10,w15
ext v30.16b,v30.16b,v30.16b,#8
eor w11,w11,w16
ext v11.16b,v11.16b,v11.16b,#4
eor w12,w12,w13
ext v15.16b,v15.16b,v15.16b,#4
eor w9,w9,w14
ext v19.16b,v19.16b,v19.16b,#4
ror w10,w10,#25
ext v23.16b,v23.16b,v23.16b,#4
ror w11,w11,#25
ext v27.16b,v27.16b,v27.16b,#4
ror w12,w12,#25
ext v31.16b,v31.16b,v31.16b,#4
ror w9,w9,#25
ext v9.16b,v9.16b,v9.16b,#12
ext v13.16b,v13.16b,v13.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
ext v21.16b,v21.16b,v21.16b,#12
ext v25.16b,v25.16b,v25.16b,#12
ext v29.16b,v29.16b,v29.16b,#12
// Repeat until the pass counter reaches zero.
cbnz x4,.Loop_lower_neon
// ---- After the lower round loop: finalize and emit 7 x 64 bytes ----
// Scalar and NEON streams stay interleaved line by line.
// Scalar: add the packed input state (x22-x28,x30) back into w5-w21, merge
// word pairs into 64-bit registers and load 64 input bytes from x1.
// NEON: reload v0-v6 from the stack off-load area (sp+0 .. sp+111); inside
// the round loop v0-v5 were scratch and v6 was the tbl index. Then add
// v0 to every a row, v2 to every c row and v1 to every b row.
add w5,w5,w22 // accumulate key block
ldp q0,q1,[sp,#0]
add x6,x6,x22,lsr#32
ldp q2,q3,[sp,#32]
add w7,w7,w23
ldp q4,q5,[sp,#64]
add x8,x8,x23,lsr#32
ldr q6,[sp,#96]
add v8.4s,v8.4s,v0.4s
add w9,w9,w24
add v12.4s,v12.4s,v0.4s
add x10,x10,x24,lsr#32
add v16.4s,v16.4s,v0.4s
add w11,w11,w25
add v20.4s,v20.4s,v0.4s
add x12,x12,x25,lsr#32
add v24.4s,v24.4s,v0.4s
add w13,w13,w26
add v28.4s,v28.4s,v0.4s
add x14,x14,x26,lsr#32
add v10.4s,v10.4s,v2.4s
add w15,w15,w27
add v14.4s,v14.4s,v2.4s
add x16,x16,x27,lsr#32
add v18.4s,v18.4s,v2.4s
add w17,w17,w28
add v22.4s,v22.4s,v2.4s
add x19,x19,x28,lsr#32
add v26.4s,v26.4s,v2.4s
add w20,w20,w30
add v30.4s,v30.4s,v2.4s
add x21,x21,x30,lsr#32
// NEON d rows get per-block addends: v3, v4, v5, v6 for the first four
// blocks, then v3 + v7 and v4 + v7 for the last two.
// NOTE(review): v7 is set outside this region; the existing comments call it
// the constant 4 - confirm.
add v27.4s,v27.4s,v7.4s // +4
add x5,x5,x6,lsl#32 // pack
add v31.4s,v31.4s,v7.4s // +4
add x7,x7,x8,lsl#32
add v11.4s,v11.4s,v3.4s
ldp x6,x8,[x1,#0] // load input
add v15.4s,v15.4s,v4.4s
add x9,x9,x10,lsl#32
add v19.4s,v19.4s,v5.4s
add x11,x11,x12,lsl#32
add v23.4s,v23.4s,v6.4s
ldp x10,x12,[x1,#16]
add v27.4s,v27.4s,v3.4s
add x13,x13,x14,lsl#32
add v31.4s,v31.4s,v4.4s
add x15,x15,x16,lsl#32
add v9.4s,v9.4s,v1.4s
ldp x14,x16,[x1,#32]
add v13.4s,v13.4s,v1.4s
add x17,x17,x19,lsl#32
add v17.4s,v17.4s,v1.4s
add x20,x20,x21,lsl#32
add v21.4s,v21.4s,v1.4s
ldp x19,x21,[x1,#48]
add v25.4s,v25.4s,v1.4s
// Input pointer moves past the 64 bytes consumed by the scalar block.
add x1,x1,#64
add v29.4s,v29.4s,v1.4s
// Big-endian builds byte-reverse the packed scalar keystream registers.
#ifdef __AARCH64EB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
// From here v0-v3 (and later the already-stored state registers) serve as
// input buffers: each ld1 fetches 64 input bytes and post-increments x1,
// each st1 writes 64 output bytes and post-increments x0.
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor v8.16b,v8.16b,v0.16b
eor x15,x15,x16
eor v9.16b,v9.16b,v1.16b
eor x17,x17,x19
eor v10.16b,v10.16b,v2.16b
eor x20,x20,x21
eor v11.16b,v11.16b,v3.16b
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
stp x5,x7,[x0,#0] // store output
// 64-bit add on the packed register whose low word feeds w17.
// Together with the earlier increment by 1 after the upper loop this
// advances the scalar counter by 8 per outer iteration.
add x28,x28,#7 // increment counter
stp x9,x11,[x0,#16]
stp x13,x15,[x0,#32]
stp x17,x20,[x0,#48]
add x0,x0,#64
st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
eor v12.16b,v12.16b,v0.16b
eor v13.16b,v13.16b,v1.16b
eor v14.16b,v14.16b,v2.16b
eor v15.16b,v15.16b,v3.16b
st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
eor v16.16b,v16.16b,v8.16b
// v0-v3 were overwritten by input data above; restore them from the stack
// copy for the next outer iteration.
ldp q0,q1,[sp,#0]
eor v17.16b,v17.16b,v9.16b
ldp q2,q3,[sp,#32]
eor v18.16b,v18.16b,v10.16b
eor v19.16b,v19.16b,v11.16b
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
eor v20.16b,v20.16b,v12.16b
eor v21.16b,v21.16b,v13.16b
eor v22.16b,v22.16b,v14.16b
eor v23.16b,v23.16b,v15.16b
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
eor v24.16b,v24.16b,v16.16b
eor v25.16b,v25.16b,v17.16b
eor v26.16b,v26.16b,v18.16b
eor v27.16b,v27.16b,v19.16b
st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
// v8 is free again (its input data was consumed above): v8 = v7 * 2.
shl v8.4s,v7.4s,#1 // 4 -> 8
eor v28.16b,v28.16b,v20.16b
eor v29.16b,v29.16b,v21.16b
eor v30.16b,v30.16b,v22.16b
eor v31.16b,v31.16b,v23.16b
st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
// Advance the four per-block addend vectors for the next outer iteration.
add v3.4s,v3.4s,v8.4s // += 8
add v4.4s,v4.4s,v8.4s
add v5.4s,v5.4s,v8.4s
add v6.4s,v6.4s,v8.4s
// NOTE(review): no instruction from the lower round loop down to here writes
// NZCV, so this branch tests flags set earlier in the outer loop, presumably
// by a subs on x2 at its head - confirm.
b.hs .Loop_outer_512_neon
// Fewer than 512 bytes were left for another full iteration. Undo the
// speculative subtraction; Z is set when nothing remains.
adds x2,x2,#512
ushr v7.4s,v7.4s,#1 // 4 -> 2
// The loads, stores and vector shift up to the b.eq leave the flags from
// the adds intact.
ldp d10,d11,[sp,#128+16] // meet ABI requirements
ldp d12,d13,[sp,#128+32]
ldp d14,d15,[sp,#128+48]
// Overwrite sp+0 .. sp+95 with copies of q0 (reloaded from sp+0 above).
// NOTE(review): the 16 bytes at sp+96 (source of q6) are not overwritten;
// they appear to hold a d-row addend rather than key words - confirm.
stp q0,q0,[sp,#0] // wipe off-load area
stp q0,q0,[sp,#32]
stp q0,q0,[sp,#64]
b.eq .Ldone_512_neon
// Tail handling: 1..511 bytes remain.
// NOTE(review): after the step back by 16, x3 presumably addresses .Lone, so
// the 32-byte ld1 below would fetch .Lone into v8 and the table that follows
// it into v9 - confirm against the setup of x3.
sub x3,x3,#16 // .Lone
cmp x2,#192
// Release the 128-byte off-load area.
add sp,sp,#128
sub v3.4s,v3.4s,v7.4s // -= 2
ld1 {v8.4s,v9.4s},[x3]
// Flags are still those of the cmp: at least 192 bytes left goes to the
// narrower NEON outer loop.
b.hs .Loop_outer_neon
// Fewer than 192 bytes left: restore d8/d9 (now at sp+0 after the release),
// clear v1-v6 and continue in the scalar outer loop.
ldp d8,d9,[sp,#0] // meet ABI requirements
eor v1.16b,v1.16b,v1.16b
eor v2.16b,v2.16b,v2.16b
eor v3.16b,v3.16b,v3.16b
eor v4.16b,v4.16b,v4.16b
eor v5.16b,v5.16b,v5.16b
eor v6.16b,v6.16b,v6.16b
b .Loop_outer
// .Ldone_512_neon: function exit, reached by the b.eq above when the
// remaining byte count is exactly zero. On that path d10-d15 have already
// been reloaded and sp still points at the 128-byte off-load area.
.Ldone_512_neon:
// d8/d9 sit directly above the off-load area.
ldp d8,d9,[sp,#128+0] // meet ABI requirements
// Callee-saved x19-x28 are reloaded relative to the frame pointer x29.
ldp x19,x20,[x29,#16]
// Release the off-load area (128 bytes) plus the 64-byte d8-d15 save area.
// NOTE(review): the post-indexed pop below assumes sp equals x29 after this
// adjustment, i.e. the prologue reserved 64 + 128 bytes below a 96-byte
// frame - confirm against the prologue.
add sp,sp,#128+64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
// Reload frame pointer and link register, then pop the 96-byte frame.
ldp x29,x30,[sp],#96
// Emitted as a raw opcode via .inst rather than a mnemonic.
.inst 0xd50323bf // autiasp
ret
.size ChaCha20_512_neon,.-ChaCha20_512_neon