; draconisplusplus/subprojects/openssl-3.0.8/generated-config/archs/VC-WIN32/asm/crypto/poly1305/poly1305-x86.asm
;
; 1904 lines
; 37 KiB
; NASM
; Raw Normal View History
;
; 2025-02-20 14:49:18 -05:00
; Pick the code section declaration that matches the NASM output format
; in use (obj, win32, or anything else).
%ifidn __OUTPUT_FORMAT__,obj
section code use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
; NOTE(review): @feat.00 with bit 0 set presumably tells the Microsoft
; linker this object is SAFESEH-compatible -- confirm.
$@feat.00 equ 1
section .text code align=64
%else
section .text code
%endif
; _OPENSSL_ia32cap_P is declared with `common` in the .bss segment at the
; end of this file; presumably that is why the extern is left commented out.
;extern _OPENSSL_ia32cap_P
align 64
global  _poly1305_init
align   16
;-----------------------------------------------------------------------
; _poly1305_init
;
; Arguments are read from the stack (offsets valid after the four pushes):
;   [esp+20]  ctx   - state block; dwords at ctx+0..20 are cleared, and
;                     the masked key words are written to ctx+24..36
;   [esp+24]  key   - pointer to 16 bytes, or NULL
;   [esp+28]  func  - two dword slots that receive the addresses of the
;                     selected "blocks" and "emit" routines
;
; Returns eax = 0 if key is NULL (only the clearing is done), else 1.
; ebp/ebx/esi/edi are saved and restored; ecx/edx/flags are overwritten.
; The routine ends with a plain `ret`, so the caller removes the arguments.
;-----------------------------------------------------------------------
_poly1305_init:
L$_poly1305_init_begin:
        push    ebp
        push    ebx
        push    esi
        push    edi

        mov     edi, dword [esp+20]             ; edi = ctx
        mov     esi, dword [esp+24]             ; esi = key
        mov     ebp, dword [esp+28]             ; ebp = func table

        ; Clear the first six dwords of ctx (eax doubles as the
        ; "no key" return value of 0).
        xor     eax, eax
        mov     dword [edi],    eax
        mov     dword [edi+4],  eax
        mov     dword [edi+8],  eax
        mov     dword [edi+12], eax
        mov     dword [edi+16], eax
        mov     dword [edi+20], eax

        cmp     esi, 0
        je      near L$init_nokey

        ; call/pop yields the run-time address of L$init_pic in ebx so
        ; the routine addresses below are formed position-independently.
        call    L$init_pic
L$init_pic:
        pop     ebx

        ; Default choice: the scalar routines.
        lea     eax, [(_poly1305_blocks-L$init_pic)+ebx]
        lea     edx, [(_poly1305_emit-L$init_pic)+ebx]

        ; Both bits of 0x05000000 (bits 24 and 26) in the first
        ; capability word must be set to upgrade to the SSE2 routines.
        lea     edi, [_OPENSSL_ia32cap_P]
        mov     ecx, dword [edi]
        and     ecx, 0x05000000
        cmp     ecx, 0x05000000
        jne     near L$init_select_done
        lea     eax, [(__poly1305_blocks_sse2-L$init_pic)+ebx]
        lea     edx, [(__poly1305_emit_sse2-L$init_pic)+ebx]

        ; Bit 5 of the third capability word additionally upgrades the
        ; blocks routine to the AVX2 one (emit stays on the SSE2 variant).
        mov     ecx, dword [edi+8]
        test    ecx, 0x20
        jz      near L$init_select_done
        lea     eax, [(__poly1305_blocks_avx2-L$init_pic)+ebx]

L$init_select_done:
        mov     edi, dword [esp+20]             ; edi was reused above; reload ctx
        mov     dword [ebp],   eax              ; func[0] = blocks routine
        mov     dword [ebp+4], edx              ; func[1] = emit routine

        ; Load the four key words, mask them, and store them at ctx+24.
        mov     eax, dword [esi]
        mov     ebx, dword [esi+4]
        mov     ecx, dword [esi+8]
        mov     edx, dword [esi+12]
        and     eax, 0x0fffffff
        and     ebx, 0x0ffffffc
        and     ecx, 0x0ffffffc
        and     edx, 0x0ffffffc
        mov     dword [edi+24], eax
        mov     dword [edi+28], ebx
        mov     dword [edi+32], ecx
        mov     dword [edi+36], edx

        mov     eax, 1                          ; report that func[] was filled in
L$init_nokey:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret
global _poly1305_blocks
align 16
;-----------------------------------------------------------------------
; _poly1305_blocks -- scalar (integer-only) block routine.
;
; Stack arguments (offsets valid after the four pushes below):
;   [20+esp] ctx : 5-dword accumulator at ctx+0..16, key words at ctx+24..36
;   [24+esp] inp : input bytes
;   [28+esp] len : byte count; rounded DOWN to a multiple of 16 here
;   [32+esp] 4th argument: added (with carry) into the top accumulator
;            word for every 16-byte block
;
; L$enter_blocks is also a jump target for the SSE2/AVX2 block routines,
; which arrive with edi/esi/ecx loaded and the same four registers pushed.
; ebp/ebx/esi/edi are preserved; eax/ecx/edx and flags may be overwritten.
;-----------------------------------------------------------------------
_poly1305_blocks:
L$_poly1305_blocks_begin:
push ebp
push ebx
push esi
push edi
mov edi,DWORD [20+esp]
mov esi,DWORD [24+esp]
mov ecx,DWORD [28+esp]
L$enter_blocks:
; Round len down to whole 16-byte blocks. The mask has to be -16 (~15):
; the former -15 kept bit 0, so an odd len produced an end pointer that
; the 16-byte stride below could never hit, and the `jne` loop exit
; would not fire. This now matches the `and ecx,-16` in the SIMD entries.
and ecx,-16
jz NEAR L$003nodata
; 64-byte frame. Slots used below:
;   [esp],[12+esp],[16+esp]  accumulator words 0, 3, 4 for the current block
;   [20..28+esp]             low three words of the product
;   [36..48+esp]             key words k0..k3 (from ctx+24..36)
;   [52..60+esp]             k1..k3 each plus itself shifted right by 2
; The stack arguments are now at [84+esp] ctx, [88+esp] inp, [92+esp] len,
; [96+esp] 4th argument.
sub esp,64
mov eax,DWORD [24+edi]
mov ebx,DWORD [28+edi]
lea ebp,[ecx*1+esi]
mov ecx,DWORD [32+edi]
mov edx,DWORD [36+edi]
; The len argument slot is reused to hold the end-of-input pointer.
mov DWORD [92+esp],ebp
mov ebp,esi
mov DWORD [36+esp],eax
mov eax,ebx
shr eax,2
mov DWORD [40+esp],ebx
add eax,ebx
mov ebx,ecx
shr ebx,2
mov DWORD [44+esp],ecx
add ebx,ecx
mov ecx,edx
shr ecx,2
mov DWORD [48+esp],edx
add ecx,edx
mov DWORD [52+esp],eax
mov DWORD [56+esp],ebx
mov DWORD [60+esp],ecx
; Load the accumulator: eax,ebx,ecx,esi,edi = words 0..4 (edi stops being ctx).
mov eax,DWORD [edi]
mov ebx,DWORD [4+edi]
mov ecx,DWORD [8+edi]
mov esi,DWORD [12+edi]
mov edi,DWORD [16+edi]
jmp NEAR L$004loop
align 32
L$004loop:
; accumulator += next 16 input bytes; top word also takes the 4th argument.
add eax,DWORD [ebp]
adc ebx,DWORD [4+ebp]
adc ecx,DWORD [8+ebp]
adc esi,DWORD [12+ebp]
; lea advances the input pointer without disturbing the carry chain.
lea ebp,[16+ebp]
adc edi,DWORD [96+esp]
mov DWORD [esp],eax
mov DWORD [12+esp],esi
; Product word 0 (accumulated in esi:edi, low part saved to [20+esp]).
mul DWORD [36+esp]
mov DWORD [16+esp],edi
mov edi,eax
mov eax,ebx
mov esi,edx
mul DWORD [60+esp]
add edi,eax
mov eax,ecx
adc esi,edx
mul DWORD [56+esp]
add edi,eax
mov eax,DWORD [12+esp]
adc esi,edx
mul DWORD [52+esp]
add edi,eax
mov eax,DWORD [esp]
adc esi,edx
; Product word 1 (accumulated in edi:esi, low part saved to [24+esp]).
mul DWORD [40+esp]
mov DWORD [20+esp],edi
xor edi,edi
add esi,eax
mov eax,ebx
adc edi,edx
mul DWORD [36+esp]
add esi,eax
mov eax,ecx
adc edi,edx
mul DWORD [60+esp]
add esi,eax
mov eax,DWORD [12+esp]
adc edi,edx
mul DWORD [56+esp]
add esi,eax
mov eax,DWORD [16+esp]
adc edi,edx
imul eax,DWORD [52+esp]
add esi,eax
mov eax,DWORD [esp]
adc edi,0
; Product word 2 (accumulated in esi:edi, low part saved to [28+esp]).
mul DWORD [44+esp]
mov DWORD [24+esp],esi
xor esi,esi
add edi,eax
mov eax,ebx
adc esi,edx
mul DWORD [40+esp]
add edi,eax
mov eax,ecx
adc esi,edx
mul DWORD [36+esp]
add edi,eax
mov eax,DWORD [12+esp]
adc esi,edx
mul DWORD [60+esp]
add edi,eax
mov eax,DWORD [16+esp]
adc esi,edx
imul eax,DWORD [56+esp]
add edi,eax
mov eax,DWORD [esp]
adc esi,0
; Product word 3 (left in esi) and the top word (built in edx).
mul DWORD [48+esp]
mov DWORD [28+esp],edi
xor edi,edi
add esi,eax
mov eax,ebx
adc edi,edx
mul DWORD [44+esp]
add esi,eax
mov eax,ecx
adc edi,edx
mul DWORD [40+esp]
add esi,eax
mov eax,DWORD [12+esp]
adc edi,edx
mul DWORD [36+esp]
add esi,eax
mov ecx,DWORD [16+esp]
adc edi,edx
mov edx,ecx
imul ecx,DWORD [60+esp]
add esi,ecx
mov eax,DWORD [20+esp]
adc edi,0
imul edx,DWORD [36+esp]
add edx,edi
mov ebx,DWORD [24+esp]
mov ecx,DWORD [28+esp]
; Fold the top word: keep its low 2 bits as accumulator word 4 and add
; 5 * (top >> 2) back into word 0, rippling the carry upward.
mov edi,edx
shr edx,2
and edi,3
lea edx,[edx*4+edx]
add eax,edx
adc ebx,0
adc ecx,0
adc esi,0
adc edi,0
; Loop until the input pointer reaches the end pointer stored at [92+esp].
cmp ebp,DWORD [92+esp]
jne NEAR L$004loop
; Reload ctx from its stack argument, drop the frame, write the accumulator back.
mov edx,DWORD [84+esp]
add esp,64
mov DWORD [edx],eax
mov DWORD [4+edx],ebx
mov DWORD [8+edx],ecx
mov DWORD [12+edx],esi
mov DWORD [16+edx],edi
L$003nodata:
pop edi
pop esi
pop ebx
pop ebp
ret
global  _poly1305_emit
align   16
;-----------------------------------------------------------------------
; _poly1305_emit
;
; Stack arguments (offsets valid after the four pushes):
;   [esp+20]  ctx    - five accumulator dwords are read from ctx+0..16
;   [esp+24]  out    - receives 16 bytes
;   [esp+28]  addend - four dwords added (with carry) to the selected value
;
; L$enter_emit is also entered by jump from __poly1305_emit_sse2, which
; arrives with ebp = ctx and the same four registers already pushed.
; ebp/ebx/esi/edi are preserved; eax/ecx/edx and flags are overwritten.
;-----------------------------------------------------------------------
_poly1305_emit:
L$_poly1305_emit_begin:
        push    ebp
        push    ebx
        push    esi
        push    edi
        mov     ebp, dword [esp+20]             ; ebp = ctx
L$enter_emit:
        mov     edi, dword [esp+24]             ; edi = out

        ; candidate = accumulator + 5, carried through all five words
        mov     eax, dword [ebp]
        mov     ebx, dword [ebp+4]
        mov     ecx, dword [ebp+8]
        mov     edx, dword [ebp+12]
        mov     esi, dword [ebp+16]
        add     eax, 5
        adc     ebx, 0
        adc     ecx, 0
        adc     edx, 0
        adc     esi, 0

        ; esi = -(top word >> 2): used as an AND mask selecting the candidate
        shr     esi, 2
        neg     esi

        ; Park (candidate & mask) in the output buffer.
        and     eax, esi
        and     ebx, esi
        and     ecx, esi
        and     edx, esi
        mov     dword [edi],    eax
        mov     dword [edi+4],  ebx
        mov     dword [edi+8],  ecx
        mov     dword [edi+12], edx

        ; Complement the mask and apply it to the unmodified accumulator,
        ; then merge the two halves: a branch-free select.
        not     esi
        mov     eax, dword [ebp]
        mov     ebx, dword [ebp+4]
        mov     ecx, dword [ebp+8]
        mov     edx, dword [ebp+12]
        mov     ebp, dword [esp+28]             ; ebp = addend (ctx no longer needed)
        and     eax, esi
        and     ebx, esi
        and     ecx, esi
        and     edx, esi
        or      eax, dword [edi]
        or      ebx, dword [edi+4]
        or      ecx, dword [edi+8]
        or      edx, dword [edi+12]

        ; Add the four addend words; the carry out of the top word is dropped.
        add     eax, dword [ebp]
        adc     ebx, dword [ebp+4]
        adc     ecx, dword [ebp+8]
        adc     edx, dword [ebp+12]
        mov     dword [edi],    eax
        mov     dword [edi+4],  ebx
        mov     dword [edi+8],  ecx
        mov     dword [edi+12], edx

        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret
;-----------------------------------------------------------------------
; __poly1305_init_sse2 -- internal helper, reached by `call` from
; __poly1305_blocks_sse2 (it takes its inputs in registers, not on the stack).
;   In:   edi = ctx (16 bytes are read from ctx+24)
;         ebx = address of L$const_sse2 (mask is read from [64+ebx])
;   Out:  nine 16-byte rows written to ctx+48 .. ctx+191
;         edi and esp are restored before returning
;   Uses: a 224-byte, 16-byte-aligned scratch frame; ebp keeps the
;         caller's esp meanwhile. Overwrites ebp, ecx, edx, xmm0-xmm7, flags.
; NOTE(review): the rows look like powers of the key in base 2^26 plus
; their 5x multiples -- confirm against the generator script.
;-----------------------------------------------------------------------
align 32
align 16
__poly1305_init_sse2:
movdqu xmm4,[24+edi]
lea edi,[48+edi]
mov ebp,esp
sub esp,224
and esp,-16
; Split the 128-bit value into five limbs at bit offsets 0, 26, 52, 78
; and 104; the first four are masked with the 26-bit mask from [64+ebx].
movq xmm7,[64+ebx]
movdqa xmm0,xmm4
movdqa xmm1,xmm4
movdqa xmm2,xmm4
pand xmm0,xmm7
psrlq xmm1,26
psrldq xmm2,6
pand xmm1,xmm7
movdqa xmm3,xmm2
psrlq xmm2,4
psrlq xmm3,30
pand xmm2,xmm7
pand xmm3,xmm7
psrldq xmm4,13
lea edx,[144+esp]
; ecx counts the passes through L$005square (two at most).
mov ecx,2
L$005square:
; Save the five limbs at [esp..64+esp] and limb*5 (x<<2 + x) for limbs
; 1..4 at [80..128+esp].
movdqa [esp],xmm0
movdqa [16+esp],xmm1
movdqa [32+esp],xmm2
movdqa [48+esp],xmm3
movdqa [64+esp],xmm4
movdqa xmm6,xmm1
movdqa xmm5,xmm2
pslld xmm6,2
pslld xmm5,2
paddd xmm6,xmm1
paddd xmm5,xmm2
movdqa [80+esp],xmm6
movdqa [96+esp],xmm5
movdqa xmm6,xmm3
movdqa xmm5,xmm4
pslld xmm6,2
pslld xmm5,2
paddd xmm6,xmm3
paddd xmm5,xmm4
movdqa [112+esp],xmm6
movdqa [128+esp],xmm5
; pshufd 68 copies the low qword into both halves; the broadcast limbs
; go to [edx..64+edx] and serve as the second multiplicand.
pshufd xmm6,xmm0,68
movdqa xmm5,xmm1
pshufd xmm1,xmm1,68
pshufd xmm2,xmm2,68
pshufd xmm3,xmm3,68
pshufd xmm4,xmm4,68
movdqa [edx],xmm6
movdqa [16+edx],xmm1
movdqa [32+edx],xmm2
movdqa [48+edx],xmm3
movdqa [64+edx],xmm4
; 5x5 limb multiplication; sums accumulate in xmm0..xmm4.
pmuludq xmm4,xmm0
pmuludq xmm3,xmm0
pmuludq xmm2,xmm0
pmuludq xmm1,xmm0
pmuludq xmm0,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[48+edx]
movdqa xmm7,xmm6
pmuludq xmm6,[32+edx]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+edx]
paddq xmm3,xmm6
movdqa xmm6,[80+esp]
pmuludq xmm5,[edx]
paddq xmm2,xmm7
pmuludq xmm6,[64+edx]
movdqa xmm7,[32+esp]
paddq xmm1,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[32+edx]
paddq xmm0,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[16+edx]
paddq xmm4,xmm7
movdqa xmm7,[96+esp]
pmuludq xmm6,[edx]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[64+edx]
paddq xmm2,xmm6
pmuludq xmm5,[48+edx]
movdqa xmm6,[48+esp]
paddq xmm1,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[16+edx]
paddq xmm0,xmm5
movdqa xmm5,[112+esp]
pmuludq xmm7,[edx]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[64+edx]
paddq xmm3,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[48+edx]
paddq xmm2,xmm5
pmuludq xmm7,[32+edx]
movdqa xmm5,[64+esp]
paddq xmm1,xmm6
movdqa xmm6,[128+esp]
pmuludq xmm5,[edx]
paddq xmm0,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[64+edx]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+edx]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[32+edx]
paddq xmm0,xmm7
pmuludq xmm6,[48+edx]
; Reload the 26-bit mask and propagate carries between limbs; the carry
; out of the top limb is multiplied by 5 (c + (c<<2)) and added to limb 0.
movdqa xmm7,[64+ebx]
paddq xmm1,xmm5
paddq xmm2,xmm6
movdqa xmm5,xmm3
pand xmm3,xmm7
psrlq xmm5,26
paddq xmm5,xmm4
movdqa xmm6,xmm0
pand xmm0,xmm7
psrlq xmm6,26
movdqa xmm4,xmm5
paddq xmm6,xmm1
psrlq xmm5,26
pand xmm4,xmm7
movdqa xmm1,xmm6
psrlq xmm6,26
paddd xmm0,xmm5
psllq xmm5,2
paddq xmm6,xmm2
paddq xmm5,xmm0
pand xmm1,xmm7
movdqa xmm2,xmm6
psrlq xmm6,26
pand xmm2,xmm7
paddd xmm6,xmm3
movdqa xmm0,xmm5
psrlq xmm5,26
movdqa xmm3,xmm6
psrlq xmm6,26
pand xmm0,xmm7
paddd xmm1,xmm5
pand xmm3,xmm7
paddd xmm4,xmm6
dec ecx
jz NEAR L$006square_break
; First pass finished: pair the new limbs (low qword) with the limbs
; saved at [esp..] (high qword) and run the multiplication once more.
punpcklqdq xmm0,[esp]
punpcklqdq xmm1,[16+esp]
punpcklqdq xmm2,[32+esp]
punpcklqdq xmm3,[48+esp]
punpcklqdq xmm4,[64+esp]
jmp NEAR L$005square
L$006square_break:
; Merge the second-pass results with the saved limbs into dword lanes,
; reorder the lanes with pshufd 141, and store the five rows at
; ctx+48 (edi was advanced by 48 on entry).
psllq xmm0,32
psllq xmm1,32
psllq xmm2,32
psllq xmm3,32
psllq xmm4,32
por xmm0,[esp]
por xmm1,[16+esp]
por xmm2,[32+esp]
por xmm3,[48+esp]
por xmm4,[64+esp]
pshufd xmm0,xmm0,141
pshufd xmm1,xmm1,141
pshufd xmm2,xmm2,141
pshufd xmm3,xmm3,141
pshufd xmm4,xmm4,141
movdqu [edi],xmm0
movdqu [16+edi],xmm1
movdqu [32+edi],xmm2
movdqu [48+edi],xmm3
movdqu [64+edi],xmm4
; Also store 5x multiples of rows 1..4 at the following four rows.
movdqa xmm6,xmm1
movdqa xmm5,xmm2
pslld xmm6,2
pslld xmm5,2
paddd xmm6,xmm1
paddd xmm5,xmm2
movdqu [80+edi],xmm6
movdqu [96+edi],xmm5
movdqa xmm6,xmm3
movdqa xmm5,xmm4
pslld xmm6,2
pslld xmm5,2
paddd xmm6,xmm3
paddd xmm5,xmm4
movdqu [112+edi],xmm6
movdqu [128+edi],xmm5
; Drop the scratch frame and hand edi back as the ctx pointer.
mov esp,ebp
lea edi,[edi-48]
ret
;-----------------------------------------------------------------------
; __poly1305_blocks_sse2 -- SSE2 block routine; same stack arguments as
; _poly1305_blocks (offsets valid after the four pushes):
;   [20+esp] ctx, [24+esp] inp, [28+esp] len, [32+esp] 4th argument
; The dword at ctx+20 is a format flag: zero means the accumulator at
; ctx+0..16 is still in the layout the scalar routine uses; once this
; routine has converted it to five 26-bit limbs the flag is set to 1.
; ebp/ebx/esi/edi are preserved; eax/ecx/edx, xmm0-xmm7 and flags are
; overwritten.
;-----------------------------------------------------------------------
align 32
align 16
__poly1305_blocks_sse2:
push ebp
push ebx
push esi
push edi
mov edi,DWORD [20+esp]
mov esi,DWORD [24+esp]
mov ecx,DWORD [28+esp]
mov eax,DWORD [20+edi]
; Round len down to whole 16-byte blocks. Fewer than 64 bytes with the
; flag still zero is handed to the scalar loop via L$enter_blocks.
and ecx,-16
jz NEAR L$007nodata
cmp ecx,64
jae NEAR L$008enter_sse2
test eax,eax
jz NEAR L$enter_blocks
align 16
L$008enter_sse2:
; call/pop + lea: ebx = run-time address of L$const_sse2.
call L$009pic_point
L$009pic_point:
pop ebx
lea ebx,[(L$const_sse2-L$009pic_point)+ebx]
test eax,eax
jnz NEAR L$010base2_26
; Flag was zero: build the table at ctx+48, then cut the accumulator
; into five pieces at bit offsets 0, 26, 52, 78, 104 using overlapping
; dword loads (byte offsets 0, 3, 6, 9, 13) plus shift and mask.
call __poly1305_init_sse2
mov eax,DWORD [edi]
mov ecx,DWORD [3+edi]
mov edx,DWORD [6+edi]
mov esi,DWORD [9+edi]
mov ebp,DWORD [13+edi]
mov DWORD [20+edi],1
shr ecx,2
and eax,67108863
shr edx,4
and ecx,67108863
shr esi,6
and edx,67108863
movd xmm0,eax
movd xmm1,ecx
movd xmm2,edx
movd xmm3,esi
movd xmm4,ebp
; esi and ecx were used as scratch above; reload inp and len.
mov esi,DWORD [24+esp]
mov ecx,DWORD [28+esp]
jmp NEAR L$011base2_32
align 16
L$010base2_26:
; Flag was nonzero: the five limbs are loaded as they are.
movd xmm0,DWORD [edi]
movd xmm1,DWORD [4+edi]
movd xmm2,DWORD [8+edi]
movd xmm3,DWORD [12+edi]
movd xmm4,DWORD [16+edi]
movdqa xmm7,[64+ebx]
L$011base2_32:
; eax = 4th argument shifted left by 24. Set up a 528-byte, 16-byte
; aligned frame (ebp keeps the old esp) and point edi at ctx+48.
mov eax,DWORD [32+esp]
mov ebp,esp
sub esp,528
and esp,-16
lea edi,[48+edi]
shl eax,24
; If bit 4 of len is set (odd number of 16-byte blocks), process one
; block here first so the rest can be taken in pairs.
test ecx,31
jz NEAR L$012even
movdqu xmm6,[esi]
lea esi,[16+esi]
movdqa xmm5,xmm6
pand xmm6,xmm7
paddd xmm0,xmm6
movdqa xmm6,xmm5
psrlq xmm5,26
psrldq xmm6,6
pand xmm5,xmm7
paddd xmm1,xmm5
movdqa xmm5,xmm6
psrlq xmm6,4
pand xmm6,xmm7
paddd xmm2,xmm6
movdqa xmm6,xmm5
psrlq xmm5,30
pand xmm5,xmm7
psrldq xmm6,7
paddd xmm3,xmm5
movd xmm5,eax
paddd xmm4,xmm6
movd xmm6,DWORD [12+edi]
paddd xmm4,xmm5
movdqa [esp],xmm0
movdqa [16+esp],xmm1
movdqa [32+esp],xmm2
movdqa [48+esp],xmm3
movdqa [64+esp],xmm4
; Multiply by the table entries read from dword 3 of each row at edi.
pmuludq xmm0,xmm6
pmuludq xmm1,xmm6
pmuludq xmm2,xmm6
movd xmm5,DWORD [28+edi]
pmuludq xmm3,xmm6
pmuludq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[48+esp]
movdqa xmm7,xmm6
pmuludq xmm6,[32+esp]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+esp]
paddq xmm3,xmm6
movd xmm6,DWORD [92+edi]
pmuludq xmm5,[esp]
paddq xmm2,xmm7
pmuludq xmm6,[64+esp]
movd xmm7,DWORD [44+edi]
paddq xmm1,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[32+esp]
paddq xmm0,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[16+esp]
paddq xmm4,xmm7
movd xmm7,DWORD [108+edi]
pmuludq xmm6,[esp]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[64+esp]
paddq xmm2,xmm6
pmuludq xmm5,[48+esp]
movd xmm6,DWORD [60+edi]
paddq xmm1,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[16+esp]
paddq xmm0,xmm5
movd xmm5,DWORD [124+edi]
pmuludq xmm7,[esp]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[64+esp]
paddq xmm3,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[48+esp]
paddq xmm2,xmm5
pmuludq xmm7,[32+esp]
movd xmm5,DWORD [76+edi]
paddq xmm1,xmm6
movd xmm6,DWORD [140+edi]
pmuludq xmm5,[esp]
paddq xmm0,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[64+esp]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+esp]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[32+esp]
paddq xmm0,xmm7
pmuludq xmm6,[48+esp]
; Carry propagation between the 26-bit limbs (mask from [64+ebx]).
movdqa xmm7,[64+ebx]
paddq xmm1,xmm5
paddq xmm2,xmm6
movdqa xmm5,xmm3
pand xmm3,xmm7
psrlq xmm5,26
paddq xmm5,xmm4
movdqa xmm6,xmm0
pand xmm0,xmm7
psrlq xmm6,26
movdqa xmm4,xmm5
paddq xmm6,xmm1
psrlq xmm5,26
pand xmm4,xmm7
movdqa xmm1,xmm6
psrlq xmm6,26
paddd xmm0,xmm5
psllq xmm5,2
paddq xmm6,xmm2
paddq xmm5,xmm0
pand xmm1,xmm7
movdqa xmm2,xmm6
psrlq xmm6,26
pand xmm2,xmm7
paddd xmm6,xmm3
movdqa xmm0,xmm5
psrlq xmm5,26
movdqa xmm3,xmm6
psrlq xmm6,26
pand xmm0,xmm7
paddd xmm1,xmm5
pand xmm3,xmm7
paddd xmm4,xmm6
sub ecx,16
jz NEAR L$013done
L$012even:
; Expand the nine table rows at edi into two on-stack tables: the
; pshufd-68 halves at [edx .. 128+edx], the pshufd-238 halves at
; [edx-144 .. edx-16], where edx = esp+384.
; The flags set by `sub ecx,64` stay live through the SSE instructions
; below (they do not write EFLAGS) and are consumed by `cmovb` here and
; by `jbe L$014skip_loop` at the end of this section.
lea edx,[384+esp]
lea eax,[esi-32]
sub ecx,64
movdqu xmm5,[edi]
pshufd xmm6,xmm5,68
cmovb esi,eax
pshufd xmm5,xmm5,238
movdqa [edx],xmm6
lea eax,[160+esp]
movdqu xmm6,[16+edi]
movdqa [edx-144],xmm5
pshufd xmm5,xmm6,68
pshufd xmm6,xmm6,238
movdqa [16+edx],xmm5
movdqu xmm5,[32+edi]
movdqa [edx-128],xmm6
pshufd xmm6,xmm5,68
pshufd xmm5,xmm5,238
movdqa [32+edx],xmm6
movdqu xmm6,[48+edi]
movdqa [edx-112],xmm5
pshufd xmm5,xmm6,68
pshufd xmm6,xmm6,238
movdqa [48+edx],xmm5
movdqu xmm5,[64+edi]
movdqa [edx-96],xmm6
pshufd xmm6,xmm5,68
pshufd xmm5,xmm5,238
movdqa [64+edx],xmm6
movdqu xmm6,[80+edi]
movdqa [edx-80],xmm5
pshufd xmm5,xmm6,68
pshufd xmm6,xmm6,238
movdqa [80+edx],xmm5
movdqu xmm5,[96+edi]
movdqa [edx-64],xmm6
pshufd xmm6,xmm5,68
pshufd xmm5,xmm5,238
movdqa [96+edx],xmm6
movdqu xmm6,[112+edi]
movdqa [edx-48],xmm5
pshufd xmm5,xmm6,68
pshufd xmm6,xmm6,238
movdqa [112+edx],xmm5
movdqu xmm5,[128+edi]
movdqa [edx-32],xmm6
pshufd xmm6,xmm5,68
pshufd xmm5,xmm5,238
movdqa [128+edx],xmm6
movdqa [edx-16],xmm5
; Load two input blocks and split them into 26-bit limbs, two blocks
; per register; the constant at [ebx] is ORed into the top limb. The
; current accumulator limbs are parked at [80..144+esp].
movdqu xmm5,[32+esi]
movdqu xmm6,[48+esi]
lea esi,[32+esi]
movdqa [112+esp],xmm2
movdqa [128+esp],xmm3
movdqa [144+esp],xmm4
movdqa xmm2,xmm5
movdqa xmm3,xmm6
psrldq xmm2,6
psrldq xmm3,6
movdqa xmm4,xmm5
punpcklqdq xmm2,xmm3
punpckhqdq xmm4,xmm6
punpcklqdq xmm5,xmm6
movdqa xmm3,xmm2
psrlq xmm2,4
psrlq xmm3,30
movdqa xmm6,xmm5
psrlq xmm4,40
psrlq xmm6,26
pand xmm5,xmm7
pand xmm6,xmm7
pand xmm2,xmm7
pand xmm3,xmm7
por xmm4,[ebx]
movdqa [80+esp],xmm0
movdqa [96+esp],xmm1
jbe NEAR L$014skip_loop
jmp NEAR L$015loop
align 32
L$015loop:
; Main loop, 64 input bytes per iteration. First half: multiply the
; limbs loaded ahead of time by the table at [edx-144 ..].
movdqa xmm7,[edx-144]
movdqa [16+eax],xmm6
movdqa [32+eax],xmm2
movdqa [48+eax],xmm3
movdqa [64+eax],xmm4
movdqa xmm1,xmm5
pmuludq xmm5,xmm7
movdqa xmm0,xmm6
pmuludq xmm6,xmm7
pmuludq xmm2,xmm7
pmuludq xmm3,xmm7
pmuludq xmm4,xmm7
pmuludq xmm0,[edx-16]
movdqa xmm7,xmm1
pmuludq xmm1,[edx-128]
paddq xmm0,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[edx-112]
paddq xmm1,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[edx-96]
paddq xmm2,xmm7
movdqa xmm7,[16+eax]
pmuludq xmm6,[edx-80]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[edx-128]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[edx-112]
paddq xmm2,xmm7
movdqa xmm7,[32+eax]
pmuludq xmm6,[edx-96]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[edx-32]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[edx-16]
paddq xmm0,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[edx-128]
paddq xmm1,xmm5
movdqa xmm5,[48+eax]
pmuludq xmm7,[edx-112]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[edx-48]
paddq xmm4,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[edx-32]
paddq xmm0,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[edx-16]
paddq xmm1,xmm6
movdqa xmm6,[64+eax]
pmuludq xmm5,[edx-128]
paddq xmm2,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[edx-16]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[edx-64]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[edx-48]
paddq xmm0,xmm7
movdqa xmm7,[64+ebx]
pmuludq xmm6,[edx-32]
paddq xmm1,xmm5
paddq xmm2,xmm6
; Second half: split the other two blocks of this iteration, add the
; parked accumulator limbs, and multiply by the table at [edx ..].
movdqu xmm5,[esi-32]
movdqu xmm6,[esi-16]
lea esi,[32+esi]
movdqa [32+esp],xmm2
movdqa [48+esp],xmm3
movdqa [64+esp],xmm4
movdqa xmm2,xmm5
movdqa xmm3,xmm6
psrldq xmm2,6
psrldq xmm3,6
movdqa xmm4,xmm5
punpcklqdq xmm2,xmm3
punpckhqdq xmm4,xmm6
punpcklqdq xmm5,xmm6
movdqa xmm3,xmm2
psrlq xmm2,4
psrlq xmm3,30
movdqa xmm6,xmm5
psrlq xmm4,40
psrlq xmm6,26
pand xmm5,xmm7
pand xmm6,xmm7
pand xmm2,xmm7
pand xmm3,xmm7
por xmm4,[ebx]
lea eax,[esi-32]
; Flags from this subtraction feed `cmovb` a few lines down and the
; `ja L$015loop` at the bottom of the loop; nothing in between writes
; EFLAGS.
sub ecx,64
paddd xmm5,[80+esp]
paddd xmm6,[96+esp]
paddd xmm2,[112+esp]
paddd xmm3,[128+esp]
paddd xmm4,[144+esp]
cmovb esi,eax
lea eax,[160+esp]
movdqa xmm7,[edx]
movdqa [16+esp],xmm1
movdqa [16+eax],xmm6
movdqa [32+eax],xmm2
movdqa [48+eax],xmm3
movdqa [64+eax],xmm4
movdqa xmm1,xmm5
pmuludq xmm5,xmm7
paddq xmm5,xmm0
movdqa xmm0,xmm6
pmuludq xmm6,xmm7
pmuludq xmm2,xmm7
pmuludq xmm3,xmm7
pmuludq xmm4,xmm7
paddq xmm6,[16+esp]
paddq xmm2,[32+esp]
paddq xmm3,[48+esp]
paddq xmm4,[64+esp]
pmuludq xmm0,[128+edx]
movdqa xmm7,xmm1
pmuludq xmm1,[16+edx]
paddq xmm0,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[32+edx]
paddq xmm1,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[48+edx]
paddq xmm2,xmm7
movdqa xmm7,[16+eax]
pmuludq xmm6,[64+edx]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+edx]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[32+edx]
paddq xmm2,xmm7
movdqa xmm7,[32+eax]
pmuludq xmm6,[48+edx]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[112+edx]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[128+edx]
paddq xmm0,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[16+edx]
paddq xmm1,xmm5
movdqa xmm5,[48+eax]
pmuludq xmm7,[32+edx]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[96+edx]
paddq xmm4,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[112+edx]
paddq xmm0,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[128+edx]
paddq xmm1,xmm6
movdqa xmm6,[64+eax]
pmuludq xmm5,[16+edx]
paddq xmm2,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[128+edx]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[80+edx]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[96+edx]
paddq xmm0,xmm7
movdqa xmm7,[64+ebx]
pmuludq xmm6,[112+edx]
paddq xmm1,xmm5
paddq xmm2,xmm6
; Carry propagation between the 26-bit limbs.
movdqa xmm5,xmm3
pand xmm3,xmm7
psrlq xmm5,26
paddq xmm5,xmm4
movdqa xmm6,xmm0
pand xmm0,xmm7
psrlq xmm6,26
movdqa xmm4,xmm5
paddq xmm6,xmm1
psrlq xmm5,26
pand xmm4,xmm7
movdqa xmm1,xmm6
psrlq xmm6,26
paddd xmm0,xmm5
psllq xmm5,2
paddq xmm6,xmm2
paddq xmm5,xmm0
pand xmm1,xmm7
movdqa xmm2,xmm6
psrlq xmm6,26
pand xmm2,xmm7
paddd xmm6,xmm3
movdqa xmm0,xmm5
psrlq xmm5,26
movdqa xmm3,xmm6
psrlq xmm6,26
pand xmm0,xmm7
paddd xmm1,xmm5
pand xmm3,xmm7
paddd xmm4,xmm6
; Load and split two blocks ahead for the next iteration (or the tail)
; and park the accumulator limbs again.
movdqu xmm5,[32+esi]
movdqu xmm6,[48+esi]
lea esi,[32+esi]
movdqa [112+esp],xmm2
movdqa [128+esp],xmm3
movdqa [144+esp],xmm4
movdqa xmm2,xmm5
movdqa xmm3,xmm6
psrldq xmm2,6
psrldq xmm3,6
movdqa xmm4,xmm5
punpcklqdq xmm2,xmm3
punpckhqdq xmm4,xmm6
punpcklqdq xmm5,xmm6
movdqa xmm3,xmm2
psrlq xmm2,4
psrlq xmm3,30
movdqa xmm6,xmm5
psrlq xmm4,40
psrlq xmm6,26
pand xmm5,xmm7
pand xmm6,xmm7
pand xmm2,xmm7
pand xmm3,xmm7
por xmm4,[ebx]
movdqa [80+esp],xmm0
movdqa [96+esp],xmm1
ja NEAR L$015loop
L$014skip_loop:
; Tail. After `add ecx,32`, zero means exactly 32 bytes are left: the
; accumulator is added to the limbs already loaded (fall-through).
; Nonzero skips that addition. The flags of this `add` are tested again
; by `jz L$017short_tail` further down.
pshufd xmm7,[edx-144],16
add ecx,32
jnz NEAR L$016long_tail
paddd xmm5,xmm0
paddd xmm6,xmm1
paddd xmm2,[112+esp]
paddd xmm3,[128+esp]
paddd xmm4,[144+esp]
L$016long_tail:
movdqa [eax],xmm5
movdqa [16+eax],xmm6
movdqa [32+eax],xmm2
movdqa [48+eax],xmm3
movdqa [64+eax],xmm4
pmuludq xmm5,xmm7
pmuludq xmm6,xmm7
pmuludq xmm2,xmm7
movdqa xmm0,xmm5
pshufd xmm5,[edx-128],16
pmuludq xmm3,xmm7
movdqa xmm1,xmm6
pmuludq xmm4,xmm7
movdqa xmm6,xmm5
pmuludq xmm5,[48+eax]
movdqa xmm7,xmm6
pmuludq xmm6,[32+eax]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+eax]
paddq xmm3,xmm6
pshufd xmm6,[edx-64],16
pmuludq xmm5,[eax]
paddq xmm2,xmm7
pmuludq xmm6,[64+eax]
pshufd xmm7,[edx-112],16
paddq xmm1,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[32+eax]
paddq xmm0,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[16+eax]
paddq xmm4,xmm7
pshufd xmm7,[edx-48],16
pmuludq xmm6,[eax]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[64+eax]
paddq xmm2,xmm6
pmuludq xmm5,[48+eax]
pshufd xmm6,[edx-96],16
paddq xmm1,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[16+eax]
paddq xmm0,xmm5
pshufd xmm5,[edx-32],16
pmuludq xmm7,[eax]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[64+eax]
paddq xmm3,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[48+eax]
paddq xmm2,xmm5
pmuludq xmm7,[32+eax]
pshufd xmm5,[edx-80],16
paddq xmm1,xmm6
pshufd xmm6,[edx-16],16
pmuludq xmm5,[eax]
paddq xmm0,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[64+eax]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+eax]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[32+eax]
paddq xmm0,xmm7
pmuludq xmm6,[48+eax]
movdqa xmm7,[64+ebx]
paddq xmm1,xmm5
paddq xmm2,xmm6
; ZF here still comes from `add ecx,32` above.
jz NEAR L$017short_tail
; Long tail: two more blocks remain; split them, add the parked
; accumulator, and multiply by the table at [edx ..].
movdqu xmm5,[esi-32]
movdqu xmm6,[esi-16]
lea esi,[32+esi]
movdqa [32+esp],xmm2
movdqa [48+esp],xmm3
movdqa [64+esp],xmm4
movdqa xmm2,xmm5
movdqa xmm3,xmm6
psrldq xmm2,6
psrldq xmm3,6
movdqa xmm4,xmm5
punpcklqdq xmm2,xmm3
punpckhqdq xmm4,xmm6
punpcklqdq xmm5,xmm6
movdqa xmm3,xmm2
psrlq xmm2,4
psrlq xmm3,30
movdqa xmm6,xmm5
psrlq xmm4,40
psrlq xmm6,26
pand xmm5,xmm7
pand xmm6,xmm7
pand xmm2,xmm7
pand xmm3,xmm7
por xmm4,[ebx]
pshufd xmm7,[edx],16
paddd xmm5,[80+esp]
paddd xmm6,[96+esp]
paddd xmm2,[112+esp]
paddd xmm3,[128+esp]
paddd xmm4,[144+esp]
movdqa [esp],xmm5
pmuludq xmm5,xmm7
movdqa [16+esp],xmm6
pmuludq xmm6,xmm7
paddq xmm0,xmm5
movdqa xmm5,xmm2
pmuludq xmm2,xmm7
paddq xmm1,xmm6
movdqa xmm6,xmm3
pmuludq xmm3,xmm7
paddq xmm2,[32+esp]
movdqa [32+esp],xmm5
pshufd xmm5,[16+edx],16
paddq xmm3,[48+esp]
movdqa [48+esp],xmm6
movdqa xmm6,xmm4
pmuludq xmm4,xmm7
paddq xmm4,[64+esp]
movdqa [64+esp],xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[48+esp]
movdqa xmm7,xmm6
pmuludq xmm6,[32+esp]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+esp]
paddq xmm3,xmm6
pshufd xmm6,[80+edx],16
pmuludq xmm5,[esp]
paddq xmm2,xmm7
pmuludq xmm6,[64+esp]
pshufd xmm7,[32+edx],16
paddq xmm1,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[32+esp]
paddq xmm0,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[16+esp]
paddq xmm4,xmm7
pshufd xmm7,[96+edx],16
pmuludq xmm6,[esp]
paddq xmm3,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[64+esp]
paddq xmm2,xmm6
pmuludq xmm5,[48+esp]
pshufd xmm6,[48+edx],16
paddq xmm1,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[16+esp]
paddq xmm0,xmm5
pshufd xmm5,[112+edx],16
pmuludq xmm7,[esp]
paddq xmm4,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[64+esp]
paddq xmm3,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[48+esp]
paddq xmm2,xmm5
pmuludq xmm7,[32+esp]
pshufd xmm5,[64+edx],16
paddq xmm1,xmm6
pshufd xmm6,[128+edx],16
pmuludq xmm5,[esp]
paddq xmm0,xmm7
movdqa xmm7,xmm6
pmuludq xmm6,[64+esp]
paddq xmm4,xmm5
movdqa xmm5,xmm7
pmuludq xmm7,[16+esp]
paddq xmm3,xmm6
movdqa xmm6,xmm5
pmuludq xmm5,[32+esp]
paddq xmm0,xmm7
pmuludq xmm6,[48+esp]
movdqa xmm7,[64+ebx]
paddq xmm1,xmm5
paddq xmm2,xmm6
L$017short_tail:
; Horizontal step: pshufd 78 swaps the two qwords, so each paddq adds
; the two lanes together. Then one more carry propagation.
pshufd xmm6,xmm4,78
pshufd xmm5,xmm3,78
paddq xmm4,xmm6
paddq xmm3,xmm5
pshufd xmm6,xmm0,78
pshufd xmm5,xmm1,78
paddq xmm0,xmm6
paddq xmm1,xmm5
pshufd xmm6,xmm2,78
movdqa xmm5,xmm3
pand xmm3,xmm7
psrlq xmm5,26
paddq xmm2,xmm6
paddq xmm5,xmm4
movdqa xmm6,xmm0
pand xmm0,xmm7
psrlq xmm6,26
movdqa xmm4,xmm5
paddq xmm6,xmm1
psrlq xmm5,26
pand xmm4,xmm7
movdqa xmm1,xmm6
psrlq xmm6,26
paddd xmm0,xmm5
psllq xmm5,2
paddq xmm6,xmm2
paddq xmm5,xmm0
pand xmm1,xmm7
movdqa xmm2,xmm6
psrlq xmm6,26
pand xmm2,xmm7
paddd xmm6,xmm3
movdqa xmm0,xmm5
psrlq xmm5,26
movdqa xmm3,xmm6
psrlq xmm6,26
pand xmm0,xmm7
paddd xmm1,xmm5
pand xmm3,xmm7
paddd xmm4,xmm6
L$013done:
; Write the five limbs back to ctx+0..16 (edi is ctx+48 here) and
; release the aligned frame.
movd DWORD [edi-48],xmm0
movd DWORD [edi-44],xmm1
movd DWORD [edi-40],xmm2
movd DWORD [edi-36],xmm3
movd DWORD [edi-32],xmm4
mov esp,ebp
L$007nodata:
pop edi
pop esi
pop ebx
pop ebp
ret
;-----------------------------------------------------------------------
; __poly1305_emit_sse2 -- emit routine paired with the SIMD block
; routines; same stack arguments as _poly1305_emit (after the pushes):
;   [20+esp] ctx, [24+esp] 16-byte output, [28+esp] four addend dwords
; If the format flag at ctx+20 is zero it jumps into the scalar routine
; at L$enter_emit (ebp = ctx, identical stack layout). Otherwise the
; five 26-bit limbs at ctx+0..16 are first packed into 32-bit words.
; ebp/ebx/esi/edi are preserved; eax/ecx/edx, xmm0-xmm3 and flags are
; overwritten.
;-----------------------------------------------------------------------
align 32
align 16
__poly1305_emit_sse2:
push ebp
push ebx
push esi
push edi
mov ebp,DWORD [20+esp]
cmp DWORD [20+ebp],0
je NEAR L$enter_emit
mov eax,DWORD [ebp]
mov edi,DWORD [4+ebp]
mov ecx,DWORD [8+ebp]
mov edx,DWORD [12+ebp]
mov esi,DWORD [16+ebp]
; Pack limbs placed 26 bits apart into 32-bit words: each limb is split
; by a shl/shr pair (26/6, 20/12, 14/18, 8/24) and the pieces are added
; into neighbouring words with carry.
mov ebx,edi
shl edi,26
shr ebx,6
add eax,edi
mov edi,ecx
adc ebx,0
shl edi,20
shr ecx,12
add ebx,edi
mov edi,edx
adc ecx,0
shl edi,14
shr edx,18
add ecx,edi
mov edi,esi
adc edx,0
shl edi,8
shr esi,24
add edx,edi
adc esi,0
; Fold the top word: keep its low 2 bits, add 5 * (top >> 2) to word 0.
; edi and ebp are reloaded with the output and addend pointers here.
mov edi,esi
and esi,3
shr edi,2
lea ebp,[edi*4+edi]
mov edi,DWORD [24+esp]
add eax,ebp
mov ebp,DWORD [28+esp]
adc ebx,0
adc ecx,0
adc edx,0
adc esi,0
; Keep the unmodified words in xmm0..xmm3 while computing value + 5 in
; the integer registers (movd does not disturb the carry chain).
movd xmm0,eax
add eax,5
movd xmm1,ebx
adc ebx,0
movd xmm2,ecx
adc ecx,0
movd xmm3,edx
adc edx,0
adc esi,0
; esi = -(top word >> 2): AND mask that selects value + 5.
shr esi,2
neg esi
and eax,esi
and ebx,esi
and ecx,esi
and edx,esi
mov DWORD [edi],eax
movd eax,xmm0
mov DWORD [4+edi],ebx
movd ebx,xmm1
mov DWORD [8+edi],ecx
movd ecx,xmm2
mov DWORD [12+edi],edx
movd edx,xmm3
; Complemented mask selects the unmodified value; OR merges the halves.
not esi
and eax,esi
and ebx,esi
or eax,DWORD [edi]
and ecx,esi
or ebx,DWORD [4+edi]
and edx,esi
or ecx,DWORD [8+edi]
or edx,DWORD [12+edi]
; Add the four addend words with carry (the mov stores in between do
; not touch the carry flag) and write the 16-byte result.
add eax,DWORD [ebp]
adc ebx,DWORD [4+ebp]
mov DWORD [edi],eax
adc ecx,DWORD [8+ebp]
mov DWORD [4+edi],ebx
adc edx,DWORD [12+ebp]
mov DWORD [8+edi],ecx
mov DWORD [12+edi],edx
pop edi
pop esi
pop ebx
pop ebp
ret
;-----------------------------------------------------------------------
; __poly1305_init_avx2 -- VEX-encoded counterpart of __poly1305_init_sse2,
; reached by `call` from __poly1305_blocks_avx2.
;   In:   edi = ctx (16 bytes are read from ctx+24)
;         ebx = address of L$const_sse2 (mask is read from [64+ebx])
;   Out:  nine 16-byte rows written to ctx+48 .. ctx+191
;         edi and esp are restored before returning
;   Uses: a 224-byte, 16-byte-aligned scratch frame; ebp keeps the
;         caller's esp meanwhile. Overwrites ebp, ecx, edx, xmm0-xmm7, flags.
;-----------------------------------------------------------------------
align 32
align 16
__poly1305_init_avx2:
vmovdqu xmm4,[24+edi]
lea edi,[48+edi]
mov ebp,esp
sub esp,224
and esp,-16
; Split the 128-bit value into five limbs at bit offsets 0, 26, 52, 78
; and 104; the first four are masked with the 26-bit mask from [64+ebx].
vmovdqa xmm7,[64+ebx]
vpand xmm0,xmm4,xmm7
vpsrlq xmm1,xmm4,26
vpsrldq xmm3,xmm4,6
vpand xmm1,xmm1,xmm7
vpsrlq xmm2,xmm3,4
vpsrlq xmm3,xmm3,30
vpand xmm2,xmm2,xmm7
vpand xmm3,xmm3,xmm7
vpsrldq xmm4,xmm4,13
lea edx,[144+esp]
; ecx counts the passes through L$018square (two at most).
mov ecx,2
L$018square:
; Save the limbs at [esp..64+esp] and limb*5 for limbs 1..4 at
; [80..128+esp].
vmovdqa [esp],xmm0
vmovdqa [16+esp],xmm1
vmovdqa [32+esp],xmm2
vmovdqa [48+esp],xmm3
vmovdqa [64+esp],xmm4
vpslld xmm6,xmm1,2
vpslld xmm5,xmm2,2
vpaddd xmm6,xmm6,xmm1
vpaddd xmm5,xmm5,xmm2
vmovdqa [80+esp],xmm6
vmovdqa [96+esp],xmm5
vpslld xmm6,xmm3,2
vpslld xmm5,xmm4,2
vpaddd xmm6,xmm6,xmm3
vpaddd xmm5,xmm5,xmm4
vmovdqa [112+esp],xmm6
vmovdqa [128+esp],xmm5
; vpshufd 68 copies the low qword into both halves; the broadcast limbs
; go to [edx..64+edx] and serve as the second multiplicand.
vpshufd xmm5,xmm0,68
vmovdqa xmm6,xmm1
vpshufd xmm1,xmm1,68
vpshufd xmm2,xmm2,68
vpshufd xmm3,xmm3,68
vpshufd xmm4,xmm4,68
vmovdqa [edx],xmm5
vmovdqa [16+edx],xmm1
vmovdqa [32+edx],xmm2
vmovdqa [48+edx],xmm3
vmovdqa [64+edx],xmm4
; 5x5 limb multiplication; sums accumulate in xmm0..xmm4.
vpmuludq xmm4,xmm4,xmm0
vpmuludq xmm3,xmm3,xmm0
vpmuludq xmm2,xmm2,xmm0
vpmuludq xmm1,xmm1,xmm0
vpmuludq xmm0,xmm5,xmm0
vpmuludq xmm5,xmm6,[48+edx]
vpaddq xmm4,xmm4,xmm5
vpmuludq xmm7,xmm6,[32+edx]
vpaddq xmm3,xmm3,xmm7
vpmuludq xmm5,xmm6,[16+edx]
vpaddq xmm2,xmm2,xmm5
vmovdqa xmm7,[80+esp]
vpmuludq xmm6,xmm6,[edx]
vpaddq xmm1,xmm1,xmm6
vmovdqa xmm5,[32+esp]
vpmuludq xmm7,xmm7,[64+edx]
vpaddq xmm0,xmm0,xmm7
vpmuludq xmm6,xmm5,[32+edx]
vpaddq xmm4,xmm4,xmm6
vpmuludq xmm7,xmm5,[16+edx]
vpaddq xmm3,xmm3,xmm7
vmovdqa xmm6,[96+esp]
vpmuludq xmm5,xmm5,[edx]
vpaddq xmm2,xmm2,xmm5
vpmuludq xmm7,xmm6,[64+edx]
vpaddq xmm1,xmm1,xmm7
vmovdqa xmm5,[48+esp]
vpmuludq xmm6,xmm6,[48+edx]
vpaddq xmm0,xmm0,xmm6
vpmuludq xmm7,xmm5,[16+edx]
vpaddq xmm4,xmm4,xmm7
vmovdqa xmm6,[112+esp]
vpmuludq xmm5,xmm5,[edx]
vpaddq xmm3,xmm3,xmm5
vpmuludq xmm7,xmm6,[64+edx]
vpaddq xmm2,xmm2,xmm7
vpmuludq xmm5,xmm6,[48+edx]
vpaddq xmm1,xmm1,xmm5
vmovdqa xmm7,[64+esp]
vpmuludq xmm6,xmm6,[32+edx]
vpaddq xmm0,xmm0,xmm6
vmovdqa xmm5,[128+esp]
vpmuludq xmm7,xmm7,[edx]
vpaddq xmm4,xmm4,xmm7
vpmuludq xmm6,xmm5,[64+edx]
vpaddq xmm3,xmm3,xmm6
vpmuludq xmm7,xmm5,[16+edx]
vpaddq xmm0,xmm0,xmm7
vpmuludq xmm6,xmm5,[32+edx]
vpaddq xmm1,xmm1,xmm6
; Reload the 26-bit mask and propagate carries between limbs; the carry
; out of the top limb is added to limb 0 once as-is and once shifted
; left by 2 (i.e. times 5 in total).
vmovdqa xmm7,[64+ebx]
vpmuludq xmm5,xmm5,[48+edx]
vpaddq xmm2,xmm2,xmm5
vpsrlq xmm5,xmm3,26
vpand xmm3,xmm3,xmm7
vpsrlq xmm6,xmm0,26
vpand xmm0,xmm0,xmm7
vpaddq xmm4,xmm4,xmm5
vpaddq xmm1,xmm1,xmm6
vpsrlq xmm5,xmm4,26
vpand xmm4,xmm4,xmm7
vpsrlq xmm6,xmm1,26
vpand xmm1,xmm1,xmm7
vpaddq xmm2,xmm2,xmm6
vpaddd xmm0,xmm0,xmm5
vpsllq xmm5,xmm5,2
vpsrlq xmm6,xmm2,26
vpand xmm2,xmm2,xmm7
vpaddd xmm0,xmm0,xmm5
vpaddd xmm3,xmm3,xmm6
vpsrlq xmm6,xmm3,26
vpsrlq xmm5,xmm0,26
vpand xmm0,xmm0,xmm7
vpand xmm3,xmm3,xmm7
vpaddd xmm1,xmm1,xmm5
vpaddd xmm4,xmm4,xmm6
dec ecx
jz NEAR L$019square_break
; First pass finished: pair the new limbs (low qword) with the limbs
; saved at [esp..] (high qword) and run the multiplication once more.
vpunpcklqdq xmm0,xmm0,[esp]
vpunpcklqdq xmm1,xmm1,[16+esp]
vpunpcklqdq xmm2,xmm2,[32+esp]
vpunpcklqdq xmm3,xmm3,[48+esp]
vpunpcklqdq xmm4,xmm4,[64+esp]
jmp NEAR L$018square
L$019square_break:
; Merge the second-pass results with the saved limbs into dword lanes,
; reorder the lanes with vpshufd 141, and store the five rows at ctx+48.
vpsllq xmm0,xmm0,32
vpsllq xmm1,xmm1,32
vpsllq xmm2,xmm2,32
vpsllq xmm3,xmm3,32
vpsllq xmm4,xmm4,32
vpor xmm0,xmm0,[esp]
vpor xmm1,xmm1,[16+esp]
vpor xmm2,xmm2,[32+esp]
vpor xmm3,xmm3,[48+esp]
vpor xmm4,xmm4,[64+esp]
vpshufd xmm0,xmm0,141
vpshufd xmm1,xmm1,141
vpshufd xmm2,xmm2,141
vpshufd xmm3,xmm3,141
vpshufd xmm4,xmm4,141
vmovdqu [edi],xmm0
vmovdqu [16+edi],xmm1
vmovdqu [32+edi],xmm2
vmovdqu [48+edi],xmm3
vmovdqu [64+edi],xmm4
; Also store 5x multiples of rows 1..4 at the following four rows.
vpslld xmm6,xmm1,2
vpslld xmm5,xmm2,2
vpaddd xmm6,xmm6,xmm1
vpaddd xmm5,xmm5,xmm2
vmovdqu [80+edi],xmm6
vmovdqu [96+edi],xmm5
vpslld xmm6,xmm3,2
vpslld xmm5,xmm4,2
vpaddd xmm6,xmm6,xmm3
vpaddd xmm5,xmm5,xmm4
vmovdqu [112+edi],xmm6
vmovdqu [128+edi],xmm5
; Drop the scratch frame and hand edi back as the ctx pointer.
mov esp,ebp
lea edi,[edi-48]
ret
;-----------------------------------------------------------------------
; __poly1305_blocks_avx2 -- AVX2 block routine; same stack arguments as
; _poly1305_blocks (offsets valid after the four pushes):
;   [20+esp] ctx, [24+esp] inp, [28+esp] len, [32+esp] 4th argument
; Uses the format flag at ctx+20 the same way as the SSE2 routine and
; processes up to four 16-byte blocks per step in ymm registers.
; ebp/ebx/esi/edi are preserved; eax/ecx/edx, ymm0-ymm7 and flags are
; overwritten. vzeroupper is issued on entry to and exit from the AVX path.
;-----------------------------------------------------------------------
align 32
align 16
__poly1305_blocks_avx2:
push ebp
push ebx
push esi
push edi
mov edi,DWORD [20+esp]
mov esi,DWORD [24+esp]
mov ecx,DWORD [28+esp]
mov eax,DWORD [20+edi]
; Round len down to whole 16-byte blocks. Fewer than 64 bytes with the
; flag still zero is handed to the scalar loop via L$enter_blocks.
and ecx,-16
jz NEAR L$020nodata
cmp ecx,64
jae NEAR L$021enter_avx2
test eax,eax
jz NEAR L$enter_blocks
L$021enter_avx2:
vzeroupper
; call/pop + lea: ebx = run-time address of L$const_sse2.
call L$022pic_point
L$022pic_point:
pop ebx
lea ebx,[(L$const_sse2-L$022pic_point)+ebx]
test eax,eax
jnz NEAR L$023base2_26
; Flag was zero: build the table at ctx+48, then rewrite the
; accumulator in place as five pieces cut at bit offsets 0, 26, 52, 78,
; 104 (overlapping dword loads at byte offsets 0, 3, 6, 9, 13) and set
; the flag to 1.
call __poly1305_init_avx2
mov eax,DWORD [edi]
mov ecx,DWORD [3+edi]
mov edx,DWORD [6+edi]
mov esi,DWORD [9+edi]
mov ebp,DWORD [13+edi]
shr ecx,2
and eax,67108863
shr edx,4
and ecx,67108863
shr esi,6
and edx,67108863
mov DWORD [edi],eax
mov DWORD [4+edi],ecx
mov DWORD [8+edi],edx
mov DWORD [12+edi],esi
mov DWORD [16+edi],ebp
mov DWORD [20+edi],1
; esi and ecx were used as scratch above; reload inp and len.
mov esi,DWORD [24+esp]
mov ecx,DWORD [28+esp]
L$023base2_26:
; eax = 4th argument. Frame: esp lowered by 448 and then rounded down
; to a 512-byte boundary; ebp keeps the old esp. edx = esp+288 is the
; base of the on-stack table.
mov eax,DWORD [32+esp]
mov ebp,esp
sub esp,448
and esp,-512
; Copy the nine 16-byte table rows from ctx+48.. into 32-byte stack
; slots at [edx-128 .. 128+edx], rearranging the dwords of each row
; with vpermq 64 + vpshufd 200.
vmovdqu xmm0,[48+edi]
lea edx,[288+esp]
vmovdqu xmm1,[64+edi]
vmovdqu xmm2,[80+edi]
vmovdqu xmm3,[96+edi]
vmovdqu xmm4,[112+edi]
lea edi,[48+edi]
vpermq ymm0,ymm0,64
vpermq ymm1,ymm1,64
vpermq ymm2,ymm2,64
vpermq ymm3,ymm3,64
vpermq ymm4,ymm4,64
vpshufd ymm0,ymm0,200
vpshufd ymm1,ymm1,200
vpshufd ymm2,ymm2,200
vpshufd ymm3,ymm3,200
vpshufd ymm4,ymm4,200
vmovdqa [edx-128],ymm0
vmovdqu xmm0,[80+edi]
vmovdqa [edx-96],ymm1
vmovdqu xmm1,[96+edi]
vmovdqa [edx-64],ymm2
vmovdqu xmm2,[112+edi]
vmovdqa [edx-32],ymm3
vmovdqu xmm3,[128+edi]
vmovdqa [edx],ymm4
vpermq ymm0,ymm0,64
vpermq ymm1,ymm1,64
vpermq ymm2,ymm2,64
vpermq ymm3,ymm3,64
vpshufd ymm0,ymm0,200
vpshufd ymm1,ymm1,200
vpshufd ymm2,ymm2,200
vpshufd ymm3,ymm3,200
vmovdqa [32+edx],ymm0
; Load the five accumulator limbs from ctx+0..16 (edi is ctx+48 now)
; and the 26-bit mask.
vmovd xmm0,DWORD [edi-48]
vmovdqa [64+edx],ymm1
vmovd xmm1,DWORD [edi-44]
vmovdqa [96+edx],ymm2
vmovd xmm2,DWORD [edi-40]
vmovdqa [128+edx],ymm3
vmovd xmm3,DWORD [edi-36]
vmovd xmm4,DWORD [edi-32]
vmovdqa ymm7,[64+ebx]
neg eax
; If len is not a multiple of 64, the leftover 1-3 blocks (edx = len & 63)
; go through L$027tail first; ebx and edx are advanced so that the
; constant row and the table are read at shifted offsets.
test ecx,63
jz NEAR L$024even
mov edx,ecx
and ecx,-64
and edx,63
vmovdqu xmm5,[esi]
; The flags of this cmp serve both `jb` and the later `je`; the vmovdqu
; in between does not modify EFLAGS.
cmp edx,32
jb NEAR L$025one
vmovdqu xmm6,[16+esi]
je NEAR L$026two
; Three leftover blocks.
vinserti128 ymm5,ymm5,[32+esi],1
lea esi,[48+esi]
lea ebx,[8+ebx]
lea edx,[296+esp]
jmp NEAR L$027tail
L$026two:
; Two leftover blocks.
lea esi,[32+esi]
lea ebx,[16+ebx]
lea edx,[304+esp]
jmp NEAR L$027tail
L$025one:
; One leftover block; eax (negated 4th argument) scales the ebx offset.
lea esi,[16+esi]
vpxor ymm6,ymm6,ymm6
lea ebx,[32+eax*8+ebx]
lea edx,[312+esp]
jmp NEAR L$027tail
align 32
L$024even:
; Load four blocks (64 bytes); if they are the last ones go to the tail.
vmovdqu xmm5,[esi]
vmovdqu xmm6,[16+esi]
vinserti128 ymm5,ymm5,[32+esi],1
vinserti128 ymm6,ymm6,[48+esi],1
lea esi,[64+esi]
sub ecx,64
jz NEAR L$027tail
L$028loop:
; Split the four loaded blocks into 26-bit limbs, OR the constant at
; [ebx] into the top limb, and add the accumulator limbs.
vmovdqa [64+esp],ymm2
vpsrldq ymm2,ymm5,6
vmovdqa [esp],ymm0
vpsrldq ymm0,ymm6,6
vmovdqa [32+esp],ymm1
vpunpckhqdq ymm1,ymm5,ymm6
vpunpcklqdq ymm5,ymm5,ymm6
vpunpcklqdq ymm2,ymm2,ymm0
vpsrlq ymm0,ymm2,30
vpsrlq ymm2,ymm2,4
vpsrlq ymm6,ymm5,26
vpsrlq ymm1,ymm1,40
vpand ymm2,ymm2,ymm7
vpand ymm5,ymm5,ymm7
vpand ymm6,ymm6,ymm7
vpand ymm0,ymm0,ymm7
vpor ymm1,ymm1,[ebx]
vpaddq ymm2,ymm2,[64+esp]
vpaddq ymm5,ymm5,[esp]
vpaddq ymm6,ymm6,[32+esp]
vpaddq ymm0,ymm0,ymm3
vpaddq ymm1,ymm1,ymm4
; Multiply by the table rows around edx; sums accumulate in ymm0..ymm4.
vpmuludq ymm3,ymm2,[edx-96]
vmovdqa [32+esp],ymm6
vpmuludq ymm4,ymm2,[edx-64]
vmovdqa [96+esp],ymm0
vpmuludq ymm0,ymm2,[96+edx]
vmovdqa [128+esp],ymm1
vpmuludq ymm1,ymm2,[128+edx]
vpmuludq ymm2,ymm2,[edx-128]
vpmuludq ymm7,ymm5,[edx-32]
vpaddq ymm3,ymm3,ymm7
vpmuludq ymm6,ymm5,[edx]
vpaddq ymm4,ymm4,ymm6
vpmuludq ymm7,ymm5,[edx-128]
vpaddq ymm0,ymm0,ymm7
vmovdqa ymm7,[32+esp]
vpmuludq ymm6,ymm5,[edx-96]
vpaddq ymm1,ymm1,ymm6
vpmuludq ymm5,ymm5,[edx-64]
vpaddq ymm2,ymm2,ymm5
vpmuludq ymm6,ymm7,[edx-64]
vpaddq ymm3,ymm3,ymm6
vpmuludq ymm5,ymm7,[edx-32]
vpaddq ymm4,ymm4,ymm5
vpmuludq ymm6,ymm7,[128+edx]
vpaddq ymm0,ymm0,ymm6
vmovdqa ymm6,[96+esp]
vpmuludq ymm5,ymm7,[edx-128]
vpaddq ymm1,ymm1,ymm5
vpmuludq ymm7,ymm7,[edx-96]
vpaddq ymm2,ymm2,ymm7
vpmuludq ymm5,ymm6,[edx-128]
vpaddq ymm3,ymm3,ymm5
vpmuludq ymm7,ymm6,[edx-96]
vpaddq ymm4,ymm4,ymm7
vpmuludq ymm5,ymm6,[64+edx]
vpaddq ymm0,ymm0,ymm5
vmovdqa ymm5,[128+esp]
vpmuludq ymm7,ymm6,[96+edx]
vpaddq ymm1,ymm1,ymm7
vpmuludq ymm6,ymm6,[128+edx]
vpaddq ymm2,ymm2,ymm6
vpmuludq ymm7,ymm5,[128+edx]
vpaddq ymm3,ymm3,ymm7
vpmuludq ymm6,ymm5,[32+edx]
vpaddq ymm0,ymm0,ymm6
vpmuludq ymm7,ymm5,[edx-128]
vpaddq ymm4,ymm4,ymm7
vmovdqa ymm7,[64+ebx]
vpmuludq ymm6,ymm5,[64+edx]
vpaddq ymm1,ymm1,ymm6
vpmuludq ymm5,ymm5,[96+edx]
vpaddq ymm2,ymm2,ymm5
; Carry propagation between the 26-bit limbs (mask in ymm7).
vpsrlq ymm5,ymm3,26
vpand ymm3,ymm3,ymm7
vpsrlq ymm6,ymm0,26
vpand ymm0,ymm0,ymm7
vpaddq ymm4,ymm4,ymm5
vpaddq ymm1,ymm1,ymm6
vpsrlq ymm5,ymm4,26
vpand ymm4,ymm4,ymm7
vpsrlq ymm6,ymm1,26
vpand ymm1,ymm1,ymm7
vpaddq ymm2,ymm2,ymm6
vpaddq ymm0,ymm0,ymm5
vpsllq ymm5,ymm5,2
vpsrlq ymm6,ymm2,26
vpand ymm2,ymm2,ymm7
vpaddq ymm0,ymm0,ymm5
vpaddq ymm3,ymm3,ymm6
vpsrlq ymm6,ymm3,26
vpsrlq ymm5,ymm0,26
vpand ymm0,ymm0,ymm7
vpand ymm3,ymm3,ymm7
vpaddq ymm1,ymm1,ymm5
vpaddq ymm4,ymm4,ymm6
; Load the next four blocks; loop while more than these remain.
vmovdqu xmm5,[esi]
vmovdqu xmm6,[16+esi]
vinserti128 ymm5,ymm5,[32+esi],1
vinserti128 ymm6,ymm6,[48+esi],1
lea esi,[64+esi]
sub ecx,64
jnz NEAR L$028loop
L$027tail:
; Same split/add/multiply schedule as L$028loop, except that the table
; operands are read 4 bytes further on ([edx-92] instead of [edx-96],
; and so on) and ebx/edx may have been advanced for a partial group.
vmovdqa [64+esp],ymm2
vpsrldq ymm2,ymm5,6
vmovdqa [esp],ymm0
vpsrldq ymm0,ymm6,6
vmovdqa [32+esp],ymm1
vpunpckhqdq ymm1,ymm5,ymm6
vpunpcklqdq ymm5,ymm5,ymm6
vpunpcklqdq ymm2,ymm2,ymm0
vpsrlq ymm0,ymm2,30
vpsrlq ymm2,ymm2,4
vpsrlq ymm6,ymm5,26
vpsrlq ymm1,ymm1,40
vpand ymm2,ymm2,ymm7
vpand ymm5,ymm5,ymm7
vpand ymm6,ymm6,ymm7
vpand ymm0,ymm0,ymm7
vpor ymm1,ymm1,[ebx]
; Undo any partial-group offset: the constant table is 64-byte aligned,
; so clearing the low six bits returns ebx to its base.
and ebx,-64
vpaddq ymm2,ymm2,[64+esp]
vpaddq ymm5,ymm5,[esp]
vpaddq ymm6,ymm6,[32+esp]
vpaddq ymm0,ymm0,ymm3
vpaddq ymm1,ymm1,ymm4
vpmuludq ymm3,ymm2,[edx-92]
vmovdqa [32+esp],ymm6
vpmuludq ymm4,ymm2,[edx-60]
vmovdqa [96+esp],ymm0
vpmuludq ymm0,ymm2,[100+edx]
vmovdqa [128+esp],ymm1
vpmuludq ymm1,ymm2,[132+edx]
vpmuludq ymm2,ymm2,[edx-124]
vpmuludq ymm7,ymm5,[edx-28]
vpaddq ymm3,ymm3,ymm7
vpmuludq ymm6,ymm5,[4+edx]
vpaddq ymm4,ymm4,ymm6
vpmuludq ymm7,ymm5,[edx-124]
vpaddq ymm0,ymm0,ymm7
vmovdqa ymm7,[32+esp]
vpmuludq ymm6,ymm5,[edx-92]
vpaddq ymm1,ymm1,ymm6
vpmuludq ymm5,ymm5,[edx-60]
vpaddq ymm2,ymm2,ymm5
vpmuludq ymm6,ymm7,[edx-60]
vpaddq ymm3,ymm3,ymm6
vpmuludq ymm5,ymm7,[edx-28]
vpaddq ymm4,ymm4,ymm5
vpmuludq ymm6,ymm7,[132+edx]
vpaddq ymm0,ymm0,ymm6
vmovdqa ymm6,[96+esp]
vpmuludq ymm5,ymm7,[edx-124]
vpaddq ymm1,ymm1,ymm5
vpmuludq ymm7,ymm7,[edx-92]
vpaddq ymm2,ymm2,ymm7
vpmuludq ymm5,ymm6,[edx-124]
vpaddq ymm3,ymm3,ymm5
vpmuludq ymm7,ymm6,[edx-92]
vpaddq ymm4,ymm4,ymm7
vpmuludq ymm5,ymm6,[68+edx]
vpaddq ymm0,ymm0,ymm5
vmovdqa ymm5,[128+esp]
vpmuludq ymm7,ymm6,[100+edx]
vpaddq ymm1,ymm1,ymm7
vpmuludq ymm6,ymm6,[132+edx]
vpaddq ymm2,ymm2,ymm6
vpmuludq ymm7,ymm5,[132+edx]
vpaddq ymm3,ymm3,ymm7
vpmuludq ymm6,ymm5,[36+edx]
vpaddq ymm0,ymm0,ymm6
vpmuludq ymm7,ymm5,[edx-124]
vpaddq ymm4,ymm4,ymm7
vmovdqa ymm7,[64+ebx]
vpmuludq ymm6,ymm5,[68+edx]
vpaddq ymm1,ymm1,ymm6
vpmuludq ymm5,ymm5,[100+edx]
vpaddq ymm2,ymm2,ymm5
; Horizontal sum: add the upper qword of each 128-bit lane (vpsrldq 8),
; then add the upper lane to the lower one (vpermq 2).
vpsrldq ymm5,ymm4,8
vpsrldq ymm6,ymm3,8
vpaddq ymm4,ymm4,ymm5
vpsrldq ymm5,ymm0,8
vpaddq ymm3,ymm3,ymm6
vpsrldq ymm6,ymm1,8
vpaddq ymm0,ymm0,ymm5
vpsrldq ymm5,ymm2,8
vpaddq ymm1,ymm1,ymm6
vpermq ymm6,ymm4,2
vpaddq ymm2,ymm2,ymm5
vpermq ymm5,ymm3,2
vpaddq ymm4,ymm4,ymm6
vpermq ymm6,ymm0,2
vpaddq ymm3,ymm3,ymm5
vpermq ymm5,ymm1,2
vpaddq ymm0,ymm0,ymm6
vpermq ymm6,ymm2,2
vpaddq ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm6
; Carry propagation between the 26-bit limbs.
vpsrlq ymm5,ymm3,26
vpand ymm3,ymm3,ymm7
vpsrlq ymm6,ymm0,26
vpand ymm0,ymm0,ymm7
vpaddq ymm4,ymm4,ymm5
vpaddq ymm1,ymm1,ymm6
vpsrlq ymm5,ymm4,26
vpand ymm4,ymm4,ymm7
vpsrlq ymm6,ymm1,26
vpand ymm1,ymm1,ymm7
vpaddq ymm2,ymm2,ymm6
vpaddq ymm0,ymm0,ymm5
vpsllq ymm5,ymm5,2
vpsrlq ymm6,ymm2,26
vpand ymm2,ymm2,ymm7
vpaddq ymm0,ymm0,ymm5
vpaddq ymm3,ymm3,ymm6
vpsrlq ymm6,ymm3,26
vpsrlq ymm5,ymm0,26
vpand ymm0,ymm0,ymm7
vpand ymm3,ymm3,ymm7
vpaddq ymm1,ymm1,ymm5
vpaddq ymm4,ymm4,ymm6
; ecx != 0 means the tail handled only the leading partial group:
; rearrange the limb dwords with vpshufd 252, restore edx to the table
; base, and continue with full 64-byte groups.
cmp ecx,0
je NEAR L$029done
vpshufd xmm0,xmm0,252
lea edx,[288+esp]
vpshufd xmm1,xmm1,252
vpshufd xmm2,xmm2,252
vpshufd xmm3,xmm3,252
vpshufd xmm4,xmm4,252
jmp NEAR L$024even
align 16
L$029done:
; Write the five limbs back to ctx+0..16 (edi is ctx+48 here), clear
; the upper ymm halves, and release the frame.
vmovd DWORD [edi-48],xmm0
vmovd DWORD [edi-44],xmm1
vmovd DWORD [edi-40],xmm2
vmovd DWORD [edi-36],xmm3
vmovd DWORD [edi-32],xmm4
vzeroupper
mov esp,ebp
L$020nodata:
pop edi
pop esi
pop ebx
pop ebp
ret
; Constant table shared by the SSE2 and AVX2 routines. It is 64-byte
; aligned; __poly1305_blocks_avx2 relies on that when it restores its
; table pointer with `and ebx,-64`.
align 64
L$const_sse2:
; +0:  1<<24 in every qword -- ORed into the top limb of loaded blocks.
dd 16777216,0,16777216,0,16777216,0,16777216,0
; +32: zeros. The AVX2 partial-group paths read [ebx] at an advanced
;      offset, so part of that 32-byte read comes from this row.
dd 0,0,0,0,0,0,0,0
; +64: 0x3ffffff in every qword -- the 26-bit limb mask.
dd 67108863,0,67108863,0,67108863,0,67108863,0
; +96: 0x0fffffff, 0x0ffffffc x3 -- same values as the key masks applied
;      in _poly1305_init; no [96+ebx] reference is visible in this file.
dd 268435455,268435452,268435452,268435452
; ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
db 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
db 114,103,62,0
align 4
segment .bss
; 16-byte common symbol for the capability words read by _poly1305_init.
common _OPENSSL_ia32cap_P 16