%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
$@feat.00 equ 1
section	.text	code align=64
%else
section	.text	code
%endif
;extern	_OPENSSL_ia32cap_P
align	64
global	_poly1305_init
align	16
_poly1305_init:
L$_poly1305_init_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	edi,DWORD [20+esp]	; ctx
	mov	esi,DWORD [24+esp]	; key
	mov	ebp,DWORD [28+esp]	; function-pointer table
	xor	eax,eax
	mov	DWORD [edi],eax		; zero accumulator h[0..4] and base-2^26 flag
	mov	DWORD [4+edi],eax
	mov	DWORD [8+edi],eax
	mov	DWORD [12+edi],eax
	mov	DWORD [16+edi],eax
	mov	DWORD [20+edi],eax
	cmp	esi,0
	je	NEAR L$000nokey
	call	L$001pic_point
L$001pic_point:
	pop	ebx
	lea	eax,[(_poly1305_blocks-L$001pic_point)+ebx]
	lea	edx,[(_poly1305_emit-L$001pic_point)+ebx]
	lea	edi,[_OPENSSL_ia32cap_P]
	mov	ecx,DWORD [edi]
	and	ecx,83886080
	cmp	ecx,83886080
	jne	NEAR L$002no_sse2
	lea	eax,[(__poly1305_blocks_sse2-L$001pic_point)+ebx]
	lea	edx,[(__poly1305_emit_sse2-L$001pic_point)+ebx]
	mov	ecx,DWORD [8+edi]
	test	ecx,32
	jz	NEAR L$002no_sse2
	lea	eax,[(__poly1305_blocks_avx2-L$001pic_point)+ebx]
L$002no_sse2:
	mov	edi,DWORD [20+esp]
	mov	DWORD [ebp],eax		; publish selected blocks/emit entry points
	mov	DWORD [4+ebp],edx
	mov	eax,DWORD [esi]
	mov	ebx,DWORD [4+esi]
	mov	ecx,DWORD [8+esi]
	mov	edx,DWORD [12+esi]
	and	eax,268435455		; clamp r
	and	ebx,268435452
	and	ecx,268435452
	and	edx,268435452
	mov	DWORD [24+edi],eax
	mov	DWORD [28+edi],ebx
	mov	DWORD [32+edi],ecx
	mov	DWORD [36+edi],edx
	mov	eax,1
L$000nokey:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
global	_poly1305_blocks
align	16
_poly1305_blocks:
L$_poly1305_blocks_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	edi,DWORD [20+esp]	; ctx
	mov	esi,DWORD [24+esp]	; input
	mov	ecx,DWORD [28+esp]	; length
L$enter_blocks:
	and	ecx,-15
	jz	NEAR L$003nodata
	sub	esp,64
	mov	eax,DWORD [24+edi]	; load r
	mov	ebx,DWORD [28+edi]
	lea	ebp,[ecx*1+esi]		; end of input
	mov	ecx,DWORD [32+edi]
	mov	edx,DWORD [36+edi]
	mov	DWORD [92+esp],ebp
	mov	ebp,esi
	mov	DWORD [36+esp],eax
	mov	eax,ebx			; precompute r[i]+(r[i]>>2) for the reduction
	shr	eax,2
	mov	DWORD [40+esp],ebx
	add	eax,ebx
	mov	ebx,ecx
	shr	ebx,2
	mov	DWORD [44+esp],ecx
	add	ebx,ecx
	mov	ecx,edx
	shr	ecx,2
	mov	DWORD [48+esp],edx
	add	ecx,edx
	mov	DWORD [52+esp],eax
	mov	DWORD [56+esp],ebx
	mov	DWORD [60+esp],ecx
	mov	eax,DWORD [edi]		; load h
	mov	ebx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	esi,DWORD [12+edi]
	mov	edi,DWORD [16+edi]
	jmp	NEAR L$004loop
align	32
L$004loop:
	add	eax,DWORD [ebp]		; h += next block
	adc	ebx,DWORD [4+ebp]
	adc	ecx,DWORD [8+ebp]
	adc	esi,DWORD [12+ebp]
	lea	ebp,[16+ebp]
	adc	edi,DWORD [96+esp]	; padbit
	mov	DWORD [esp],eax
	mov	DWORD [12+esp],esi
	mul	DWORD [36+esp]		; h *= r mod 2^130-5, schoolbook multiply
	mov	DWORD [16+esp],edi
	mov	edi,eax
	mov	eax,ebx
	mov	esi,edx
	mul	DWORD [60+esp]
	add	edi,eax
	mov	eax,ecx
	adc	esi,edx
	mul	DWORD [56+esp]
	add	edi,eax
	mov	eax,DWORD [12+esp]
	adc	esi,edx
	mul	DWORD [52+esp]
	add	edi,eax
	mov	eax,DWORD [esp]
	adc	esi,edx
	mul	DWORD [40+esp]
	mov	DWORD [20+esp],edi
	xor	edi,edi
	add	esi,eax
	mov	eax,ebx
	adc	edi,edx
	mul	DWORD [36+esp]
	add	esi,eax
	mov	eax,ecx
	adc	edi,edx
	mul	DWORD [60+esp]
	add	esi,eax
	mov	eax,DWORD [12+esp]
	adc	edi,edx
	mul	DWORD [56+esp]
	add	esi,eax
	mov	eax,DWORD [16+esp]
	adc	edi,edx
	imul	eax,DWORD [52+esp]
	add	esi,eax
	mov	eax,DWORD [esp]
	adc	edi,0
	mul	DWORD [44+esp]
	mov	DWORD [24+esp],esi
	xor	esi,esi
	add	edi,eax
	mov	eax,ebx
	adc	esi,edx
	mul	DWORD [40+esp]
	add	edi,eax
	mov	eax,ecx
	adc	esi,edx
	mul	DWORD [36+esp]
	add	edi,eax
	mov	eax,DWORD [12+esp]
	adc	esi,edx
	mul	DWORD [60+esp]
	add	edi,eax
	mov	eax,DWORD [16+esp]
	adc	esi,edx
	imul	eax,DWORD [56+esp]
	add	edi,eax
	mov	eax,DWORD [esp]
	adc	esi,0
	mul	DWORD [48+esp]
	mov	DWORD [28+esp],edi
	xor	edi,edi
	add	esi,eax
	mov	eax,ebx
	adc	edi,edx
	mul	DWORD [44+esp]
	add	esi,eax
	mov	eax,ecx
	adc	edi,edx
	mul	DWORD [40+esp]
	add	esi,eax
	mov	eax,DWORD [12+esp]
	adc	edi,edx
	mul	DWORD [36+esp]
	add	esi,eax
	mov	ecx,DWORD [16+esp]
	adc	edi,edx
	mov	edx,ecx
	imul	ecx,DWORD [60+esp]
	add	esi,ecx
	mov	eax,DWORD [20+esp]
	adc	edi,0
	imul	edx,DWORD [36+esp]
	add	edx,edi
	mov	ebx,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
	mov	edi,edx			; partial reduction: fold bits above 2^130 back in
	shr	edx,2
	and	edi,3
	lea	edx,[edx*4+edx]
	add	eax,edx
	adc	ebx,0
	adc	ecx,0
	adc	esi,0
	adc	edi,0
	cmp	ebp,DWORD [92+esp]
	jne	NEAR L$004loop
	mov	edx,DWORD [84+esp]	; ctx
	add	esp,64
	mov	DWORD [edx],eax		; store h
	mov	DWORD [4+edx],ebx
	mov	DWORD [8+edx],ecx
	mov	DWORD [12+edx],esi
	mov	DWORD [16+edx],edi
L$003nodata:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
global	_poly1305_emit
align	16
_poly1305_emit:
L$_poly1305_emit_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp,DWORD [20+esp]	; ctx
L$enter_emit:
	mov	edi,DWORD [24+esp]	; output mac
	mov	eax,DWORD [ebp]
	mov	ebx,DWORD [4+ebp]
	mov	ecx,DWORD [8+ebp]
	mov	edx,DWORD [12+ebp]
	mov	esi,DWORD [16+ebp]
	add	eax,5			; compare h with 2^130-5 by computing h+5
	adc	ebx,0
	adc	ecx,0
	adc	edx,0
	adc	esi,0
	shr	esi,2
	neg	esi			; all-ones mask if h+5 reached 2^130
	and	eax,esi
	and	ebx,esi
	and	ecx,esi
	and	edx,esi
	mov	DWORD [edi],eax
	mov	DWORD [4+edi],ebx
	mov	DWORD [8+edi],ecx
	mov	DWORD [12+edi],edx
	not	esi
	mov	eax,DWORD [ebp]
	mov	ebx,DWORD [4+ebp]
	mov	ecx,DWORD [8+ebp]
	mov	edx,DWORD [12+ebp]
	mov	ebp,DWORD [28+esp]	; nonce
	and	eax,esi
	and	ebx,esi
	and	ecx,esi
	and	edx,esi
	or	eax,DWORD [edi]
	or	ebx,DWORD [4+edi]
	or	ecx,DWORD [8+edi]
	or	edx,DWORD [12+edi]
	add	eax,DWORD [ebp]		; mac = (h + nonce) mod 2^128
	adc	ebx,DWORD [4+ebp]
	adc	ecx,DWORD [8+ebp]
	adc	edx,DWORD [12+ebp]
	mov	DWORD [edi],eax
	mov	DWORD [4+edi],ebx
	mov	DWORD [8+edi],ecx
	mov	DWORD [12+edi],edx
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
align	32
align	16
__poly1305_init_sse2:
	movdqu	xmm4,[24+edi]		; load clamped r
	lea	edi,[48+edi]
	mov	ebp,esp
	sub	esp,224
	and	esp,-16
	movq	xmm7,[64+ebx]		; 0x3ffffff mask: split r into radix-2^26 limbs
	movdqa	xmm0,xmm4
	movdqa	xmm1,xmm4
	movdqa	xmm2,xmm4
	pand	xmm0,xmm7
	psrlq	xmm1,26
	psrldq	xmm2,6
	pand	xmm1,xmm7
	movdqa	xmm3,xmm2
	psrlq	xmm2,4
	psrlq	xmm3,30
	pand	xmm2,xmm7
	pand	xmm3,xmm7
	psrldq	xmm4,13
	lea	edx,[144+esp]
	mov	ecx,2			; two squarings: build the table of powers of r
L$005square:
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	movdqa	[64+esp],xmm4
	movdqa	xmm6,xmm1
	movdqa	xmm5,xmm2
	pslld	xmm6,2
	pslld	xmm5,2
	paddd	xmm6,xmm1
	paddd	xmm5,xmm2
	movdqa	[80+esp],xmm6
	movdqa	[96+esp],xmm5
	movdqa	xmm6,xmm3
	movdqa	xmm5,xmm4
	pslld	xmm6,2
	pslld	xmm5,2
	paddd	xmm6,xmm3
	paddd	xmm5,xmm4
	movdqa	[112+esp],xmm6
	movdqa	[128+esp],xmm5
	pshufd	xmm6,xmm0,68
	movdqa	xmm5,xmm1
	pshufd	xmm1,xmm1,68
	pshufd	xmm2,xmm2,68
	pshufd	xmm3,xmm3,68
	pshufd	xmm4,xmm4,68
	movdqa	[edx],xmm6
	movdqa	[16+edx],xmm1
	movdqa	[32+edx],xmm2
	movdqa	[48+edx],xmm3
	movdqa	[64+edx],xmm4
	pmuludq	xmm4,xmm0
	pmuludq	xmm3,xmm0
	pmuludq	xmm2,xmm0
	pmuludq	xmm1,xmm0
	pmuludq	xmm0,xmm6
	movdqa	xmm6,xmm5
	pmuludq	xmm5,[48+edx]
	movdqa	xmm7,xmm6
	pmuludq	xmm6,[32+edx]
	paddq	xmm4,xmm5
	movdqa	xmm5,xmm7
	pmuludq	xmm7,[16+edx]
	paddq	xmm3,xmm6
	movdqa	xmm6,[80+esp]
	pmuludq	xmm5,[edx]
	paddq	xmm2,xmm7
	pmuludq	xmm6,[64+edx]
	movdqa	xmm7,[32+esp]
	paddq	xmm1,xmm5
	movdqa	xmm5,xmm7
	pmuludq	xmm7,[32+edx]
	paddq	xmm0,xmm6
	movdqa	xmm6,xmm5
	pmuludq	xmm5,[16+edx]
	paddq	xmm4,xmm7
	movdqa	xmm7,[96+esp]
	pmuludq	xmm6,[edx]
	paddq	xmm3,xmm5
	movdqa	xmm5,xmm7
	pmuludq	xmm7,[64+edx]
	paddq	xmm2,xmm6
	pmuludq	xmm5,[48+edx]
	movdqa	xmm6,[48+esp]
	paddq	xmm1,xmm7
	movdqa	xmm7,xmm6
	pmuludq	xmm6,[16+edx]
	paddq	xmm0,xmm5
	movdqa	xmm5,[112+esp]
	pmuludq	xmm7,[edx]
	paddq	xmm4,xmm6
	movdqa	xmm6,xmm5
	pmuludq	xmm5,[64+edx]
	paddq	xmm3,xmm7
	movdqa	xmm7,xmm6
	pmuludq	xmm6,[48+edx]
	paddq	xmm2,xmm5
	pmuludq	xmm7,[32+edx]
	movdqa	xmm5,[64+esp]
	paddq	xmm1,xmm6
	movdqa	xmm6,[128+esp]
	pmuludq	xmm5,[edx]
	paddq	xmm0,xmm7
	movdqa	xmm7,xmm6
	pmuludq	xmm6,[64+edx]
	paddq	xmm4,xmm5
	movdqa	xmm5,xmm7
	pmuludq	xmm7,[16+edx]
	paddq	xmm3,xmm6
	movdqa
xmm6,xmm5 pmuludq xmm5,[32+edx] paddq xmm0,xmm7 pmuludq xmm6,[48+edx] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 dec ecx jz NEAR L$006square_break punpcklqdq xmm0,[esp] punpcklqdq xmm1,[16+esp] punpcklqdq xmm2,[32+esp] punpcklqdq xmm3,[48+esp] punpcklqdq xmm4,[64+esp] jmp NEAR L$005square L$006square_break: psllq xmm0,32 psllq xmm1,32 psllq xmm2,32 psllq xmm3,32 psllq xmm4,32 por xmm0,[esp] por xmm1,[16+esp] por xmm2,[32+esp] por xmm3,[48+esp] por xmm4,[64+esp] pshufd xmm0,xmm0,141 pshufd xmm1,xmm1,141 pshufd xmm2,xmm2,141 pshufd xmm3,xmm3,141 pshufd xmm4,xmm4,141 movdqu [edi],xmm0 movdqu [16+edi],xmm1 movdqu [32+edi],xmm2 movdqu [48+edi],xmm3 movdqu [64+edi],xmm4 movdqa xmm6,xmm1 movdqa xmm5,xmm2 pslld xmm6,2 pslld xmm5,2 paddd xmm6,xmm1 paddd xmm5,xmm2 movdqu [80+edi],xmm6 movdqu [96+edi],xmm5 movdqa xmm6,xmm3 movdqa xmm5,xmm4 pslld xmm6,2 pslld xmm5,2 paddd xmm6,xmm3 paddd xmm5,xmm4 movdqu [112+edi],xmm6 movdqu [128+edi],xmm5 mov esp,ebp lea edi,[edi-48] ret align 32 align 16 __poly1305_blocks_sse2: push ebp push ebx push esi push edi mov edi,DWORD [20+esp] mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] mov eax,DWORD [20+edi] and ecx,-16 jz NEAR L$007nodata cmp ecx,64 jae NEAR L$008enter_sse2 test eax,eax jz NEAR L$enter_blocks align 16 L$008enter_sse2: call L$009pic_point L$009pic_point: pop ebx lea ebx,[(L$const_sse2-L$009pic_point)+ebx] test eax,eax jnz NEAR L$010base2_26 call __poly1305_init_sse2 mov eax,DWORD [edi] mov ecx,DWORD [3+edi] mov edx,DWORD [6+edi] mov esi,DWORD [9+edi] mov ebp,DWORD [13+edi] mov DWORD [20+edi],1 shr ecx,2 and eax,67108863 shr edx,4 and ecx,67108863 shr esi,6 and edx,67108863 movd xmm0,eax movd xmm1,ecx movd xmm2,edx movd xmm3,esi movd xmm4,ebp mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] jmp NEAR L$011base2_32 align 16 L$010base2_26: movd xmm0,DWORD [edi] movd xmm1,DWORD [4+edi] movd xmm2,DWORD [8+edi] movd xmm3,DWORD [12+edi] movd xmm4,DWORD [16+edi] movdqa xmm7,[64+ebx] L$011base2_32: mov eax,DWORD [32+esp] mov ebp,esp sub esp,528 and esp,-16 lea edi,[48+edi] shl eax,24 test ecx,31 jz NEAR L$012even movdqu xmm6,[esi] lea esi,[16+esi] movdqa xmm5,xmm6 pand xmm6,xmm7 paddd xmm0,xmm6 movdqa xmm6,xmm5 psrlq xmm5,26 psrldq xmm6,6 pand xmm5,xmm7 paddd xmm1,xmm5 movdqa xmm5,xmm6 psrlq xmm6,4 pand xmm6,xmm7 paddd xmm2,xmm6 movdqa xmm6,xmm5 psrlq xmm5,30 pand xmm5,xmm7 psrldq xmm6,7 paddd xmm3,xmm5 movd xmm5,eax paddd xmm4,xmm6 movd xmm6,DWORD [12+edi] paddd xmm4,xmm5 movdqa [esp],xmm0 movdqa [16+esp],xmm1 movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 movdqa [64+esp],xmm4 pmuludq xmm0,xmm6 pmuludq xmm1,xmm6 pmuludq xmm2,xmm6 movd xmm5,DWORD [28+edi] pmuludq xmm3,xmm6 pmuludq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[48+esp] movdqa xmm7,xmm6 pmuludq xmm6,[32+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 movd xmm6,DWORD [92+edi] pmuludq xmm5,[esp] paddq xmm2,xmm7 pmuludq xmm6,[64+esp] movd xmm7,DWORD [44+edi] paddq xmm1,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+esp] paddq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[16+esp] paddq xmm4,xmm7 movd xmm7,DWORD [108+edi] pmuludq 
xmm6,[esp] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[64+esp] paddq xmm2,xmm6 pmuludq xmm5,[48+esp] movd xmm6,DWORD [60+edi] paddq xmm1,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+esp] paddq xmm0,xmm5 movd xmm5,DWORD [124+edi] pmuludq xmm7,[esp] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[64+esp] paddq xmm3,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[48+esp] paddq xmm2,xmm5 pmuludq xmm7,[32+esp] movd xmm5,DWORD [76+edi] paddq xmm1,xmm6 movd xmm6,DWORD [140+edi] pmuludq xmm5,[esp] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[64+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+esp] paddq xmm0,xmm7 pmuludq xmm6,[48+esp] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 sub ecx,16 jz NEAR L$013done L$012even: lea edx,[384+esp] lea eax,[esi-32] sub ecx,64 movdqu xmm5,[edi] pshufd xmm6,xmm5,68 cmovb esi,eax pshufd xmm5,xmm5,238 movdqa [edx],xmm6 lea eax,[160+esp] movdqu xmm6,[16+edi] movdqa [edx-144],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [16+edx],xmm5 movdqu xmm5,[32+edi] movdqa [edx-128],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [32+edx],xmm6 movdqu xmm6,[48+edi] movdqa [edx-112],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [48+edx],xmm5 movdqu xmm5,[64+edi] movdqa [edx-96],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [64+edx],xmm6 movdqu xmm6,[80+edi] movdqa [edx-80],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [80+edx],xmm5 movdqu xmm5,[96+edi] movdqa [edx-64],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [96+edx],xmm6 movdqu xmm6,[112+edi] movdqa [edx-48],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [112+edx],xmm5 movdqu xmm5,[128+edi] movdqa [edx-32],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [128+edx],xmm6 movdqa [edx-16],xmm5 movdqu xmm5,[32+esi] movdqu xmm6,[48+esi] lea esi,[32+esi] movdqa [112+esp],xmm2 movdqa [128+esp],xmm3 movdqa [144+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] movdqa [80+esp],xmm0 movdqa [96+esp],xmm1 jbe NEAR L$014skip_loop jmp NEAR L$015loop align 32 L$015loop: movdqa xmm7,[edx-144] movdqa [16+eax],xmm6 movdqa [32+eax],xmm2 movdqa [48+eax],xmm3 movdqa [64+eax],xmm4 movdqa xmm1,xmm5 pmuludq xmm5,xmm7 movdqa xmm0,xmm6 pmuludq xmm6,xmm7 pmuludq xmm2,xmm7 pmuludq xmm3,xmm7 pmuludq xmm4,xmm7 pmuludq xmm0,[edx-16] movdqa xmm7,xmm1 pmuludq xmm1,[edx-128] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-112] paddq xmm1,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-96] paddq xmm2,xmm7 movdqa xmm7,[16+eax] pmuludq xmm6,[edx-80] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-128] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-112] paddq xmm2,xmm7 movdqa xmm7,[32+eax] pmuludq xmm6,[edx-96] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-32] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-16] paddq xmm0,xmm7 movdqa 
xmm7,xmm6 pmuludq xmm6,[edx-128] paddq xmm1,xmm5 movdqa xmm5,[48+eax] pmuludq xmm7,[edx-112] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-48] paddq xmm4,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[edx-32] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-16] paddq xmm1,xmm6 movdqa xmm6,[64+eax] pmuludq xmm5,[edx-128] paddq xmm2,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[edx-16] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-64] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-48] paddq xmm0,xmm7 movdqa xmm7,[64+ebx] pmuludq xmm6,[edx-32] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqu xmm5,[esi-32] movdqu xmm6,[esi-16] lea esi,[32+esi] movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 movdqa [64+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] lea eax,[esi-32] sub ecx,64 paddd xmm5,[80+esp] paddd xmm6,[96+esp] paddd xmm2,[112+esp] paddd xmm3,[128+esp] paddd xmm4,[144+esp] cmovb esi,eax lea eax,[160+esp] movdqa xmm7,[edx] movdqa [16+esp],xmm1 movdqa [16+eax],xmm6 movdqa [32+eax],xmm2 movdqa [48+eax],xmm3 movdqa [64+eax],xmm4 movdqa xmm1,xmm5 pmuludq xmm5,xmm7 paddq xmm5,xmm0 movdqa xmm0,xmm6 pmuludq xmm6,xmm7 pmuludq xmm2,xmm7 pmuludq xmm3,xmm7 pmuludq xmm4,xmm7 paddq xmm6,[16+esp] paddq xmm2,[32+esp] paddq xmm3,[48+esp] paddq xmm4,[64+esp] pmuludq xmm0,[128+edx] movdqa xmm7,xmm1 pmuludq xmm1,[16+edx] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+edx] paddq xmm1,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[48+edx] paddq xmm2,xmm7 movdqa xmm7,[16+eax] pmuludq xmm6,[64+edx] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+edx] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+edx] paddq xmm2,xmm7 movdqa xmm7,[32+eax] pmuludq xmm6,[48+edx] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[112+edx] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[128+edx] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+edx] paddq xmm1,xmm5 movdqa xmm5,[48+eax] pmuludq xmm7,[32+edx] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[96+edx] paddq xmm4,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[112+edx] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[128+edx] paddq xmm1,xmm6 movdqa xmm6,[64+eax] pmuludq xmm5,[16+edx] paddq xmm2,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[128+edx] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[80+edx] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[96+edx] paddq xmm0,xmm7 movdqa xmm7,[64+ebx] pmuludq xmm6,[112+edx] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 movdqu xmm5,[32+esi] movdqu xmm6,[48+esi] lea esi,[32+esi] movdqa [112+esp],xmm2 movdqa [128+esp],xmm3 movdqa [144+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] movdqa [80+esp],xmm0 movdqa [96+esp],xmm1 
ja NEAR L$015loop L$014skip_loop: pshufd xmm7,[edx-144],16 add ecx,32 jnz NEAR L$016long_tail paddd xmm5,xmm0 paddd xmm6,xmm1 paddd xmm2,[112+esp] paddd xmm3,[128+esp] paddd xmm4,[144+esp] L$016long_tail: movdqa [eax],xmm5 movdqa [16+eax],xmm6 movdqa [32+eax],xmm2 movdqa [48+eax],xmm3 movdqa [64+eax],xmm4 pmuludq xmm5,xmm7 pmuludq xmm6,xmm7 pmuludq xmm2,xmm7 movdqa xmm0,xmm5 pshufd xmm5,[edx-128],16 pmuludq xmm3,xmm7 movdqa xmm1,xmm6 pmuludq xmm4,xmm7 movdqa xmm6,xmm5 pmuludq xmm5,[48+eax] movdqa xmm7,xmm6 pmuludq xmm6,[32+eax] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+eax] paddq xmm3,xmm6 pshufd xmm6,[edx-64],16 pmuludq xmm5,[eax] paddq xmm2,xmm7 pmuludq xmm6,[64+eax] pshufd xmm7,[edx-112],16 paddq xmm1,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+eax] paddq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[16+eax] paddq xmm4,xmm7 pshufd xmm7,[edx-48],16 pmuludq xmm6,[eax] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[64+eax] paddq xmm2,xmm6 pmuludq xmm5,[48+eax] pshufd xmm6,[edx-96],16 paddq xmm1,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+eax] paddq xmm0,xmm5 pshufd xmm5,[edx-32],16 pmuludq xmm7,[eax] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[64+eax] paddq xmm3,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[48+eax] paddq xmm2,xmm5 pmuludq xmm7,[32+eax] pshufd xmm5,[edx-80],16 paddq xmm1,xmm6 pshufd xmm6,[edx-16],16 pmuludq xmm5,[eax] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[64+eax] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+eax] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+eax] paddq xmm0,xmm7 pmuludq xmm6,[48+eax] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 jz NEAR L$017short_tail movdqu xmm5,[esi-32] movdqu xmm6,[esi-16] lea esi,[32+esi] movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 movdqa [64+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] pshufd xmm7,[edx],16 paddd xmm5,[80+esp] paddd xmm6,[96+esp] paddd xmm2,[112+esp] paddd xmm3,[128+esp] paddd xmm4,[144+esp] movdqa [esp],xmm5 pmuludq xmm5,xmm7 movdqa [16+esp],xmm6 pmuludq xmm6,xmm7 paddq xmm0,xmm5 movdqa xmm5,xmm2 pmuludq xmm2,xmm7 paddq xmm1,xmm6 movdqa xmm6,xmm3 pmuludq xmm3,xmm7 paddq xmm2,[32+esp] movdqa [32+esp],xmm5 pshufd xmm5,[16+edx],16 paddq xmm3,[48+esp] movdqa [48+esp],xmm6 movdqa xmm6,xmm4 pmuludq xmm4,xmm7 paddq xmm4,[64+esp] movdqa [64+esp],xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[48+esp] movdqa xmm7,xmm6 pmuludq xmm6,[32+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 pshufd xmm6,[80+edx],16 pmuludq xmm5,[esp] paddq xmm2,xmm7 pmuludq xmm6,[64+esp] pshufd xmm7,[32+edx],16 paddq xmm1,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+esp] paddq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[16+esp] paddq xmm4,xmm7 pshufd xmm7,[96+edx],16 pmuludq xmm6,[esp] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[64+esp] paddq xmm2,xmm6 pmuludq xmm5,[48+esp] pshufd xmm6,[48+edx],16 paddq xmm1,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+esp] paddq xmm0,xmm5 pshufd xmm5,[112+edx],16 pmuludq xmm7,[esp] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[64+esp] paddq xmm3,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[48+esp] paddq xmm2,xmm5 pmuludq xmm7,[32+esp] pshufd xmm5,[64+edx],16 paddq xmm1,xmm6 pshufd xmm6,[128+edx],16 pmuludq xmm5,[esp] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[64+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 movdqa 
xmm6,xmm5 pmuludq xmm5,[32+esp] paddq xmm0,xmm7 pmuludq xmm6,[48+esp] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 L$017short_tail: pshufd xmm6,xmm4,78 pshufd xmm5,xmm3,78 paddq xmm4,xmm6 paddq xmm3,xmm5 pshufd xmm6,xmm0,78 pshufd xmm5,xmm1,78 paddq xmm0,xmm6 paddq xmm1,xmm5 pshufd xmm6,xmm2,78 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm2,xmm6 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 L$013done: movd DWORD [edi-48],xmm0 movd DWORD [edi-44],xmm1 movd DWORD [edi-40],xmm2 movd DWORD [edi-36],xmm3 movd DWORD [edi-32],xmm4 mov esp,ebp L$007nodata: pop edi pop esi pop ebx pop ebp ret align 32 align 16 __poly1305_emit_sse2: push ebp push ebx push esi push edi mov ebp,DWORD [20+esp] cmp DWORD [20+ebp],0 je NEAR L$enter_emit mov eax,DWORD [ebp] mov edi,DWORD [4+ebp] mov ecx,DWORD [8+ebp] mov edx,DWORD [12+ebp] mov esi,DWORD [16+ebp] mov ebx,edi shl edi,26 shr ebx,6 add eax,edi mov edi,ecx adc ebx,0 shl edi,20 shr ecx,12 add ebx,edi mov edi,edx adc ecx,0 shl edi,14 shr edx,18 add ecx,edi mov edi,esi adc edx,0 shl edi,8 shr esi,24 add edx,edi adc esi,0 mov edi,esi and esi,3 shr edi,2 lea ebp,[edi*4+edi] mov edi,DWORD [24+esp] add eax,ebp mov ebp,DWORD [28+esp] adc ebx,0 adc ecx,0 adc edx,0 adc esi,0 movd xmm0,eax add eax,5 movd xmm1,ebx adc ebx,0 movd xmm2,ecx adc ecx,0 movd xmm3,edx adc edx,0 adc esi,0 shr esi,2 neg esi and eax,esi and ebx,esi and ecx,esi and edx,esi mov DWORD [edi],eax movd eax,xmm0 mov DWORD [4+edi],ebx movd ebx,xmm1 mov DWORD [8+edi],ecx movd ecx,xmm2 mov DWORD [12+edi],edx movd edx,xmm3 not esi and eax,esi and ebx,esi or eax,DWORD [edi] and ecx,esi or ebx,DWORD [4+edi] and edx,esi or ecx,DWORD [8+edi] or edx,DWORD [12+edi] add eax,DWORD [ebp] adc ebx,DWORD [4+ebp] mov DWORD [edi],eax adc ecx,DWORD [8+ebp] mov DWORD [4+edi],ebx adc edx,DWORD [12+ebp] mov DWORD [8+edi],ecx mov DWORD [12+edi],edx pop edi pop esi pop ebx pop ebp ret align 32 align 16 __poly1305_init_avx2: vmovdqu xmm4,[24+edi] lea edi,[48+edi] mov ebp,esp sub esp,224 and esp,-16 vmovdqa xmm7,[64+ebx] vpand xmm0,xmm4,xmm7 vpsrlq xmm1,xmm4,26 vpsrldq xmm3,xmm4,6 vpand xmm1,xmm1,xmm7 vpsrlq xmm2,xmm3,4 vpsrlq xmm3,xmm3,30 vpand xmm2,xmm2,xmm7 vpand xmm3,xmm3,xmm7 vpsrldq xmm4,xmm4,13 lea edx,[144+esp] mov ecx,2 L$018square: vmovdqa [esp],xmm0 vmovdqa [16+esp],xmm1 vmovdqa [32+esp],xmm2 vmovdqa [48+esp],xmm3 vmovdqa [64+esp],xmm4 vpslld xmm6,xmm1,2 vpslld xmm5,xmm2,2 vpaddd xmm6,xmm6,xmm1 vpaddd xmm5,xmm5,xmm2 vmovdqa [80+esp],xmm6 vmovdqa [96+esp],xmm5 vpslld xmm6,xmm3,2 vpslld xmm5,xmm4,2 vpaddd xmm6,xmm6,xmm3 vpaddd xmm5,xmm5,xmm4 vmovdqa [112+esp],xmm6 vmovdqa [128+esp],xmm5 vpshufd xmm5,xmm0,68 vmovdqa xmm6,xmm1 vpshufd xmm1,xmm1,68 vpshufd xmm2,xmm2,68 vpshufd xmm3,xmm3,68 vpshufd xmm4,xmm4,68 vmovdqa [edx],xmm5 vmovdqa [16+edx],xmm1 vmovdqa [32+edx],xmm2 vmovdqa [48+edx],xmm3 vmovdqa [64+edx],xmm4 vpmuludq xmm4,xmm4,xmm0 vpmuludq xmm3,xmm3,xmm0 vpmuludq xmm2,xmm2,xmm0 vpmuludq xmm1,xmm1,xmm0 vpmuludq xmm0,xmm5,xmm0 vpmuludq xmm5,xmm6,[48+edx] vpaddq xmm4,xmm4,xmm5 vpmuludq xmm7,xmm6,[32+edx] vpaddq xmm3,xmm3,xmm7 vpmuludq xmm5,xmm6,[16+edx] vpaddq xmm2,xmm2,xmm5 vmovdqa xmm7,[80+esp] vpmuludq xmm6,xmm6,[edx] vpaddq 
xmm1,xmm1,xmm6 vmovdqa xmm5,[32+esp] vpmuludq xmm7,xmm7,[64+edx] vpaddq xmm0,xmm0,xmm7 vpmuludq xmm6,xmm5,[32+edx] vpaddq xmm4,xmm4,xmm6 vpmuludq xmm7,xmm5,[16+edx] vpaddq xmm3,xmm3,xmm7 vmovdqa xmm6,[96+esp] vpmuludq xmm5,xmm5,[edx] vpaddq xmm2,xmm2,xmm5 vpmuludq xmm7,xmm6,[64+edx] vpaddq xmm1,xmm1,xmm7 vmovdqa xmm5,[48+esp] vpmuludq xmm6,xmm6,[48+edx] vpaddq xmm0,xmm0,xmm6 vpmuludq xmm7,xmm5,[16+edx] vpaddq xmm4,xmm4,xmm7 vmovdqa xmm6,[112+esp] vpmuludq xmm5,xmm5,[edx] vpaddq xmm3,xmm3,xmm5 vpmuludq xmm7,xmm6,[64+edx] vpaddq xmm2,xmm2,xmm7 vpmuludq xmm5,xmm6,[48+edx] vpaddq xmm1,xmm1,xmm5 vmovdqa xmm7,[64+esp] vpmuludq xmm6,xmm6,[32+edx] vpaddq xmm0,xmm0,xmm6 vmovdqa xmm5,[128+esp] vpmuludq xmm7,xmm7,[edx] vpaddq xmm4,xmm4,xmm7 vpmuludq xmm6,xmm5,[64+edx] vpaddq xmm3,xmm3,xmm6 vpmuludq xmm7,xmm5,[16+edx] vpaddq xmm0,xmm0,xmm7 vpmuludq xmm6,xmm5,[32+edx] vpaddq xmm1,xmm1,xmm6 vmovdqa xmm7,[64+ebx] vpmuludq xmm5,xmm5,[48+edx] vpaddq xmm2,xmm2,xmm5 vpsrlq xmm5,xmm3,26 vpand xmm3,xmm3,xmm7 vpsrlq xmm6,xmm0,26 vpand xmm0,xmm0,xmm7 vpaddq xmm4,xmm4,xmm5 vpaddq xmm1,xmm1,xmm6 vpsrlq xmm5,xmm4,26 vpand xmm4,xmm4,xmm7 vpsrlq xmm6,xmm1,26 vpand xmm1,xmm1,xmm7 vpaddq xmm2,xmm2,xmm6 vpaddd xmm0,xmm0,xmm5 vpsllq xmm5,xmm5,2 vpsrlq xmm6,xmm2,26 vpand xmm2,xmm2,xmm7 vpaddd xmm0,xmm0,xmm5 vpaddd xmm3,xmm3,xmm6 vpsrlq xmm6,xmm3,26 vpsrlq xmm5,xmm0,26 vpand xmm0,xmm0,xmm7 vpand xmm3,xmm3,xmm7 vpaddd xmm1,xmm1,xmm5 vpaddd xmm4,xmm4,xmm6 dec ecx jz NEAR L$019square_break vpunpcklqdq xmm0,xmm0,[esp] vpunpcklqdq xmm1,xmm1,[16+esp] vpunpcklqdq xmm2,xmm2,[32+esp] vpunpcklqdq xmm3,xmm3,[48+esp] vpunpcklqdq xmm4,xmm4,[64+esp] jmp NEAR L$018square L$019square_break: vpsllq xmm0,xmm0,32 vpsllq xmm1,xmm1,32 vpsllq xmm2,xmm2,32 vpsllq xmm3,xmm3,32 vpsllq xmm4,xmm4,32 vpor xmm0,xmm0,[esp] vpor xmm1,xmm1,[16+esp] vpor xmm2,xmm2,[32+esp] vpor xmm3,xmm3,[48+esp] vpor xmm4,xmm4,[64+esp] vpshufd xmm0,xmm0,141 vpshufd xmm1,xmm1,141 vpshufd xmm2,xmm2,141 vpshufd xmm3,xmm3,141 vpshufd xmm4,xmm4,141 vmovdqu [edi],xmm0 vmovdqu [16+edi],xmm1 vmovdqu [32+edi],xmm2 vmovdqu [48+edi],xmm3 vmovdqu [64+edi],xmm4 vpslld xmm6,xmm1,2 vpslld xmm5,xmm2,2 vpaddd xmm6,xmm6,xmm1 vpaddd xmm5,xmm5,xmm2 vmovdqu [80+edi],xmm6 vmovdqu [96+edi],xmm5 vpslld xmm6,xmm3,2 vpslld xmm5,xmm4,2 vpaddd xmm6,xmm6,xmm3 vpaddd xmm5,xmm5,xmm4 vmovdqu [112+edi],xmm6 vmovdqu [128+edi],xmm5 mov esp,ebp lea edi,[edi-48] ret align 32 align 16 __poly1305_blocks_avx2: push ebp push ebx push esi push edi mov edi,DWORD [20+esp] mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] mov eax,DWORD [20+edi] and ecx,-16 jz NEAR L$020nodata cmp ecx,64 jae NEAR L$021enter_avx2 test eax,eax jz NEAR L$enter_blocks L$021enter_avx2: vzeroupper call L$022pic_point L$022pic_point: pop ebx lea ebx,[(L$const_sse2-L$022pic_point)+ebx] test eax,eax jnz NEAR L$023base2_26 call __poly1305_init_avx2 mov eax,DWORD [edi] mov ecx,DWORD [3+edi] mov edx,DWORD [6+edi] mov esi,DWORD [9+edi] mov ebp,DWORD [13+edi] shr ecx,2 and eax,67108863 shr edx,4 and ecx,67108863 shr esi,6 and edx,67108863 mov DWORD [edi],eax mov DWORD [4+edi],ecx mov DWORD [8+edi],edx mov DWORD [12+edi],esi mov DWORD [16+edi],ebp mov DWORD [20+edi],1 mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] L$023base2_26: mov eax,DWORD [32+esp] mov ebp,esp sub esp,448 and esp,-512 vmovdqu xmm0,[48+edi] lea edx,[288+esp] vmovdqu xmm1,[64+edi] vmovdqu xmm2,[80+edi] vmovdqu xmm3,[96+edi] vmovdqu xmm4,[112+edi] lea edi,[48+edi] vpermq ymm0,ymm0,64 vpermq ymm1,ymm1,64 vpermq ymm2,ymm2,64 vpermq ymm3,ymm3,64 vpermq ymm4,ymm4,64 vpshufd ymm0,ymm0,200 
vpshufd ymm1,ymm1,200 vpshufd ymm2,ymm2,200 vpshufd ymm3,ymm3,200 vpshufd ymm4,ymm4,200 vmovdqa [edx-128],ymm0 vmovdqu xmm0,[80+edi] vmovdqa [edx-96],ymm1 vmovdqu xmm1,[96+edi] vmovdqa [edx-64],ymm2 vmovdqu xmm2,[112+edi] vmovdqa [edx-32],ymm3 vmovdqu xmm3,[128+edi] vmovdqa [edx],ymm4 vpermq ymm0,ymm0,64 vpermq ymm1,ymm1,64 vpermq ymm2,ymm2,64 vpermq ymm3,ymm3,64 vpshufd ymm0,ymm0,200 vpshufd ymm1,ymm1,200 vpshufd ymm2,ymm2,200 vpshufd ymm3,ymm3,200 vmovdqa [32+edx],ymm0 vmovd xmm0,DWORD [edi-48] vmovdqa [64+edx],ymm1 vmovd xmm1,DWORD [edi-44] vmovdqa [96+edx],ymm2 vmovd xmm2,DWORD [edi-40] vmovdqa [128+edx],ymm3 vmovd xmm3,DWORD [edi-36] vmovd xmm4,DWORD [edi-32] vmovdqa ymm7,[64+ebx] neg eax test ecx,63 jz NEAR L$024even mov edx,ecx and ecx,-64 and edx,63 vmovdqu xmm5,[esi] cmp edx,32 jb NEAR L$025one vmovdqu xmm6,[16+esi] je NEAR L$026two vinserti128 ymm5,ymm5,[32+esi],1 lea esi,[48+esi] lea ebx,[8+ebx] lea edx,[296+esp] jmp NEAR L$027tail L$026two: lea esi,[32+esi] lea ebx,[16+ebx] lea edx,[304+esp] jmp NEAR L$027tail L$025one: lea esi,[16+esi] vpxor ymm6,ymm6,ymm6 lea ebx,[32+eax*8+ebx] lea edx,[312+esp] jmp NEAR L$027tail align 32 L$024even: vmovdqu xmm5,[esi] vmovdqu xmm6,[16+esi] vinserti128 ymm5,ymm5,[32+esi],1 vinserti128 ymm6,ymm6,[48+esi],1 lea esi,[64+esi] sub ecx,64 jz NEAR L$027tail L$028loop: vmovdqa [64+esp],ymm2 vpsrldq ymm2,ymm5,6 vmovdqa [esp],ymm0 vpsrldq ymm0,ymm6,6 vmovdqa [32+esp],ymm1 vpunpckhqdq ymm1,ymm5,ymm6 vpunpcklqdq ymm5,ymm5,ymm6 vpunpcklqdq ymm2,ymm2,ymm0 vpsrlq ymm0,ymm2,30 vpsrlq ymm2,ymm2,4 vpsrlq ymm6,ymm5,26 vpsrlq ymm1,ymm1,40 vpand ymm2,ymm2,ymm7 vpand ymm5,ymm5,ymm7 vpand ymm6,ymm6,ymm7 vpand ymm0,ymm0,ymm7 vpor ymm1,ymm1,[ebx] vpaddq ymm2,ymm2,[64+esp] vpaddq ymm5,ymm5,[esp] vpaddq ymm6,ymm6,[32+esp] vpaddq ymm0,ymm0,ymm3 vpaddq ymm1,ymm1,ymm4 vpmuludq ymm3,ymm2,[edx-96] vmovdqa [32+esp],ymm6 vpmuludq ymm4,ymm2,[edx-64] vmovdqa [96+esp],ymm0 vpmuludq ymm0,ymm2,[96+edx] vmovdqa [128+esp],ymm1 vpmuludq ymm1,ymm2,[128+edx] vpmuludq ymm2,ymm2,[edx-128] vpmuludq ymm7,ymm5,[edx-32] vpaddq ymm3,ymm3,ymm7 vpmuludq ymm6,ymm5,[edx] vpaddq ymm4,ymm4,ymm6 vpmuludq ymm7,ymm5,[edx-128] vpaddq ymm0,ymm0,ymm7 vmovdqa ymm7,[32+esp] vpmuludq ymm6,ymm5,[edx-96] vpaddq ymm1,ymm1,ymm6 vpmuludq ymm5,ymm5,[edx-64] vpaddq ymm2,ymm2,ymm5 vpmuludq ymm6,ymm7,[edx-64] vpaddq ymm3,ymm3,ymm6 vpmuludq ymm5,ymm7,[edx-32] vpaddq ymm4,ymm4,ymm5 vpmuludq ymm6,ymm7,[128+edx] vpaddq ymm0,ymm0,ymm6 vmovdqa ymm6,[96+esp] vpmuludq ymm5,ymm7,[edx-128] vpaddq ymm1,ymm1,ymm5 vpmuludq ymm7,ymm7,[edx-96] vpaddq ymm2,ymm2,ymm7 vpmuludq ymm5,ymm6,[edx-128] vpaddq ymm3,ymm3,ymm5 vpmuludq ymm7,ymm6,[edx-96] vpaddq ymm4,ymm4,ymm7 vpmuludq ymm5,ymm6,[64+edx] vpaddq ymm0,ymm0,ymm5 vmovdqa ymm5,[128+esp] vpmuludq ymm7,ymm6,[96+edx] vpaddq ymm1,ymm1,ymm7 vpmuludq ymm6,ymm6,[128+edx] vpaddq ymm2,ymm2,ymm6 vpmuludq ymm7,ymm5,[128+edx] vpaddq ymm3,ymm3,ymm7 vpmuludq ymm6,ymm5,[32+edx] vpaddq ymm0,ymm0,ymm6 vpmuludq ymm7,ymm5,[edx-128] vpaddq ymm4,ymm4,ymm7 vmovdqa ymm7,[64+ebx] vpmuludq ymm6,ymm5,[64+edx] vpaddq ymm1,ymm1,ymm6 vpmuludq ymm5,ymm5,[96+edx] vpaddq ymm2,ymm2,ymm5 vpsrlq ymm5,ymm3,26 vpand ymm3,ymm3,ymm7 vpsrlq ymm6,ymm0,26 vpand ymm0,ymm0,ymm7 vpaddq ymm4,ymm4,ymm5 vpaddq ymm1,ymm1,ymm6 vpsrlq ymm5,ymm4,26 vpand ymm4,ymm4,ymm7 vpsrlq ymm6,ymm1,26 vpand ymm1,ymm1,ymm7 vpaddq ymm2,ymm2,ymm6 vpaddq ymm0,ymm0,ymm5 vpsllq ymm5,ymm5,2 vpsrlq ymm6,ymm2,26 vpand ymm2,ymm2,ymm7 vpaddq ymm0,ymm0,ymm5 vpaddq ymm3,ymm3,ymm6 vpsrlq ymm6,ymm3,26 vpsrlq ymm5,ymm0,26 vpand ymm0,ymm0,ymm7 vpand 
ymm3,ymm3,ymm7 vpaddq ymm1,ymm1,ymm5 vpaddq ymm4,ymm4,ymm6 vmovdqu xmm5,[esi] vmovdqu xmm6,[16+esi] vinserti128 ymm5,ymm5,[32+esi],1 vinserti128 ymm6,ymm6,[48+esi],1 lea esi,[64+esi] sub ecx,64 jnz NEAR L$028loop L$027tail: vmovdqa [64+esp],ymm2 vpsrldq ymm2,ymm5,6 vmovdqa [esp],ymm0 vpsrldq ymm0,ymm6,6 vmovdqa [32+esp],ymm1 vpunpckhqdq ymm1,ymm5,ymm6 vpunpcklqdq ymm5,ymm5,ymm6 vpunpcklqdq ymm2,ymm2,ymm0 vpsrlq ymm0,ymm2,30 vpsrlq ymm2,ymm2,4 vpsrlq ymm6,ymm5,26 vpsrlq ymm1,ymm1,40 vpand ymm2,ymm2,ymm7 vpand ymm5,ymm5,ymm7 vpand ymm6,ymm6,ymm7 vpand ymm0,ymm0,ymm7 vpor ymm1,ymm1,[ebx] and ebx,-64 vpaddq ymm2,ymm2,[64+esp] vpaddq ymm5,ymm5,[esp] vpaddq ymm6,ymm6,[32+esp] vpaddq ymm0,ymm0,ymm3 vpaddq ymm1,ymm1,ymm4 vpmuludq ymm3,ymm2,[edx-92] vmovdqa [32+esp],ymm6 vpmuludq ymm4,ymm2,[edx-60] vmovdqa [96+esp],ymm0 vpmuludq ymm0,ymm2,[100+edx] vmovdqa [128+esp],ymm1 vpmuludq ymm1,ymm2,[132+edx] vpmuludq ymm2,ymm2,[edx-124] vpmuludq ymm7,ymm5,[edx-28] vpaddq ymm3,ymm3,ymm7 vpmuludq ymm6,ymm5,[4+edx] vpaddq ymm4,ymm4,ymm6 vpmuludq ymm7,ymm5,[edx-124] vpaddq ymm0,ymm0,ymm7 vmovdqa ymm7,[32+esp] vpmuludq ymm6,ymm5,[edx-92] vpaddq ymm1,ymm1,ymm6 vpmuludq ymm5,ymm5,[edx-60] vpaddq ymm2,ymm2,ymm5 vpmuludq ymm6,ymm7,[edx-60] vpaddq ymm3,ymm3,ymm6 vpmuludq ymm5,ymm7,[edx-28] vpaddq ymm4,ymm4,ymm5 vpmuludq ymm6,ymm7,[132+edx] vpaddq ymm0,ymm0,ymm6 vmovdqa ymm6,[96+esp] vpmuludq ymm5,ymm7,[edx-124] vpaddq ymm1,ymm1,ymm5 vpmuludq ymm7,ymm7,[edx-92] vpaddq ymm2,ymm2,ymm7 vpmuludq ymm5,ymm6,[edx-124] vpaddq ymm3,ymm3,ymm5 vpmuludq ymm7,ymm6,[edx-92] vpaddq ymm4,ymm4,ymm7 vpmuludq ymm5,ymm6,[68+edx] vpaddq ymm0,ymm0,ymm5 vmovdqa ymm5,[128+esp] vpmuludq ymm7,ymm6,[100+edx] vpaddq ymm1,ymm1,ymm7 vpmuludq ymm6,ymm6,[132+edx] vpaddq ymm2,ymm2,ymm6 vpmuludq ymm7,ymm5,[132+edx] vpaddq ymm3,ymm3,ymm7 vpmuludq ymm6,ymm5,[36+edx] vpaddq ymm0,ymm0,ymm6 vpmuludq ymm7,ymm5,[edx-124] vpaddq ymm4,ymm4,ymm7 vmovdqa ymm7,[64+ebx] vpmuludq ymm6,ymm5,[68+edx] vpaddq ymm1,ymm1,ymm6 vpmuludq ymm5,ymm5,[100+edx] vpaddq ymm2,ymm2,ymm5 vpsrldq ymm5,ymm4,8 vpsrldq ymm6,ymm3,8 vpaddq ymm4,ymm4,ymm5 vpsrldq ymm5,ymm0,8 vpaddq ymm3,ymm3,ymm6 vpsrldq ymm6,ymm1,8 vpaddq ymm0,ymm0,ymm5 vpsrldq ymm5,ymm2,8 vpaddq ymm1,ymm1,ymm6 vpermq ymm6,ymm4,2 vpaddq ymm2,ymm2,ymm5 vpermq ymm5,ymm3,2 vpaddq ymm4,ymm4,ymm6 vpermq ymm6,ymm0,2 vpaddq ymm3,ymm3,ymm5 vpermq ymm5,ymm1,2 vpaddq ymm0,ymm0,ymm6 vpermq ymm6,ymm2,2 vpaddq ymm1,ymm1,ymm5 vpaddq ymm2,ymm2,ymm6 vpsrlq ymm5,ymm3,26 vpand ymm3,ymm3,ymm7 vpsrlq ymm6,ymm0,26 vpand ymm0,ymm0,ymm7 vpaddq ymm4,ymm4,ymm5 vpaddq ymm1,ymm1,ymm6 vpsrlq ymm5,ymm4,26 vpand ymm4,ymm4,ymm7 vpsrlq ymm6,ymm1,26 vpand ymm1,ymm1,ymm7 vpaddq ymm2,ymm2,ymm6 vpaddq ymm0,ymm0,ymm5 vpsllq ymm5,ymm5,2 vpsrlq ymm6,ymm2,26 vpand ymm2,ymm2,ymm7 vpaddq ymm0,ymm0,ymm5 vpaddq ymm3,ymm3,ymm6 vpsrlq ymm6,ymm3,26 vpsrlq ymm5,ymm0,26 vpand ymm0,ymm0,ymm7 vpand ymm3,ymm3,ymm7 vpaddq ymm1,ymm1,ymm5 vpaddq ymm4,ymm4,ymm6 cmp ecx,0 je NEAR L$029done vpshufd xmm0,xmm0,252 lea edx,[288+esp] vpshufd xmm1,xmm1,252 vpshufd xmm2,xmm2,252 vpshufd xmm3,xmm3,252 vpshufd xmm4,xmm4,252 jmp NEAR L$024even align 16 L$029done: vmovd DWORD [edi-48],xmm0 vmovd DWORD [edi-44],xmm1 vmovd DWORD [edi-40],xmm2 vmovd DWORD [edi-36],xmm3 vmovd DWORD [edi-32],xmm4 vzeroupper mov esp,ebp L$020nodata: pop edi pop esi pop ebx pop ebp ret align 64 L$const_sse2: dd 16777216,0,16777216,0,16777216,0,16777216,0 dd 0,0,0,0,0,0,0,0 dd 67108863,0,67108863,0,67108863,0,67108863,0 dd 268435455,268435452,268435452,268435452 db 
db	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
db	114,103,62,0
align	4
segment	.bss
common	_OPENSSL_ia32cap_P 16
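; Reference prototypes for the exported entry points, inferred from the
; argument loads above (names and exact types are descriptive, not taken
; from an official header):
;
;   int  poly1305_init  (void *ctx, const unsigned char key[16], void *func[2]);
;   void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
;                        unsigned int padbit);
;   void poly1305_emit  (void *ctx, unsigned char mac[16],
;                        const unsigned int nonce[4]);
;
; poly1305_init zeroes the accumulator, clamps the key and stores r at
; ctx+24; when a key is supplied it also fills func[0]/func[1] with the
; fastest available blocks/emit routines (scalar, SSE2 or AVX2, selected
; via OPENSSL_ia32cap_P) and returns 1, otherwise it returns 0.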