; Mirror of https://github.com/CloverHackyColor/CloverBootloader.git
; (synced 2025-02-14 00:41:35 +01:00)
; File size: 8292 lines, 215 KiB. Language: NASM.
; Make memory operands that name a symbol RIP-relative unless overridden.
default rel
|
|
; XMMWORD, YMMWORD and ZMMWORD are defined as empty macros, so an operand
; written with one of them as a size annotation (e.g. XMMWORD[rsp]) expands
; to the plain memory operand ([rsp]).
%define XMMWORD
|
|
%define YMMWORD
|
|
%define ZMMWORD
|
|
; Code is placed in the .text section, which is aligned to 64 bytes.
section .text code align=64
|
|
|
|
|
|
; Declared external (not defined in this file). sha256_multi_block loads the
; qword at OPENSSL_ia32cap_P+4 on entry and branches on bits of it to select
; a code path.
; NOTE(review): presumably OpenSSL's CPU capability vector -- confirm.
EXTERN OPENSSL_ia32cap_P
|
|
|
|
; Export the entry point defined below.
global sha256_multi_block
|
|
|
|
; Align the code that follows to a 32-byte boundary.
ALIGN 32
|
|
sha256_multi_block:
|
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
|
mov QWORD[16+rsp],rsi
|
|
mov rax,rsp
|
|
$L$SEH_begin_sha256_multi_block:
|
|
mov rdi,rcx
|
|
mov rsi,rdx
|
|
mov rdx,r8
|
|
|
|
|
|
|
|
mov rcx,QWORD[((OPENSSL_ia32cap_P+4))]
|
|
bt rcx,61
|
|
jc NEAR _shaext_shortcut
|
|
test ecx,268435456
|
|
jnz NEAR _avx_shortcut
|
|
mov rax,rsp
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
lea rsp,[((-168))+rsp]
|
|
movaps XMMWORD[rsp],xmm6
|
|
movaps XMMWORD[16+rsp],xmm7
|
|
movaps XMMWORD[32+rsp],xmm8
|
|
movaps XMMWORD[48+rsp],xmm9
|
|
movaps XMMWORD[(-120)+rax],xmm10
|
|
movaps XMMWORD[(-104)+rax],xmm11
|
|
movaps XMMWORD[(-88)+rax],xmm12
|
|
movaps XMMWORD[(-72)+rax],xmm13
|
|
movaps XMMWORD[(-56)+rax],xmm14
|
|
movaps XMMWORD[(-40)+rax],xmm15
|
|
sub rsp,288
|
|
and rsp,-256
|
|
mov QWORD[272+rsp],rax
|
|
|
|
$L$body:
|
|
lea rbp,[((K256+128))]
|
|
lea rbx,[256+rsp]
|
|
lea rdi,[128+rdi]
|
|
|
|
$L$oop_grande:
|
|
mov DWORD[280+rsp],edx
|
|
xor edx,edx
|
|
|
|
mov r8,QWORD[rsi]
|
|
|
|
mov ecx,DWORD[8+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[rbx],ecx
|
|
cmovle r8,rbp
|
|
|
|
mov r9,QWORD[16+rsi]
|
|
|
|
mov ecx,DWORD[24+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[4+rbx],ecx
|
|
cmovle r9,rbp
|
|
|
|
mov r10,QWORD[32+rsi]
|
|
|
|
mov ecx,DWORD[40+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[8+rbx],ecx
|
|
cmovle r10,rbp
|
|
|
|
mov r11,QWORD[48+rsi]
|
|
|
|
mov ecx,DWORD[56+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[12+rbx],ecx
|
|
cmovle r11,rbp
|
|
test edx,edx
|
|
jz NEAR $L$done
|
|
|
|
movdqu xmm8,XMMWORD[((0-128))+rdi]
|
|
lea rax,[128+rsp]
|
|
movdqu xmm9,XMMWORD[((32-128))+rdi]
|
|
movdqu xmm10,XMMWORD[((64-128))+rdi]
|
|
movdqu xmm11,XMMWORD[((96-128))+rdi]
|
|
movdqu xmm12,XMMWORD[((128-128))+rdi]
|
|
movdqu xmm13,XMMWORD[((160-128))+rdi]
|
|
movdqu xmm14,XMMWORD[((192-128))+rdi]
|
|
movdqu xmm15,XMMWORD[((224-128))+rdi]
|
|
movdqu xmm6,XMMWORD[$L$pbswap]
|
|
jmp NEAR $L$oop
|
|
|
|
ALIGN 32
|
|
$L$oop:
|
|
movdqa xmm4,xmm10
|
|
pxor xmm4,xmm9
|
|
movd xmm5,DWORD[r8]
|
|
movd xmm0,DWORD[r9]
|
|
movd xmm1,DWORD[r10]
|
|
movd xmm2,DWORD[r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm12
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm12
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm12
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(0-128)+rax],xmm5
|
|
paddd xmm5,xmm15
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-128))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm12
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm12
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm14
|
|
pand xmm3,xmm13
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm8
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm8
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm9
|
|
movdqa xmm7,xmm8
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm8
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm15,xmm9
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm15,xmm4
|
|
paddd xmm11,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm15,xmm5
|
|
paddd xmm15,xmm7
|
|
movd xmm5,DWORD[4+r8]
|
|
movd xmm0,DWORD[4+r9]
|
|
movd xmm1,DWORD[4+r10]
|
|
movd xmm2,DWORD[4+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm11
|
|
|
|
movdqa xmm2,xmm11
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm11
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(16-128)+rax],xmm5
|
|
paddd xmm5,xmm14
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-96))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm11
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm11
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm13
|
|
pand xmm4,xmm12
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm15
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm15
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm8
|
|
movdqa xmm7,xmm15
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm15
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm14,xmm8
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm14,xmm3
|
|
paddd xmm10,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm14,xmm5
|
|
paddd xmm14,xmm7
|
|
movd xmm5,DWORD[8+r8]
|
|
movd xmm0,DWORD[8+r9]
|
|
movd xmm1,DWORD[8+r10]
|
|
movd xmm2,DWORD[8+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm10
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm10
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm10
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(32-128)+rax],xmm5
|
|
paddd xmm5,xmm13
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-64))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm10
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm10
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm12
|
|
pand xmm3,xmm11
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm14
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm14
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm15
|
|
movdqa xmm7,xmm14
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm14
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm13,xmm15
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm13,xmm4
|
|
paddd xmm9,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm13,xmm5
|
|
paddd xmm13,xmm7
|
|
movd xmm5,DWORD[12+r8]
|
|
movd xmm0,DWORD[12+r9]
|
|
movd xmm1,DWORD[12+r10]
|
|
movd xmm2,DWORD[12+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm9
|
|
|
|
movdqa xmm2,xmm9
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm9
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(48-128)+rax],xmm5
|
|
paddd xmm5,xmm12
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-32))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm9
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm9
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm11
|
|
pand xmm4,xmm10
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm13
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm13
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm14
|
|
movdqa xmm7,xmm13
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm13
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm12,xmm14
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm12,xmm3
|
|
paddd xmm8,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm12,xmm5
|
|
paddd xmm12,xmm7
|
|
movd xmm5,DWORD[16+r8]
|
|
movd xmm0,DWORD[16+r9]
|
|
movd xmm1,DWORD[16+r10]
|
|
movd xmm2,DWORD[16+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm8
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm8
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm8
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(64-128)+rax],xmm5
|
|
paddd xmm5,xmm11
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm8
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm8
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm10
|
|
pand xmm3,xmm9
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm12
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm12
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm13
|
|
movdqa xmm7,xmm12
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm12
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm11,xmm13
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm11,xmm4
|
|
paddd xmm15,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm11,xmm5
|
|
paddd xmm11,xmm7
|
|
movd xmm5,DWORD[20+r8]
|
|
movd xmm0,DWORD[20+r9]
|
|
movd xmm1,DWORD[20+r10]
|
|
movd xmm2,DWORD[20+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm15
|
|
|
|
movdqa xmm2,xmm15
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm15
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(80-128)+rax],xmm5
|
|
paddd xmm5,xmm10
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[32+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm15
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm15
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm9
|
|
pand xmm4,xmm8
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm11
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm11
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm12
|
|
movdqa xmm7,xmm11
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm11
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm10,xmm12
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm10,xmm3
|
|
paddd xmm14,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm10,xmm5
|
|
paddd xmm10,xmm7
|
|
movd xmm5,DWORD[24+r8]
|
|
movd xmm0,DWORD[24+r9]
|
|
movd xmm1,DWORD[24+r10]
|
|
movd xmm2,DWORD[24+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm14
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm14
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm14
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(96-128)+rax],xmm5
|
|
paddd xmm5,xmm9
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[64+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm14
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm14
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm8
|
|
pand xmm3,xmm15
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm10
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm10
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm11
|
|
movdqa xmm7,xmm10
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm10
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm9,xmm11
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm9,xmm4
|
|
paddd xmm13,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm9,xmm5
|
|
paddd xmm9,xmm7
|
|
movd xmm5,DWORD[28+r8]
|
|
movd xmm0,DWORD[28+r9]
|
|
movd xmm1,DWORD[28+r10]
|
|
movd xmm2,DWORD[28+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm13
|
|
|
|
movdqa xmm2,xmm13
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm13
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(112-128)+rax],xmm5
|
|
paddd xmm5,xmm8
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[96+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm13
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm13
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm15
|
|
pand xmm4,xmm14
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm9
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm9
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm10
|
|
movdqa xmm7,xmm9
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm9
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm8,xmm10
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm8,xmm3
|
|
paddd xmm12,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm8,xmm5
|
|
paddd xmm8,xmm7
|
|
lea rbp,[256+rbp]
|
|
movd xmm5,DWORD[32+r8]
|
|
movd xmm0,DWORD[32+r9]
|
|
movd xmm1,DWORD[32+r10]
|
|
movd xmm2,DWORD[32+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm12
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm12
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm12
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(128-128)+rax],xmm5
|
|
paddd xmm5,xmm15
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-128))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm12
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm12
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm14
|
|
pand xmm3,xmm13
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm8
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm8
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm9
|
|
movdqa xmm7,xmm8
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm8
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm15,xmm9
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm15,xmm4
|
|
paddd xmm11,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm15,xmm5
|
|
paddd xmm15,xmm7
|
|
movd xmm5,DWORD[36+r8]
|
|
movd xmm0,DWORD[36+r9]
|
|
movd xmm1,DWORD[36+r10]
|
|
movd xmm2,DWORD[36+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm11
|
|
|
|
movdqa xmm2,xmm11
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm11
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(144-128)+rax],xmm5
|
|
paddd xmm5,xmm14
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-96))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm11
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm11
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm13
|
|
pand xmm4,xmm12
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm15
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm15
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm8
|
|
movdqa xmm7,xmm15
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm15
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm14,xmm8
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm14,xmm3
|
|
paddd xmm10,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm14,xmm5
|
|
paddd xmm14,xmm7
|
|
movd xmm5,DWORD[40+r8]
|
|
movd xmm0,DWORD[40+r9]
|
|
movd xmm1,DWORD[40+r10]
|
|
movd xmm2,DWORD[40+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm10
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm10
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm10
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(160-128)+rax],xmm5
|
|
paddd xmm5,xmm13
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-64))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm10
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm10
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm12
|
|
pand xmm3,xmm11
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm14
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm14
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm15
|
|
movdqa xmm7,xmm14
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm14
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm13,xmm15
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm13,xmm4
|
|
paddd xmm9,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm13,xmm5
|
|
paddd xmm13,xmm7
|
|
movd xmm5,DWORD[44+r8]
|
|
movd xmm0,DWORD[44+r9]
|
|
movd xmm1,DWORD[44+r10]
|
|
movd xmm2,DWORD[44+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm9
|
|
|
|
movdqa xmm2,xmm9
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm9
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(176-128)+rax],xmm5
|
|
paddd xmm5,xmm12
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-32))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm9
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm9
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm11
|
|
pand xmm4,xmm10
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm13
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm13
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm14
|
|
movdqa xmm7,xmm13
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm13
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm12,xmm14
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm12,xmm3
|
|
paddd xmm8,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm12,xmm5
|
|
paddd xmm12,xmm7
|
|
movd xmm5,DWORD[48+r8]
|
|
movd xmm0,DWORD[48+r9]
|
|
movd xmm1,DWORD[48+r10]
|
|
movd xmm2,DWORD[48+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm8
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm8
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm8
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(192-128)+rax],xmm5
|
|
paddd xmm5,xmm11
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm8
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm8
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm10
|
|
pand xmm3,xmm9
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm12
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm12
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm13
|
|
movdqa xmm7,xmm12
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm12
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm11,xmm13
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm11,xmm4
|
|
paddd xmm15,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm11,xmm5
|
|
paddd xmm11,xmm7
|
|
movd xmm5,DWORD[52+r8]
|
|
movd xmm0,DWORD[52+r9]
|
|
movd xmm1,DWORD[52+r10]
|
|
movd xmm2,DWORD[52+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm15
|
|
|
|
movdqa xmm2,xmm15
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm15
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(208-128)+rax],xmm5
|
|
paddd xmm5,xmm10
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[32+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm15
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm15
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm9
|
|
pand xmm4,xmm8
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm11
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm11
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm12
|
|
movdqa xmm7,xmm11
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm11
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm10,xmm12
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm10,xmm3
|
|
paddd xmm14,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm10,xmm5
|
|
paddd xmm10,xmm7
|
|
movd xmm5,DWORD[56+r8]
|
|
movd xmm0,DWORD[56+r9]
|
|
movd xmm1,DWORD[56+r10]
|
|
movd xmm2,DWORD[56+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm14
|
|
DB 102,15,56,0,238
|
|
movdqa xmm2,xmm14
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm14
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(224-128)+rax],xmm5
|
|
paddd xmm5,xmm9
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[64+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm14
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm14
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm8
|
|
pand xmm3,xmm15
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm10
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm10
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm11
|
|
movdqa xmm7,xmm10
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm10
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm9,xmm11
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm9,xmm4
|
|
paddd xmm13,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm9,xmm5
|
|
paddd xmm9,xmm7
|
|
movd xmm5,DWORD[60+r8]
|
|
lea r8,[64+r8]
|
|
movd xmm0,DWORD[60+r9]
|
|
lea r9,[64+r9]
|
|
movd xmm1,DWORD[60+r10]
|
|
lea r10,[64+r10]
|
|
movd xmm2,DWORD[60+r11]
|
|
lea r11,[64+r11]
|
|
punpckldq xmm5,xmm1
|
|
punpckldq xmm0,xmm2
|
|
punpckldq xmm5,xmm0
|
|
movdqa xmm7,xmm13
|
|
|
|
movdqa xmm2,xmm13
|
|
DB 102,15,56,0,238
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm13
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(240-128)+rax],xmm5
|
|
paddd xmm5,xmm8
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[96+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm13
|
|
prefetcht0 [63+r8]
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm13
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm15
|
|
pand xmm4,xmm14
|
|
pxor xmm7,xmm1
|
|
|
|
prefetcht0 [63+r9]
|
|
movdqa xmm1,xmm9
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm9
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm10
|
|
movdqa xmm7,xmm9
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm9
|
|
|
|
prefetcht0 [63+r10]
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
prefetcht0 [63+r11]
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm8,xmm10
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm8,xmm3
|
|
paddd xmm12,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm8,xmm5
|
|
paddd xmm8,xmm7
|
|
lea rbp,[256+rbp]
|
|
movdqu xmm5,XMMWORD[((0-128))+rax]
|
|
mov ecx,3
|
|
jmp NEAR $L$oop_16_xx
|
|
ALIGN 32
|
|
$L$oop_16_xx:
|
|
movdqa xmm6,XMMWORD[((16-128))+rax]
|
|
paddd xmm5,XMMWORD[((144-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((224-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm12
|
|
|
|
movdqa xmm2,xmm12
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm12
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(0-128)+rax],xmm5
|
|
paddd xmm5,xmm15
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-128))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm12
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm12
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm14
|
|
pand xmm3,xmm13
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm8
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm8
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm9
|
|
movdqa xmm7,xmm8
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm8
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm15,xmm9
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm15,xmm4
|
|
paddd xmm11,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm15,xmm5
|
|
paddd xmm15,xmm7
|
|
movdqa xmm5,XMMWORD[((32-128))+rax]
|
|
paddd xmm6,XMMWORD[((160-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((240-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm11
|
|
|
|
movdqa xmm2,xmm11
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm11
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(16-128)+rax],xmm6
|
|
paddd xmm6,xmm14
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[((-96))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm11
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm11
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm13
|
|
pand xmm4,xmm12
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm15
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm15
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm8
|
|
movdqa xmm7,xmm15
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm15
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm14,xmm8
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm14,xmm3
|
|
paddd xmm10,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm14,xmm6
|
|
paddd xmm14,xmm7
|
|
movdqa xmm6,XMMWORD[((48-128))+rax]
|
|
paddd xmm5,XMMWORD[((176-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((0-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm10
|
|
|
|
movdqa xmm2,xmm10
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm10
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(32-128)+rax],xmm5
|
|
paddd xmm5,xmm13
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-64))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm10
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm10
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm12
|
|
pand xmm3,xmm11
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm14
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm14
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm15
|
|
movdqa xmm7,xmm14
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm14
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm13,xmm15
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm13,xmm4
|
|
paddd xmm9,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm13,xmm5
|
|
paddd xmm13,xmm7
|
|
movdqa xmm5,XMMWORD[((64-128))+rax]
|
|
paddd xmm6,XMMWORD[((192-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((16-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm9
|
|
|
|
movdqa xmm2,xmm9
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm9
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(48-128)+rax],xmm6
|
|
paddd xmm6,xmm12
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[((-32))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm9
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm9
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm11
|
|
pand xmm4,xmm10
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm13
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm13
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm14
|
|
movdqa xmm7,xmm13
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm13
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm12,xmm14
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm12,xmm3
|
|
paddd xmm8,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm12,xmm6
|
|
paddd xmm12,xmm7
|
|
movdqa xmm6,XMMWORD[((80-128))+rax]
|
|
paddd xmm5,XMMWORD[((208-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((32-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm8
|
|
|
|
movdqa xmm2,xmm8
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm8
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(64-128)+rax],xmm5
|
|
paddd xmm5,xmm11
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm8
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm8
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm10
|
|
pand xmm3,xmm9
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm12
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm12
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm13
|
|
movdqa xmm7,xmm12
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm12
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm11,xmm13
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm11,xmm4
|
|
paddd xmm15,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm11,xmm5
|
|
paddd xmm11,xmm7
|
|
movdqa xmm5,XMMWORD[((96-128))+rax]
|
|
paddd xmm6,XMMWORD[((224-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((48-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm15
|
|
|
|
movdqa xmm2,xmm15
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm15
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(80-128)+rax],xmm6
|
|
paddd xmm6,xmm10
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[32+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm15
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm15
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm9
|
|
pand xmm4,xmm8
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm11
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm11
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm12
|
|
movdqa xmm7,xmm11
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm11
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm10,xmm12
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm10,xmm3
|
|
paddd xmm14,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm10,xmm6
|
|
paddd xmm10,xmm7
|
|
movdqa xmm6,XMMWORD[((112-128))+rax]
|
|
paddd xmm5,XMMWORD[((240-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((64-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm14
|
|
|
|
movdqa xmm2,xmm14
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm14
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(96-128)+rax],xmm5
|
|
paddd xmm5,xmm9
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[64+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm14
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm14
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm8
|
|
pand xmm3,xmm15
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm10
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm10
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm11
|
|
movdqa xmm7,xmm10
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm10
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm9,xmm11
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm9,xmm4
|
|
paddd xmm13,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm9,xmm5
|
|
paddd xmm9,xmm7
|
|
movdqa xmm5,XMMWORD[((128-128))+rax]
|
|
paddd xmm6,XMMWORD[((0-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((80-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm13
|
|
|
|
movdqa xmm2,xmm13
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm13
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(112-128)+rax],xmm6
|
|
paddd xmm6,xmm8
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[96+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm13
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm13
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm15
|
|
pand xmm4,xmm14
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm9
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm9
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm10
|
|
movdqa xmm7,xmm9
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm9
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm8,xmm10
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm8,xmm3
|
|
paddd xmm12,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm8,xmm6
|
|
paddd xmm8,xmm7
|
|
lea rbp,[256+rbp]
|
|
movdqa xmm6,XMMWORD[((144-128))+rax]
|
|
paddd xmm5,XMMWORD[((16-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((96-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm12
|
|
|
|
movdqa xmm2,xmm12
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm12
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(128-128)+rax],xmm5
|
|
paddd xmm5,xmm15
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-128))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm12
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm12
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm14
|
|
pand xmm3,xmm13
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm8
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm8
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm9
|
|
movdqa xmm7,xmm8
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm8
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm15,xmm9
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm15,xmm4
|
|
paddd xmm11,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm15,xmm5
|
|
paddd xmm15,xmm7
|
|
movdqa xmm5,XMMWORD[((160-128))+rax]
|
|
paddd xmm6,XMMWORD[((32-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((112-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm11
|
|
|
|
movdqa xmm2,xmm11
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm11
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(144-128)+rax],xmm6
|
|
paddd xmm6,xmm14
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[((-96))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm11
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm11
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm13
|
|
pand xmm4,xmm12
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm15
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm15
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm8
|
|
movdqa xmm7,xmm15
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm15
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm14,xmm8
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm14,xmm3
|
|
paddd xmm10,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm14,xmm6
|
|
paddd xmm14,xmm7
|
|
movdqa xmm6,XMMWORD[((176-128))+rax]
|
|
paddd xmm5,XMMWORD[((48-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((128-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm10
|
|
|
|
movdqa xmm2,xmm10
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm10
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(160-128)+rax],xmm5
|
|
paddd xmm5,xmm13
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[((-64))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm10
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm10
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm12
|
|
pand xmm3,xmm11
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm14
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm14
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm15
|
|
movdqa xmm7,xmm14
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm14
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm13,xmm15
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm13,xmm4
|
|
paddd xmm9,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm13,xmm5
|
|
paddd xmm13,xmm7
|
|
movdqa xmm5,XMMWORD[((192-128))+rax]
|
|
paddd xmm6,XMMWORD[((64-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((144-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm9
|
|
|
|
movdqa xmm2,xmm9
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm9
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(176-128)+rax],xmm6
|
|
paddd xmm6,xmm12
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[((-32))+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm9
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm9
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm11
|
|
pand xmm4,xmm10
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm13
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm13
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm14
|
|
movdqa xmm7,xmm13
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm13
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm12,xmm14
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm12,xmm3
|
|
paddd xmm8,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm12,xmm6
|
|
paddd xmm12,xmm7
|
|
movdqa xmm6,XMMWORD[((208-128))+rax]
|
|
paddd xmm5,XMMWORD[((80-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((160-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm8
|
|
|
|
movdqa xmm2,xmm8
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm8
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(192-128)+rax],xmm5
|
|
paddd xmm5,xmm11
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm8
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm8
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm10
|
|
pand xmm3,xmm9
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm12
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm12
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm13
|
|
movdqa xmm7,xmm12
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm12
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm11,xmm13
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm11,xmm4
|
|
paddd xmm15,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm11,xmm5
|
|
paddd xmm11,xmm7
|
|
movdqa xmm5,XMMWORD[((224-128))+rax]
|
|
paddd xmm6,XMMWORD[((96-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((176-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm15
|
|
|
|
movdqa xmm2,xmm15
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm15
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(208-128)+rax],xmm6
|
|
paddd xmm6,xmm10
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[32+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm15
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm15
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm9
|
|
pand xmm4,xmm8
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm11
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm11
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm12
|
|
movdqa xmm7,xmm11
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm11
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm10,xmm12
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm10,xmm3
|
|
paddd xmm14,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm10,xmm6
|
|
paddd xmm10,xmm7
|
|
movdqa xmm6,XMMWORD[((240-128))+rax]
|
|
paddd xmm5,XMMWORD[((112-128))+rax]
|
|
|
|
movdqa xmm7,xmm6
|
|
movdqa xmm1,xmm6
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm6
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((192-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm3,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm3
|
|
|
|
psrld xmm3,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
psrld xmm3,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm3
|
|
pxor xmm0,xmm1
|
|
paddd xmm5,xmm0
|
|
movdqa xmm7,xmm14
|
|
|
|
movdqa xmm2,xmm14
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm14
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(224-128)+rax],xmm5
|
|
paddd xmm5,xmm9
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm5,XMMWORD[64+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm14
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm3,xmm14
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm8
|
|
pand xmm3,xmm15
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm10
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm10
|
|
psrld xmm1,2
|
|
paddd xmm5,xmm7
|
|
pxor xmm0,xmm3
|
|
movdqa xmm3,xmm11
|
|
movdqa xmm7,xmm10
|
|
pslld xmm2,10
|
|
pxor xmm3,xmm10
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm5,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm4,xmm3
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm9,xmm11
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm9,xmm4
|
|
paddd xmm13,xmm5
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm9,xmm5
|
|
paddd xmm9,xmm7
|
|
movdqa xmm5,XMMWORD[((0-128))+rax]
|
|
paddd xmm6,XMMWORD[((128-128))+rax]
|
|
|
|
movdqa xmm7,xmm5
|
|
movdqa xmm1,xmm5
|
|
psrld xmm7,3
|
|
movdqa xmm2,xmm5
|
|
|
|
psrld xmm1,7
|
|
movdqa xmm0,XMMWORD[((208-128))+rax]
|
|
pslld xmm2,14
|
|
pxor xmm7,xmm1
|
|
psrld xmm1,18-7
|
|
movdqa xmm4,xmm0
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,25-14
|
|
pxor xmm7,xmm1
|
|
psrld xmm0,10
|
|
movdqa xmm1,xmm4
|
|
|
|
psrld xmm4,17
|
|
pxor xmm7,xmm2
|
|
pslld xmm1,13
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
psrld xmm4,19-17
|
|
pxor xmm0,xmm1
|
|
pslld xmm1,15-13
|
|
pxor xmm0,xmm4
|
|
pxor xmm0,xmm1
|
|
paddd xmm6,xmm0
|
|
movdqa xmm7,xmm13
|
|
|
|
movdqa xmm2,xmm13
|
|
|
|
psrld xmm7,6
|
|
movdqa xmm1,xmm13
|
|
pslld xmm2,7
|
|
movdqa XMMWORD[(240-128)+rax],xmm6
|
|
paddd xmm6,xmm8
|
|
|
|
psrld xmm1,11
|
|
pxor xmm7,xmm2
|
|
pslld xmm2,21-7
|
|
paddd xmm6,XMMWORD[96+rbp]
|
|
pxor xmm7,xmm1
|
|
|
|
psrld xmm1,25-11
|
|
movdqa xmm0,xmm13
|
|
|
|
pxor xmm7,xmm2
|
|
movdqa xmm4,xmm13
|
|
pslld xmm2,26-21
|
|
pandn xmm0,xmm15
|
|
pand xmm4,xmm14
|
|
pxor xmm7,xmm1
|
|
|
|
|
|
movdqa xmm1,xmm9
|
|
pxor xmm7,xmm2
|
|
movdqa xmm2,xmm9
|
|
psrld xmm1,2
|
|
paddd xmm6,xmm7
|
|
pxor xmm0,xmm4
|
|
movdqa xmm4,xmm10
|
|
movdqa xmm7,xmm9
|
|
pslld xmm2,10
|
|
pxor xmm4,xmm9
|
|
|
|
|
|
psrld xmm7,13
|
|
pxor xmm1,xmm2
|
|
paddd xmm6,xmm0
|
|
pslld xmm2,19-10
|
|
pand xmm3,xmm4
|
|
pxor xmm1,xmm7
|
|
|
|
|
|
psrld xmm7,22-13
|
|
pxor xmm1,xmm2
|
|
movdqa xmm8,xmm10
|
|
pslld xmm2,30-19
|
|
pxor xmm7,xmm1
|
|
pxor xmm8,xmm3
|
|
paddd xmm12,xmm6
|
|
pxor xmm7,xmm2
|
|
|
|
paddd xmm8,xmm6
|
|
paddd xmm8,xmm7
|
|
lea rbp,[256+rbp]
|
|
dec ecx
|
|
jnz NEAR $L$oop_16_xx
|
|
|
|
mov ecx,1
|
|
lea rbp,[((K256+128))]
|
|
|
|
movdqa xmm7,XMMWORD[rbx]
|
|
cmp ecx,DWORD[rbx]
|
|
pxor xmm0,xmm0
|
|
cmovge r8,rbp
|
|
cmp ecx,DWORD[4+rbx]
|
|
movdqa xmm6,xmm7
|
|
cmovge r9,rbp
|
|
cmp ecx,DWORD[8+rbx]
|
|
pcmpgtd xmm6,xmm0
|
|
cmovge r10,rbp
|
|
cmp ecx,DWORD[12+rbx]
|
|
paddd xmm7,xmm6
|
|
cmovge r11,rbp
|
|
|
|
movdqu xmm0,XMMWORD[((0-128))+rdi]
|
|
pand xmm8,xmm6
|
|
movdqu xmm1,XMMWORD[((32-128))+rdi]
|
|
pand xmm9,xmm6
|
|
movdqu xmm2,XMMWORD[((64-128))+rdi]
|
|
pand xmm10,xmm6
|
|
movdqu xmm5,XMMWORD[((96-128))+rdi]
|
|
pand xmm11,xmm6
|
|
paddd xmm8,xmm0
|
|
movdqu xmm0,XMMWORD[((128-128))+rdi]
|
|
pand xmm12,xmm6
|
|
paddd xmm9,xmm1
|
|
movdqu xmm1,XMMWORD[((160-128))+rdi]
|
|
pand xmm13,xmm6
|
|
paddd xmm10,xmm2
|
|
movdqu xmm2,XMMWORD[((192-128))+rdi]
|
|
pand xmm14,xmm6
|
|
paddd xmm11,xmm5
|
|
movdqu xmm5,XMMWORD[((224-128))+rdi]
|
|
pand xmm15,xmm6
|
|
paddd xmm12,xmm0
|
|
paddd xmm13,xmm1
|
|
movdqu XMMWORD[(0-128)+rdi],xmm8
|
|
paddd xmm14,xmm2
|
|
movdqu XMMWORD[(32-128)+rdi],xmm9
|
|
paddd xmm15,xmm5
|
|
movdqu XMMWORD[(64-128)+rdi],xmm10
|
|
movdqu XMMWORD[(96-128)+rdi],xmm11
|
|
movdqu XMMWORD[(128-128)+rdi],xmm12
|
|
movdqu XMMWORD[(160-128)+rdi],xmm13
|
|
movdqu XMMWORD[(192-128)+rdi],xmm14
|
|
movdqu XMMWORD[(224-128)+rdi],xmm15
|
|
|
|
movdqa XMMWORD[rbx],xmm7
|
|
movdqa xmm6,XMMWORD[$L$pbswap]
|
|
dec edx
|
|
jnz NEAR $L$oop
|
|
|
|
mov edx,DWORD[280+rsp]
|
|
lea rdi,[16+rdi]
|
|
lea rsi,[64+rsi]
|
|
dec edx
|
|
jnz NEAR $L$oop_grande
|
|
|
|
$L$done:
|
|
mov rax,QWORD[272+rsp]
|
|
|
|
movaps xmm6,XMMWORD[((-184))+rax]
|
|
movaps xmm7,XMMWORD[((-168))+rax]
|
|
movaps xmm8,XMMWORD[((-152))+rax]
|
|
movaps xmm9,XMMWORD[((-136))+rax]
|
|
movaps xmm10,XMMWORD[((-120))+rax]
|
|
movaps xmm11,XMMWORD[((-104))+rax]
|
|
movaps xmm12,XMMWORD[((-88))+rax]
|
|
movaps xmm13,XMMWORD[((-72))+rax]
|
|
movaps xmm14,XMMWORD[((-56))+rax]
|
|
movaps xmm15,XMMWORD[((-40))+rax]
|
|
mov rbp,QWORD[((-16))+rax]
|
|
|
|
mov rbx,QWORD[((-8))+rax]
|
|
|
|
lea rsp,[rax]
|
|
|
|
$L$epilogue:
|
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
$L$SEH_end_sha256_multi_block:
|
|
|
|
ALIGN 32
|
|
; sha256_multi_block_shaext -- Win64 entry of the two-lane SHA-extension path.
; In (Win64): rcx, rdx, r8 = the three arguments. They are copied into
;             rdi, rsi, rdx, which is what the body below works with:
;               rdi = state block (rows of dwords, 32 bytes apart)
;               rsi = array of 16-byte descriptors {qword ptr, dword count}
;               edx = outer-loop count
; _shaext_shortcut is a second way in: sha256_multi_block jumps here after it
; has already spilled rdi/rsi and shuffled the arguments itself.
; Frame (rax = rsp on entry; rax is never written again in this routine):
;   [rax-8]=rbx  [rax-16]=rbp  [rax-184 .. rax-24]=xmm6..xmm15 (16 bytes each)
;   below that a 256-byte-aligned scratch area addressed from rsp:
;   [64..127+rsp] saved state, [256+rsp] per-lane counters (rbx),
;   [272+rsp] entry rsp, [280+rsp] outer-loop counter.
sha256_multi_block_shaext:
|
|
; rdi/rsi are callee-saved on Win64: park them in the caller's home slots.
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
|
mov QWORD[16+rsp],rsi
|
|
mov rax,rsp
|
|
$L$SEH_begin_sha256_multi_block_shaext:
|
|
; Win64 argument registers -> registers used by the body.
mov rdi,rcx
|
|
mov rsi,rdx
|
|
mov rdx,r8
|
|
|
|
|
|
|
|
_shaext_shortcut:
|
|
; rax = entry rsp; every save slot and the epilogue are addressed from it.
mov rax,rsp
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
; 168 bytes: ten 16-byte xmm slots plus 8 bytes of padding. rsp is now
; rax-184, so the first four saves (rsp-relative) and the remaining six
; (rax-relative) form one contiguous xmm6..xmm15 area.
lea rsp,[((-168))+rsp]
|
|
movaps XMMWORD[rsp],xmm6
|
|
movaps XMMWORD[16+rsp],xmm7
|
|
movaps XMMWORD[32+rsp],xmm8
|
|
movaps XMMWORD[48+rsp],xmm9
|
|
movaps XMMWORD[(-120)+rax],xmm10
|
|
movaps XMMWORD[(-104)+rax],xmm11
|
|
movaps XMMWORD[(-88)+rax],xmm12
|
|
movaps XMMWORD[(-72)+rax],xmm13
|
|
movaps XMMWORD[(-56)+rax],xmm14
|
|
movaps XMMWORD[(-40)+rax],xmm15
|
|
; Scratch area; aligned to 256 bytes two instructions further down.
sub rsp,288
|
|
; Double the outer-loop count: each pass of $L$oop_grande_shaext consumes
; two descriptors (rsi += 32) where the generic path consumes four (rsi += 64).
shl edx,1
|
|
and rsp,-256
|
|
; Bias the state pointer so rows are addressed as [(off-128)+rdi].
lea rdi,[128+rdi]
|
|
; Record the entry rsp in the frame. This path never reloads it (rax stays
; live); NOTE(review): presumably kept for the SEH unwind handler -- confirm.
mov QWORD[272+rsp],rax
|
|
$L$body_shaext:
|
|
; rbx -> the two per-lane block counters (dwords at [rbx] and [4+rbx]).
lea rbx,[256+rsp]
|
|
; rbp -> constant table biased by +128, matching the [(16*i-128)+rbp] loads.
lea rbp,[((K256_shaext+128))]
|
|
|
|
; $L$oop_grande_shaext -- top of the outer loop: pick up two descriptors,
; record their block counts, and load both lanes' state into registers.
; In:  rsi -> two 16-byte descriptors {qword data ptr, dword block count}
;      rdi -> state rows (biased by +128), edx = outer iterations left
;      rbx -> counter pair in the frame
; Out: r8/r9 = data pointers of lane 0 / lane 1 (rsp if the lane is idle)
;      edx   = largest block count of the two lanes (inner-loop trip count)
;      xmm12/xmm13 = lane 0 state, xmm14/xmm15 = lane 1 state
;      xmm3  = pshufb mask from K256_shaext-16
$L$oop_grande_shaext:
|
|
; Park the outer counter; edx is reused as the running maximum count.
mov DWORD[280+rsp],edx
|
|
xor edx,edx
|
|
|
|
; ---- lane 0 descriptor ----
mov r8,QWORD[rsi]
|
|
|
|
mov ecx,DWORD[8+rsi]
|
|
cmp ecx,edx
|
|
; Signed max: edx = max(edx, count).
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[rbx],ecx
|
|
; count <= 0: point the lane at the stack so the unconditional loads in the
; round loop still read valid memory.
cmovle r8,rsp
|
|
|
|
; ---- lane 1 descriptor ----
mov r9,QWORD[16+rsi]
|
|
|
|
mov ecx,DWORD[24+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[4+rbx],ecx
|
|
cmovle r9,rsp
|
|
; Neither lane has a positive count: leave through $L$done_shaext (this also
; abandons any remaining outer iterations).
test edx,edx
|
|
jz NEAR $L$done_shaext
|
|
|
|
; Eight state rows, 32 bytes apart. Each movq fetches two adjacent dwords:
; the same state word for lane 0 (low dword) and lane 1 (high dword).
movq xmm12,QWORD[((0-128))+rdi]
|
|
movq xmm4,QWORD[((32-128))+rdi]
|
|
movq xmm13,QWORD[((64-128))+rdi]
|
|
movq xmm5,QWORD[((96-128))+rdi]
|
|
movq xmm8,QWORD[((128-128))+rdi]
|
|
movq xmm9,QWORD[((160-128))+rdi]
|
|
movq xmm10,QWORD[((192-128))+rdi]
|
|
movq xmm11,QWORD[((224-128))+rdi]
|
|
|
|
; Interleave row pairs: xmm12 = {r0.l0, r1.l0, r0.l1, r1.l1}; likewise
; xmm13 = rows 2/3, xmm8 = rows 4/5, xmm10 = rows 6/7.
punpckldq xmm12,xmm4
|
|
punpckldq xmm13,xmm5
|
|
punpckldq xmm8,xmm9
|
|
punpckldq xmm10,xmm11
|
|
; Shuffle mask used by the pshufb instructions at the top of the round loop.
; NOTE(review): presumably a big-endian byte-swap mask; table not visible here.
movdqa xmm3,XMMWORD[((K256_shaext-16))]
|
|
|
|
; Split by lane: low qwords -> lane 0, high qwords -> lane 1.
;   xmm12 = lane0 {r0,r1,r4,r5}   xmm13 = lane0 {r2,r3,r6,r7}
;   xmm14 = lane1 {r0,r1,r4,r5}   xmm15 = lane1 {r2,r3,r6,r7}
movdqa xmm14,xmm12
|
|
movdqa xmm15,xmm13
|
|
punpcklqdq xmm12,xmm8
|
|
punpcklqdq xmm13,xmm10
|
|
punpckhqdq xmm14,xmm8
|
|
punpckhqdq xmm15,xmm10
|
|
|
|
; Immediate 27 (0x1B) reverses the four dwords: {r5,r4,r1,r0} / {r7,r6,r3,r2},
; the operand layout the sha256rnds2 instructions in the loop work on.
pshufd xmm12,xmm12,27
|
|
pshufd xmm13,xmm13,27
|
|
pshufd xmm14,xmm14,27
|
|
pshufd xmm15,xmm15,27
|
|
jmp NEAR $L$oop_shaext
|
|
|
|
ALIGN 32
|
|
; $L$oop_shaext -- one 64-byte block for each of the two lanes, 64 rounds,
; with the two lanes' instruction streams interleaved.
; Registers:
;   r8 / r9         input pointers of lane 0 / lane 1 (advanced by 64)
;   xmm4..xmm7      lane 0 message-schedule words (4 dwords each)
;   xmm8..xmm11     lane 1 message-schedule words
;   xmm12,xmm13     lane 0 state        xmm14,xmm15  lane 1 state
;   xmm0            implicit K+W operand of sha256rnds2
;   xmm1 / xmm2     K+W for lane 0 / lane 1 (high half is moved down with
;                   pshufd ..,0x0e for the second pair of rounds)
;   xmm3            pshufb mask on entry, then palignr scratch; reloaded at the end
;   rbp             K256_shaext+128; row i is read at [(16*i-128)+rbp]
;   edx             blocks left in this pass, rbx -> per-lane counters
; Hand-encoded instructions (bytes are decimal):
;   DB 102,[68,]15,56,0,mm       pshufb     xmmN,xmm3
;   DB 69,15,56,203,mm           sha256rnds2 (xmm0 implicit):
;        236 = xmm13,xmm12   229 = xmm12,xmm13   (lane 0)
;        254 = xmm15,xmm14   247 = xmm14,xmm15   (lane 1)
;   DB [69,]15,56,204,mm         sha256msg1 (lane 0 without / lane 1 with prefix 69)
;   DB [69,]15,56,205,mm         sha256msg2
;   DB 102,[65,]15,58,15,mm,4    palignr    xmm3,xmmN,4
$L$oop_shaext:
|
|
; Load 64 input bytes per lane; the pshufb's on xmm4..xmm11 are spread over
; the first rounds.
movdqu xmm4,XMMWORD[r8]
|
|
movdqu xmm8,XMMWORD[r9]
|
|
movdqu xmm5,XMMWORD[16+r8]
|
|
movdqu xmm9,XMMWORD[16+r9]
|
|
movdqu xmm6,XMMWORD[32+r8]
|
|
DB 102,15,56,0,227
|
|
movdqu xmm10,XMMWORD[32+r9]
|
|
DB 102,68,15,56,0,195
|
|
movdqu xmm7,XMMWORD[48+r8]
|
|
lea r8,[64+r8]
|
|
movdqu xmm11,XMMWORD[48+r9]
|
|
lea r9,[64+r9]
|
|
|
|
; ---- table row 0 ----
movdqa xmm0,XMMWORD[((0-128))+rbp]
|
|
DB 102,15,56,0,235
|
|
paddd xmm0,xmm4
|
|
; xmm4 is XORed with xmm12 here and again further down, before xmm12 is
; modified, so the pair cancels and xmm4 is unchanged. Same for xmm8/xmm14.
; NOTE(review): purpose not evident from this file.
pxor xmm4,xmm12
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,XMMWORD[((0-128))+rbp]
|
|
DB 102,68,15,56,0,203
|
|
paddd xmm2,xmm8
|
|
; Keep a copy of the incoming state at [64..127+rsp]; it is added back after
; the last round.
movdqa XMMWORD[80+rsp],xmm13
|
|
DB 69,15,56,203,236
|
|
pxor xmm8,xmm14
|
|
movdqa xmm0,xmm2
|
|
movdqa XMMWORD[112+rsp],xmm15
|
|
DB 69,15,56,203,254
|
|
pshufd xmm0,xmm1,0x0e
|
|
pxor xmm4,xmm12
|
|
movdqa XMMWORD[64+rsp],xmm12
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
pxor xmm8,xmm14
|
|
movdqa XMMWORD[96+rsp],xmm14
|
|
; ---- table row 1 ----
movdqa xmm1,XMMWORD[((16-128))+rbp]
|
|
paddd xmm1,xmm5
|
|
DB 102,15,56,0,243
|
|
DB 69,15,56,203,247
|
|
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((16-128))+rbp]
|
|
paddd xmm2,xmm9
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
; Prefetch ahead in both input streams.
prefetcht0 [127+r8]
|
|
DB 102,15,56,0,251
|
|
DB 102,68,15,56,0,211
|
|
prefetcht0 [127+r9]
|
|
DB 69,15,56,203,254
|
|
pshufd xmm0,xmm1,0x0e
|
|
DB 102,68,15,56,0,219
|
|
DB 15,56,204,229
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 2 ----
movdqa xmm1,XMMWORD[((32-128))+rbp]
|
|
paddd xmm1,xmm6
|
|
DB 69,15,56,203,247
|
|
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((32-128))+rbp]
|
|
paddd xmm2,xmm10
|
|
DB 69,15,56,203,236
|
|
DB 69,15,56,204,193
|
|
movdqa xmm0,xmm2
|
|
; Schedule step from here on (per lane): xmm3 = palignr(newest, previous, 4),
; add it to the sha256msg1 result, then finish with sha256msg2.
movdqa xmm3,xmm7
|
|
DB 69,15,56,203,254
|
|
pshufd xmm0,xmm1,0x0e
|
|
DB 102,15,58,15,222,4
|
|
paddd xmm4,xmm3
|
|
movdqa xmm3,xmm11
|
|
DB 102,65,15,58,15,218,4
|
|
DB 15,56,204,238
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 3 ----
movdqa xmm1,XMMWORD[((48-128))+rbp]
|
|
paddd xmm1,xmm7
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,202
|
|
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((48-128))+rbp]
|
|
paddd xmm8,xmm3
|
|
paddd xmm2,xmm11
|
|
DB 15,56,205,231
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm4
|
|
DB 102,15,58,15,223,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,195
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm5,xmm3
|
|
movdqa xmm3,xmm8
|
|
DB 102,65,15,58,15,219,4
|
|
DB 15,56,204,247
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 4 ----
movdqa xmm1,XMMWORD[((64-128))+rbp]
|
|
paddd xmm1,xmm4
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,211
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((64-128))+rbp]
|
|
paddd xmm9,xmm3
|
|
paddd xmm2,xmm8
|
|
DB 15,56,205,236
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm5
|
|
DB 102,15,58,15,220,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,200
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm6,xmm3
|
|
movdqa xmm3,xmm9
|
|
DB 102,65,15,58,15,216,4
|
|
DB 15,56,204,252
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 5 ----
movdqa xmm1,XMMWORD[((80-128))+rbp]
|
|
paddd xmm1,xmm5
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,216
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((80-128))+rbp]
|
|
paddd xmm10,xmm3
|
|
paddd xmm2,xmm9
|
|
DB 15,56,205,245
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm6
|
|
DB 102,15,58,15,221,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,209
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm7,xmm3
|
|
movdqa xmm3,xmm10
|
|
DB 102,65,15,58,15,217,4
|
|
DB 15,56,204,229
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 6 ----
movdqa xmm1,XMMWORD[((96-128))+rbp]
|
|
paddd xmm1,xmm6
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,193
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((96-128))+rbp]
|
|
paddd xmm11,xmm3
|
|
paddd xmm2,xmm10
|
|
DB 15,56,205,254
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm7
|
|
DB 102,15,58,15,222,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,218
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm4,xmm3
|
|
movdqa xmm3,xmm11
|
|
DB 102,65,15,58,15,218,4
|
|
DB 15,56,204,238
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 7 ----
movdqa xmm1,XMMWORD[((112-128))+rbp]
|
|
paddd xmm1,xmm7
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,202
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((112-128))+rbp]
|
|
paddd xmm8,xmm3
|
|
paddd xmm2,xmm11
|
|
DB 15,56,205,231
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm4
|
|
DB 102,15,58,15,223,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,195
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm5,xmm3
|
|
movdqa xmm3,xmm8
|
|
DB 102,65,15,58,15,219,4
|
|
DB 15,56,204,247
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 8 ----
movdqa xmm1,XMMWORD[((128-128))+rbp]
|
|
paddd xmm1,xmm4
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,211
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((128-128))+rbp]
|
|
paddd xmm9,xmm3
|
|
paddd xmm2,xmm8
|
|
DB 15,56,205,236
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm5
|
|
DB 102,15,58,15,220,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,200
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm6,xmm3
|
|
movdqa xmm3,xmm9
|
|
DB 102,65,15,58,15,216,4
|
|
DB 15,56,204,252
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 9 ----
movdqa xmm1,XMMWORD[((144-128))+rbp]
|
|
paddd xmm1,xmm5
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,216
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((144-128))+rbp]
|
|
paddd xmm10,xmm3
|
|
paddd xmm2,xmm9
|
|
DB 15,56,205,245
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm6
|
|
DB 102,15,58,15,221,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,209
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm7,xmm3
|
|
movdqa xmm3,xmm10
|
|
DB 102,65,15,58,15,217,4
|
|
DB 15,56,204,229
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 10 ----
movdqa xmm1,XMMWORD[((160-128))+rbp]
|
|
paddd xmm1,xmm6
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,193
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((160-128))+rbp]
|
|
paddd xmm11,xmm3
|
|
paddd xmm2,xmm10
|
|
DB 15,56,205,254
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm7
|
|
DB 102,15,58,15,222,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,218
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm4,xmm3
|
|
movdqa xmm3,xmm11
|
|
DB 102,65,15,58,15,218,4
|
|
DB 15,56,204,238
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 11 ----
movdqa xmm1,XMMWORD[((176-128))+rbp]
|
|
paddd xmm1,xmm7
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,202
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((176-128))+rbp]
|
|
paddd xmm8,xmm3
|
|
paddd xmm2,xmm11
|
|
DB 15,56,205,231
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm4
|
|
DB 102,15,58,15,223,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,195
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm5,xmm3
|
|
movdqa xmm3,xmm8
|
|
DB 102,65,15,58,15,219,4
|
|
DB 15,56,204,247
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 12 ----
movdqa xmm1,XMMWORD[((192-128))+rbp]
|
|
paddd xmm1,xmm4
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,211
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((192-128))+rbp]
|
|
paddd xmm9,xmm3
|
|
paddd xmm2,xmm8
|
|
DB 15,56,205,236
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm5
|
|
DB 102,15,58,15,220,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,200
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm6,xmm3
|
|
movdqa xmm3,xmm9
|
|
DB 102,65,15,58,15,216,4
|
|
DB 15,56,204,252
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 13 ----
movdqa xmm1,XMMWORD[((208-128))+rbp]
|
|
paddd xmm1,xmm5
|
|
DB 69,15,56,203,247
|
|
DB 69,15,56,204,216
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((208-128))+rbp]
|
|
paddd xmm10,xmm3
|
|
paddd xmm2,xmm9
|
|
DB 15,56,205,245
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
movdqa xmm3,xmm6
|
|
DB 102,15,58,15,221,4
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,209
|
|
pshufd xmm0,xmm1,0x0e
|
|
paddd xmm7,xmm3
|
|
movdqa xmm3,xmm10
|
|
DB 102,65,15,58,15,217,4
|
|
; From here on no further sha256msg1 is issued; a nop sits in the slot
; where earlier groups had one.
nop
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; ---- table row 14 ----
movdqa xmm1,XMMWORD[((224-128))+rbp]
|
|
paddd xmm1,xmm6
|
|
DB 69,15,56,203,247
|
|
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm2,XMMWORD[((224-128))+rbp]
|
|
paddd xmm11,xmm3
|
|
paddd xmm2,xmm10
|
|
DB 15,56,205,254
|
|
nop
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
; Loop bookkeeping is folded into the last rounds: ecx = 1 for the
; "was this the lane's last block" compares, xmm6 = 0 for the mask compares.
mov ecx,1
|
|
pxor xmm6,xmm6
|
|
DB 69,15,56,203,254
|
|
DB 69,15,56,205,218
|
|
pshufd xmm0,xmm1,0x0e
|
|
; ---- table row 15 (last) ----
movdqa xmm1,XMMWORD[((240-128))+rbp]
|
|
paddd xmm1,xmm7
|
|
; xmm7 is free now: load both lane counters {cnt0, cnt1}.
movq xmm7,QWORD[rbx]
|
|
nop
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
movdqa xmm2,XMMWORD[((240-128))+rbp]
|
|
paddd xmm2,xmm11
|
|
DB 69,15,56,203,247
|
|
|
|
movdqa xmm0,xmm1
|
|
; A lane whose counter is <= 1 has no further block: redirect its pointer
; to the stack so the next iteration's loads stay in valid memory.
cmp ecx,DWORD[rbx]
|
|
cmovge r8,rsp
|
|
cmp ecx,DWORD[4+rbx]
|
|
cmovge r9,rsp
|
|
; xmm9 = cnt0 broadcast, xmm10 = cnt1 broadcast; turned into all-ones/zero
; masks (count > 0) by the pcmpgtd's below.
pshufd xmm9,xmm7,0x00
|
|
DB 69,15,56,203,236
|
|
movdqa xmm0,xmm2
|
|
pshufd xmm10,xmm7,0x55
|
|
movdqa xmm11,xmm7
|
|
DB 69,15,56,203,254
|
|
pshufd xmm0,xmm1,0x0e
|
|
pcmpgtd xmm9,xmm6
|
|
pcmpgtd xmm10,xmm6
|
|
DB 69,15,56,203,229
|
|
pshufd xmm0,xmm2,0x0e
|
|
; xmm11 = per-counter mask: -1 where the counter is > 0, else 0.
pcmpgtd xmm11,xmm6
|
|
; xmm3 was used as scratch; restore the pshufb mask for the next block.
movdqa xmm3,XMMWORD[((K256_shaext-16))]
|
|
DB 69,15,56,203,247
|
|
|
|
; Zero the freshly computed state of a lane that had no block to process...
pand xmm13,xmm9
|
|
pand xmm15,xmm10
|
|
pand xmm12,xmm9
|
|
pand xmm14,xmm10
|
|
; Counters plus mask: each positive counter drops by one, others stay put.
paddd xmm11,xmm7
|
|
|
|
; ...then add the state saved at the top of the block: active lanes get
; old + new, idle lanes keep their old state unchanged.
paddd xmm13,XMMWORD[80+rsp]
|
|
paddd xmm15,XMMWORD[112+rsp]
|
|
paddd xmm12,XMMWORD[64+rsp]
|
|
paddd xmm14,XMMWORD[96+rsp]
|
|
|
|
movq QWORD[rbx],xmm11
|
|
; edx = largest block count found by $L$oop_grande_shaext.
dec edx
|
|
jnz NEAR $L$oop_shaext
|
|
|
|
; Tail of the outer loop, followed by the common exit ($L$done_shaext).
; On arrival from the round loop: xmm12/xmm13 = lane 0 state, xmm14/xmm15 =
; lane 1 state, both in the dword-reversed layout set up before the loop;
; rdi -> state rows (biased by +128); rax = entry rsp.
; Reload the outer-loop counter parked at the top of $L$oop_grande_shaext.
mov edx,DWORD[280+rsp]
|
|
|
|
; Undo the dword reversal: back to {r0,r1,r4,r5} / {r2,r3,r6,r7} per lane.
pshufd xmm12,xmm12,27
|
|
pshufd xmm13,xmm13,27
|
|
pshufd xmm14,xmm14,27
|
|
pshufd xmm15,xmm15,27
|
|
|
|
; Re-interleave the lanes so each qword is {lane0, lane1} of one state row:
;   xmm12 = rows 0,1   xmm5 = rows 4,5   xmm13 = rows 2,3   xmm6 = rows 6,7
movdqa xmm5,xmm12
|
|
movdqa xmm6,xmm13
|
|
punpckldq xmm12,xmm14
|
|
punpckhdq xmm5,xmm14
|
|
punpckldq xmm13,xmm15
|
|
punpckhdq xmm6,xmm15
|
|
|
|
; Store the low qword, shift the high qword down (psrldq 8), store again.
; Row offsets mirror the loads before the round loop (0,32,...,224).
movq QWORD[(0-128)+rdi],xmm12
|
|
psrldq xmm12,8
|
|
movq QWORD[(128-128)+rdi],xmm5
|
|
psrldq xmm5,8
|
|
movq QWORD[(32-128)+rdi],xmm12
|
|
movq QWORD[(160-128)+rdi],xmm5
|
|
|
|
movq QWORD[(64-128)+rdi],xmm13
|
|
psrldq xmm13,8
|
|
movq QWORD[(192-128)+rdi],xmm6
|
|
psrldq xmm6,8
|
|
movq QWORD[(96-128)+rdi],xmm13
|
|
movq QWORD[(224-128)+rdi],xmm6
|
|
|
|
; Next pair of lanes: two dwords further in every state row, two 16-byte
; descriptors further in the input array.
lea rdi,[8+rdi]
|
|
lea rsi,[32+rsi]
|
|
dec edx
|
|
jnz NEAR $L$oop_grande_shaext
|
|
|
|
; $L$done_shaext -- restore callee-saved registers and unwind the frame.
; Unlike the generic path, rax is not reloaded from [272+rsp] first: nothing
; in this routine writes rax after _shaext_shortcut, so it still holds the
; entry rsp.
$L$done_shaext:
|
|
|
|
; xmm6..xmm15 come back from the contiguous save area [rax-184 .. rax-24].
movaps xmm6,XMMWORD[((-184))+rax]
|
|
movaps xmm7,XMMWORD[((-168))+rax]
|
|
movaps xmm8,XMMWORD[((-152))+rax]
|
|
movaps xmm9,XMMWORD[((-136))+rax]
|
|
movaps xmm10,XMMWORD[((-120))+rax]
|
|
movaps xmm11,XMMWORD[((-104))+rax]
|
|
movaps xmm12,XMMWORD[((-88))+rax]
|
|
movaps xmm13,XMMWORD[((-72))+rax]
|
|
movaps xmm14,XMMWORD[((-56))+rax]
|
|
movaps xmm15,XMMWORD[((-40))+rax]
|
|
; rbp and rbx were pushed right below the entry rsp.
mov rbp,QWORD[((-16))+rax]
|
|
|
|
mov rbx,QWORD[((-8))+rax]
|
|
|
|
; Drop the whole frame in one step: rsp = entry rsp.
lea rsp,[rax]
|
|
|
|
$L$epilogue_shaext:
|
|
; rdi/rsi were spilled to the caller's home slots by the entry code.
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
; Bytes F3 C3 = "rep ret".
DB 0F3h,0C3h ;repret
|
|
|
|
$L$SEH_end_sha256_multi_block_shaext:
|
|
|
|
ALIGN 32
|
|
sha256_multi_block_avx:
|
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
|
mov QWORD[16+rsp],rsi
|
|
mov rax,rsp
|
|
$L$SEH_begin_sha256_multi_block_avx:
|
|
mov rdi,rcx
|
|
mov rsi,rdx
|
|
mov rdx,r8
|
|
|
|
|
|
|
|
_avx_shortcut:
|
|
shr rcx,32
|
|
cmp edx,2
|
|
jb NEAR $L$avx
|
|
test ecx,32
|
|
jnz NEAR _avx2_shortcut
|
|
jmp NEAR $L$avx
|
|
ALIGN 32
|
|
$L$avx:
|
|
mov rax,rsp
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
lea rsp,[((-168))+rsp]
|
|
movaps XMMWORD[rsp],xmm6
|
|
movaps XMMWORD[16+rsp],xmm7
|
|
movaps XMMWORD[32+rsp],xmm8
|
|
movaps XMMWORD[48+rsp],xmm9
|
|
movaps XMMWORD[(-120)+rax],xmm10
|
|
movaps XMMWORD[(-104)+rax],xmm11
|
|
movaps XMMWORD[(-88)+rax],xmm12
|
|
movaps XMMWORD[(-72)+rax],xmm13
|
|
movaps XMMWORD[(-56)+rax],xmm14
|
|
movaps XMMWORD[(-40)+rax],xmm15
|
|
sub rsp,288
|
|
and rsp,-256
|
|
mov QWORD[272+rsp],rax
|
|
|
|
$L$body_avx:
|
|
lea rbp,[((K256+128))]
|
|
lea rbx,[256+rsp]
|
|
lea rdi,[128+rdi]
|
|
|
|
; $L$oop_grande_avx -- head of the outer loop: read four input descriptors,
; load the working state and enter the round loop.
;
; In:  rsi = array of four 16-byte descriptors, each read here as
;            { qword pointer at +0, dword count at +8 }
;      rdi = first argument + 128, rbx = rsp+256, rbp = K256+128
;      edx = outer count (the third argument on the first pass)
; Out: r8, r9, r10, r11 = data pointers for lanes 0..3
;      [rbx+0], [rbx+4], [rbx+8], [rbx+12] = the four counts
;      edx = largest count (signed compare, starting from 0)
;      xmm8..xmm15 = eight 16-byte rows loaded from the rdi block
;      rax = rsp+128, xmm6 = 16 bytes loaded from $L$pbswap
; NOTE(review): the count is presumably a number of 64-byte blocks (the
; round loop advances each data pointer by 64 per pass) -- confirm against
; the loop tail, which is outside this excerpt.
$L$oop_grande_avx:
|
|
; Park the incoming edx in the frame, then reuse edx as a running maximum.
mov DWORD[280+rsp],edx
|
|
xor edx,edx
|
|
|
|
; Lane 0: pointer -> r8, count -> [rbx]. A count <= 0 redirects the
; pointer to rbp (the constant table), presumably so that an idle lane
; still reads valid memory. Lanes 1-3 below repeat the same pattern.
mov r8,QWORD[rsi]
|
|
|
|
mov ecx,DWORD[8+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[rbx],ecx
|
|
cmovle r8,rbp
|
|
|
|
; Lane 1: pointer -> r9, count -> [rbx+4].
mov r9,QWORD[16+rsi]
|
|
|
|
mov ecx,DWORD[24+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[4+rbx],ecx
|
|
cmovle r9,rbp
|
|
|
|
; Lane 2: pointer -> r10, count -> [rbx+8].
mov r10,QWORD[32+rsi]
|
|
|
|
mov ecx,DWORD[40+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[8+rbx],ecx
|
|
cmovle r10,rbp
|
|
|
|
; Lane 3: pointer -> r11, count -> [rbx+12].
mov r11,QWORD[48+rsi]
|
|
|
|
mov ecx,DWORD[56+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[12+rbx],ecx
|
|
cmovle r11,rbp
|
|
; No lane has a positive count: nothing to hash, leave through $L$done_avx.
test edx,edx
|
|
jz NEAR $L$done_avx
|
|
|
|
; Load the eight state rows (16 bytes each, at a 32-byte stride) into
; xmm8..xmm15. In the round code xmm8 feeds the 2/13/22 rotation group and
; xmm12 the 6/11/25 group, so xmm8..xmm15 play the roles a..h of SHA-256,
; with one dword per lane in each register.
vmovdqu xmm8,XMMWORD[((0-128))+rdi]
|
|
; rax = rsp+128: biased base for the sixteen 16-byte rows at [rsp+0..255].
lea rax,[128+rsp]
|
|
vmovdqu xmm9,XMMWORD[((32-128))+rdi]
|
|
vmovdqu xmm10,XMMWORD[((64-128))+rdi]
|
|
vmovdqu xmm11,XMMWORD[((96-128))+rdi]
|
|
vmovdqu xmm12,XMMWORD[((128-128))+rdi]
|
|
vmovdqu xmm13,XMMWORD[((160-128))+rdi]
|
|
vmovdqu xmm14,XMMWORD[((192-128))+rdi]
|
|
vmovdqu xmm15,XMMWORD[((224-128))+rdi]
|
|
; xmm6 = vpshufb control applied to every gathered input word in the
; first 16 rounds (by the label name, a byte-swap mask; $L$pbswap itself is
; defined elsewhere in the file).
vmovdqu xmm6,XMMWORD[$L$pbswap]
|
|
; Explicit jump because $L$oop_avx is preceded by an ALIGN 32 directive.
jmp NEAR $L$oop_avx
|
|
|
|
ALIGN 32
|
|
$L$oop_avx:
|
|
vpxor xmm4,xmm10,xmm9
|
|
vmovd xmm5,DWORD[r8]
|
|
vmovd xmm0,DWORD[r9]
|
|
vpinsrd xmm5,xmm5,DWORD[r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm12,6
|
|
vpslld xmm2,xmm12,26
|
|
vmovdqu XMMWORD[(0-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm15
|
|
|
|
vpsrld xmm1,xmm12,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm12,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm12,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,7
|
|
vpandn xmm0,xmm12,xmm14
|
|
vpand xmm3,xmm12,xmm13
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm15,xmm8,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm8,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm9,xmm8
|
|
|
|
vpxor xmm15,xmm15,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm8,13
|
|
|
|
vpslld xmm2,xmm8,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm15,xmm1
|
|
|
|
vpsrld xmm1,xmm8,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,10
|
|
vpxor xmm15,xmm9,xmm4
|
|
vpaddd xmm11,xmm11,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm15,xmm15,xmm5
|
|
vpaddd xmm15,xmm15,xmm7
|
|
vmovd xmm5,DWORD[4+r8]
|
|
vmovd xmm0,DWORD[4+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[4+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[4+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm11,6
|
|
vpslld xmm2,xmm11,26
|
|
vmovdqu XMMWORD[(16-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm14
|
|
|
|
vpsrld xmm1,xmm11,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm11,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-96))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm11,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,7
|
|
vpandn xmm0,xmm11,xmm13
|
|
vpand xmm4,xmm11,xmm12
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm14,xmm15,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm15,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm8,xmm15
|
|
|
|
vpxor xmm14,xmm14,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm15,13
|
|
|
|
vpslld xmm2,xmm15,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm14,xmm1
|
|
|
|
vpsrld xmm1,xmm15,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,10
|
|
vpxor xmm14,xmm8,xmm3
|
|
vpaddd xmm10,xmm10,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm14,xmm14,xmm5
|
|
vpaddd xmm14,xmm14,xmm7
|
|
vmovd xmm5,DWORD[8+r8]
|
|
vmovd xmm0,DWORD[8+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[8+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[8+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm10,6
|
|
vpslld xmm2,xmm10,26
|
|
vmovdqu XMMWORD[(32-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm13
|
|
|
|
vpsrld xmm1,xmm10,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm10,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm10,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,7
|
|
vpandn xmm0,xmm10,xmm12
|
|
vpand xmm3,xmm10,xmm11
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm13,xmm14,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm14,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm15,xmm14
|
|
|
|
vpxor xmm13,xmm13,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm14,13
|
|
|
|
vpslld xmm2,xmm14,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm13,xmm1
|
|
|
|
vpsrld xmm1,xmm14,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,10
|
|
vpxor xmm13,xmm15,xmm4
|
|
vpaddd xmm9,xmm9,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm13,xmm13,xmm5
|
|
vpaddd xmm13,xmm13,xmm7
|
|
vmovd xmm5,DWORD[12+r8]
|
|
vmovd xmm0,DWORD[12+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[12+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[12+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm9,6
|
|
vpslld xmm2,xmm9,26
|
|
vmovdqu XMMWORD[(48-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm12
|
|
|
|
vpsrld xmm1,xmm9,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm9,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-32))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm9,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,7
|
|
vpandn xmm0,xmm9,xmm11
|
|
vpand xmm4,xmm9,xmm10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm12,xmm13,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm13,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm14,xmm13
|
|
|
|
vpxor xmm12,xmm12,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm13,13
|
|
|
|
vpslld xmm2,xmm13,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm12,xmm1
|
|
|
|
vpsrld xmm1,xmm13,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm13,10
|
|
vpxor xmm12,xmm14,xmm3
|
|
vpaddd xmm8,xmm8,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm12,xmm12,xmm5
|
|
vpaddd xmm12,xmm12,xmm7
|
|
vmovd xmm5,DWORD[16+r8]
|
|
vmovd xmm0,DWORD[16+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[16+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[16+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm8,6
|
|
vpslld xmm2,xmm8,26
|
|
vmovdqu XMMWORD[(64-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm11
|
|
|
|
vpsrld xmm1,xmm8,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm8,21
|
|
vpaddd xmm5,xmm5,XMMWORD[rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm8,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,7
|
|
vpandn xmm0,xmm8,xmm10
|
|
vpand xmm3,xmm8,xmm9
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm11,xmm12,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm12,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm13,xmm12
|
|
|
|
vpxor xmm11,xmm11,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm12,13
|
|
|
|
vpslld xmm2,xmm12,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm11,xmm1
|
|
|
|
vpsrld xmm1,xmm12,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,10
|
|
vpxor xmm11,xmm13,xmm4
|
|
vpaddd xmm15,xmm15,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm11,xmm11,xmm5
|
|
vpaddd xmm11,xmm11,xmm7
|
|
vmovd xmm5,DWORD[20+r8]
|
|
vmovd xmm0,DWORD[20+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[20+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[20+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm15,6
|
|
vpslld xmm2,xmm15,26
|
|
vmovdqu XMMWORD[(80-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm10
|
|
|
|
vpsrld xmm1,xmm15,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm15,21
|
|
vpaddd xmm5,xmm5,XMMWORD[32+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm15,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,7
|
|
vpandn xmm0,xmm15,xmm9
|
|
vpand xmm4,xmm15,xmm8
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm10,xmm11,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm11,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm12,xmm11
|
|
|
|
vpxor xmm10,xmm10,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm11,13
|
|
|
|
vpslld xmm2,xmm11,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm10,xmm1
|
|
|
|
vpsrld xmm1,xmm11,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,10
|
|
vpxor xmm10,xmm12,xmm3
|
|
vpaddd xmm14,xmm14,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm10,xmm10,xmm5
|
|
vpaddd xmm10,xmm10,xmm7
|
|
vmovd xmm5,DWORD[24+r8]
|
|
vmovd xmm0,DWORD[24+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[24+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[24+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm14,6
|
|
vpslld xmm2,xmm14,26
|
|
vmovdqu XMMWORD[(96-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm9
|
|
|
|
vpsrld xmm1,xmm14,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm14,21
|
|
vpaddd xmm5,xmm5,XMMWORD[64+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm14,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,7
|
|
vpandn xmm0,xmm14,xmm8
|
|
vpand xmm3,xmm14,xmm15
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm9,xmm10,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm10,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm11,xmm10
|
|
|
|
vpxor xmm9,xmm9,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm10,13
|
|
|
|
vpslld xmm2,xmm10,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm9,xmm1
|
|
|
|
vpsrld xmm1,xmm10,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,10
|
|
vpxor xmm9,xmm11,xmm4
|
|
vpaddd xmm13,xmm13,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm9,xmm9,xmm5
|
|
vpaddd xmm9,xmm9,xmm7
|
|
vmovd xmm5,DWORD[28+r8]
|
|
vmovd xmm0,DWORD[28+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[28+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[28+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm13,6
|
|
vpslld xmm2,xmm13,26
|
|
vmovdqu XMMWORD[(112-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm8
|
|
|
|
vpsrld xmm1,xmm13,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm13,21
|
|
vpaddd xmm5,xmm5,XMMWORD[96+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm13,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm13,7
|
|
vpandn xmm0,xmm13,xmm15
|
|
vpand xmm4,xmm13,xmm14
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm8,xmm9,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm9,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm10,xmm9
|
|
|
|
vpxor xmm8,xmm8,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm9,13
|
|
|
|
vpslld xmm2,xmm9,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm8,xmm1
|
|
|
|
vpsrld xmm1,xmm9,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,10
|
|
vpxor xmm8,xmm10,xmm3
|
|
vpaddd xmm12,xmm12,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm8,xmm8,xmm5
|
|
vpaddd xmm8,xmm8,xmm7
|
|
add rbp,256
|
|
vmovd xmm5,DWORD[32+r8]
|
|
vmovd xmm0,DWORD[32+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[32+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[32+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm12,6
|
|
vpslld xmm2,xmm12,26
|
|
vmovdqu XMMWORD[(128-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm15
|
|
|
|
vpsrld xmm1,xmm12,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm12,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm12,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,7
|
|
vpandn xmm0,xmm12,xmm14
|
|
vpand xmm3,xmm12,xmm13
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm15,xmm8,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm8,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm9,xmm8
|
|
|
|
vpxor xmm15,xmm15,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm8,13
|
|
|
|
vpslld xmm2,xmm8,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm15,xmm1
|
|
|
|
vpsrld xmm1,xmm8,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,10
|
|
vpxor xmm15,xmm9,xmm4
|
|
vpaddd xmm11,xmm11,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm15,xmm15,xmm5
|
|
vpaddd xmm15,xmm15,xmm7
|
|
vmovd xmm5,DWORD[36+r8]
|
|
vmovd xmm0,DWORD[36+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[36+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[36+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm11,6
|
|
vpslld xmm2,xmm11,26
|
|
vmovdqu XMMWORD[(144-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm14
|
|
|
|
vpsrld xmm1,xmm11,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm11,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-96))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm11,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,7
|
|
vpandn xmm0,xmm11,xmm13
|
|
vpand xmm4,xmm11,xmm12
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm14,xmm15,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm15,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm8,xmm15
|
|
|
|
vpxor xmm14,xmm14,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm15,13
|
|
|
|
vpslld xmm2,xmm15,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm14,xmm1
|
|
|
|
vpsrld xmm1,xmm15,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,10
|
|
vpxor xmm14,xmm8,xmm3
|
|
vpaddd xmm10,xmm10,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm14,xmm14,xmm5
|
|
vpaddd xmm14,xmm14,xmm7
|
|
vmovd xmm5,DWORD[40+r8]
|
|
vmovd xmm0,DWORD[40+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[40+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[40+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm10,6
|
|
vpslld xmm2,xmm10,26
|
|
vmovdqu XMMWORD[(160-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm13
|
|
|
|
vpsrld xmm1,xmm10,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm10,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm10,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,7
|
|
vpandn xmm0,xmm10,xmm12
|
|
vpand xmm3,xmm10,xmm11
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm13,xmm14,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm14,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm15,xmm14
|
|
|
|
vpxor xmm13,xmm13,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm14,13
|
|
|
|
vpslld xmm2,xmm14,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm13,xmm1
|
|
|
|
vpsrld xmm1,xmm14,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,10
|
|
vpxor xmm13,xmm15,xmm4
|
|
vpaddd xmm9,xmm9,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm13,xmm13,xmm5
|
|
vpaddd xmm13,xmm13,xmm7
|
|
vmovd xmm5,DWORD[44+r8]
|
|
vmovd xmm0,DWORD[44+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[44+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[44+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm9,6
|
|
vpslld xmm2,xmm9,26
|
|
vmovdqu XMMWORD[(176-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm12
|
|
|
|
vpsrld xmm1,xmm9,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm9,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-32))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm9,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,7
|
|
vpandn xmm0,xmm9,xmm11
|
|
vpand xmm4,xmm9,xmm10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm12,xmm13,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm13,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm14,xmm13
|
|
|
|
vpxor xmm12,xmm12,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm13,13
|
|
|
|
vpslld xmm2,xmm13,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm12,xmm1
|
|
|
|
vpsrld xmm1,xmm13,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm13,10
|
|
vpxor xmm12,xmm14,xmm3
|
|
vpaddd xmm8,xmm8,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm12,xmm12,xmm5
|
|
vpaddd xmm12,xmm12,xmm7
|
|
vmovd xmm5,DWORD[48+r8]
|
|
vmovd xmm0,DWORD[48+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[48+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[48+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm8,6
|
|
vpslld xmm2,xmm8,26
|
|
vmovdqu XMMWORD[(192-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm11
|
|
|
|
vpsrld xmm1,xmm8,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm8,21
|
|
vpaddd xmm5,xmm5,XMMWORD[rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm8,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,7
|
|
vpandn xmm0,xmm8,xmm10
|
|
vpand xmm3,xmm8,xmm9
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm11,xmm12,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm12,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm13,xmm12
|
|
|
|
vpxor xmm11,xmm11,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm12,13
|
|
|
|
vpslld xmm2,xmm12,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm11,xmm1
|
|
|
|
vpsrld xmm1,xmm12,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,10
|
|
vpxor xmm11,xmm13,xmm4
|
|
vpaddd xmm15,xmm15,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm11,xmm11,xmm5
|
|
vpaddd xmm11,xmm11,xmm7
|
|
vmovd xmm5,DWORD[52+r8]
|
|
vmovd xmm0,DWORD[52+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[52+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[52+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm15,6
|
|
vpslld xmm2,xmm15,26
|
|
vmovdqu XMMWORD[(208-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm10
|
|
|
|
vpsrld xmm1,xmm15,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm15,21
|
|
vpaddd xmm5,xmm5,XMMWORD[32+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm15,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,7
|
|
vpandn xmm0,xmm15,xmm9
|
|
vpand xmm4,xmm15,xmm8
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm10,xmm11,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm11,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm12,xmm11
|
|
|
|
vpxor xmm10,xmm10,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm11,13
|
|
|
|
vpslld xmm2,xmm11,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm10,xmm1
|
|
|
|
vpsrld xmm1,xmm11,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,10
|
|
vpxor xmm10,xmm12,xmm3
|
|
vpaddd xmm14,xmm14,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm10,xmm10,xmm5
|
|
vpaddd xmm10,xmm10,xmm7
|
|
vmovd xmm5,DWORD[56+r8]
|
|
vmovd xmm0,DWORD[56+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[56+r10],1
|
|
vpinsrd xmm0,xmm0,DWORD[56+r11],1
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm14,6
|
|
vpslld xmm2,xmm14,26
|
|
vmovdqu XMMWORD[(224-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm9
|
|
|
|
vpsrld xmm1,xmm14,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm14,21
|
|
vpaddd xmm5,xmm5,XMMWORD[64+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm14,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,7
|
|
vpandn xmm0,xmm14,xmm8
|
|
vpand xmm3,xmm14,xmm15
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm9,xmm10,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm10,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm11,xmm10
|
|
|
|
vpxor xmm9,xmm9,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm10,13
|
|
|
|
vpslld xmm2,xmm10,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm9,xmm1
|
|
|
|
vpsrld xmm1,xmm10,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,10
|
|
vpxor xmm9,xmm11,xmm4
|
|
vpaddd xmm13,xmm13,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm9,xmm9,xmm5
|
|
vpaddd xmm9,xmm9,xmm7
|
|
vmovd xmm5,DWORD[60+r8]
|
|
lea r8,[64+r8]
|
|
vmovd xmm0,DWORD[60+r9]
|
|
lea r9,[64+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[60+r10],1
|
|
lea r10,[64+r10]
|
|
vpinsrd xmm0,xmm0,DWORD[60+r11],1
|
|
lea r11,[64+r11]
|
|
vpunpckldq xmm5,xmm5,xmm0
|
|
vpshufb xmm5,xmm5,xmm6
|
|
vpsrld xmm7,xmm13,6
|
|
vpslld xmm2,xmm13,26
|
|
vmovdqu XMMWORD[(240-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm8
|
|
|
|
vpsrld xmm1,xmm13,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm13,21
|
|
vpaddd xmm5,xmm5,XMMWORD[96+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm13,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
prefetcht0 [63+r8]
|
|
vpslld xmm2,xmm13,7
|
|
vpandn xmm0,xmm13,xmm15
|
|
vpand xmm4,xmm13,xmm14
|
|
prefetcht0 [63+r9]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm8,xmm9,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
prefetcht0 [63+r10]
|
|
vpslld xmm1,xmm9,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm10,xmm9
|
|
prefetcht0 [63+r11]
|
|
vpxor xmm8,xmm8,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm9,13
|
|
|
|
vpslld xmm2,xmm9,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm8,xmm1
|
|
|
|
vpsrld xmm1,xmm9,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,10
|
|
vpxor xmm8,xmm10,xmm3
|
|
vpaddd xmm12,xmm12,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm8,xmm8,xmm5
|
|
vpaddd xmm8,xmm8,xmm7
|
|
add rbp,256
|
|
vmovdqu xmm5,XMMWORD[((0-128))+rax]
|
|
mov ecx,3
|
|
jmp NEAR $L$oop_16_xx_avx
|
|
ALIGN 32
|
|
$L$oop_16_xx_avx:
|
|
vmovdqu xmm6,XMMWORD[((16-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((144-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((224-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm12,6
|
|
vpslld xmm2,xmm12,26
|
|
vmovdqu XMMWORD[(0-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm15
|
|
|
|
vpsrld xmm1,xmm12,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm12,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm12,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,7
|
|
vpandn xmm0,xmm12,xmm14
|
|
vpand xmm3,xmm12,xmm13
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm15,xmm8,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm8,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm9,xmm8
|
|
|
|
vpxor xmm15,xmm15,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm8,13
|
|
|
|
vpslld xmm2,xmm8,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm15,xmm1
|
|
|
|
vpsrld xmm1,xmm8,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,10
|
|
vpxor xmm15,xmm9,xmm4
|
|
vpaddd xmm11,xmm11,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm15,xmm15,xmm5
|
|
vpaddd xmm15,xmm15,xmm7
|
|
vmovdqu xmm5,XMMWORD[((32-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((160-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((240-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm11,6
|
|
vpslld xmm2,xmm11,26
|
|
vmovdqu XMMWORD[(16-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm14
|
|
|
|
vpsrld xmm1,xmm11,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm11,21
|
|
vpaddd xmm6,xmm6,XMMWORD[((-96))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm11,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,7
|
|
vpandn xmm0,xmm11,xmm13
|
|
vpand xmm4,xmm11,xmm12
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm14,xmm15,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm15,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm8,xmm15
|
|
|
|
vpxor xmm14,xmm14,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm15,13
|
|
|
|
vpslld xmm2,xmm15,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm14,xmm1
|
|
|
|
vpsrld xmm1,xmm15,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,10
|
|
vpxor xmm14,xmm8,xmm3
|
|
vpaddd xmm10,xmm10,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm14,xmm14,xmm6
|
|
vpaddd xmm14,xmm14,xmm7
|
|
vmovdqu xmm6,XMMWORD[((48-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((176-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((0-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm10,6
|
|
vpslld xmm2,xmm10,26
|
|
vmovdqu XMMWORD[(32-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm13
|
|
|
|
vpsrld xmm1,xmm10,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm10,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm10,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,7
|
|
vpandn xmm0,xmm10,xmm12
|
|
vpand xmm3,xmm10,xmm11
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm13,xmm14,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm14,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm15,xmm14
|
|
|
|
vpxor xmm13,xmm13,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm14,13
|
|
|
|
vpslld xmm2,xmm14,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm13,xmm1
|
|
|
|
vpsrld xmm1,xmm14,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,10
|
|
vpxor xmm13,xmm15,xmm4
|
|
vpaddd xmm9,xmm9,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm13,xmm13,xmm5
|
|
vpaddd xmm13,xmm13,xmm7
|
|
vmovdqu xmm5,XMMWORD[((64-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((192-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((16-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm9,6
|
|
vpslld xmm2,xmm9,26
|
|
vmovdqu XMMWORD[(48-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm12
|
|
|
|
vpsrld xmm1,xmm9,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm9,21
|
|
vpaddd xmm6,xmm6,XMMWORD[((-32))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm9,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,7
|
|
vpandn xmm0,xmm9,xmm11
|
|
vpand xmm4,xmm9,xmm10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm12,xmm13,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm13,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm14,xmm13
|
|
|
|
vpxor xmm12,xmm12,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm13,13
|
|
|
|
vpslld xmm2,xmm13,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm12,xmm1
|
|
|
|
vpsrld xmm1,xmm13,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm13,10
|
|
vpxor xmm12,xmm14,xmm3
|
|
vpaddd xmm8,xmm8,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm12,xmm12,xmm6
|
|
vpaddd xmm12,xmm12,xmm7
|
|
vmovdqu xmm6,XMMWORD[((80-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((208-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((32-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm8,6
|
|
vpslld xmm2,xmm8,26
|
|
vmovdqu XMMWORD[(64-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm11
|
|
|
|
vpsrld xmm1,xmm8,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm8,21
|
|
vpaddd xmm5,xmm5,XMMWORD[rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm8,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,7
|
|
vpandn xmm0,xmm8,xmm10
|
|
vpand xmm3,xmm8,xmm9
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm11,xmm12,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm12,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm13,xmm12
|
|
|
|
vpxor xmm11,xmm11,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm12,13
|
|
|
|
vpslld xmm2,xmm12,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm11,xmm1
|
|
|
|
vpsrld xmm1,xmm12,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,10
|
|
vpxor xmm11,xmm13,xmm4
|
|
vpaddd xmm15,xmm15,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm11,xmm11,xmm5
|
|
vpaddd xmm11,xmm11,xmm7
|
|
vmovdqu xmm5,XMMWORD[((96-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((224-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((48-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm15,6
|
|
vpslld xmm2,xmm15,26
|
|
vmovdqu XMMWORD[(80-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm10
|
|
|
|
vpsrld xmm1,xmm15,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm15,21
|
|
vpaddd xmm6,xmm6,XMMWORD[32+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm15,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,7
|
|
vpandn xmm0,xmm15,xmm9
|
|
vpand xmm4,xmm15,xmm8
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm10,xmm11,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm11,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm12,xmm11
|
|
|
|
vpxor xmm10,xmm10,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm11,13
|
|
|
|
vpslld xmm2,xmm11,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm10,xmm1
|
|
|
|
vpsrld xmm1,xmm11,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,10
|
|
vpxor xmm10,xmm12,xmm3
|
|
vpaddd xmm14,xmm14,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm10,xmm10,xmm6
|
|
vpaddd xmm10,xmm10,xmm7
|
|
vmovdqu xmm6,XMMWORD[((112-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((240-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((64-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm14,6
|
|
vpslld xmm2,xmm14,26
|
|
vmovdqu XMMWORD[(96-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm9
|
|
|
|
vpsrld xmm1,xmm14,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm14,21
|
|
vpaddd xmm5,xmm5,XMMWORD[64+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm14,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,7
|
|
vpandn xmm0,xmm14,xmm8
|
|
vpand xmm3,xmm14,xmm15
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm9,xmm10,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm10,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm11,xmm10
|
|
|
|
vpxor xmm9,xmm9,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm10,13
|
|
|
|
vpslld xmm2,xmm10,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm9,xmm1
|
|
|
|
vpsrld xmm1,xmm10,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,10
|
|
vpxor xmm9,xmm11,xmm4
|
|
vpaddd xmm13,xmm13,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm9,xmm9,xmm5
|
|
vpaddd xmm9,xmm9,xmm7
|
|
vmovdqu xmm5,XMMWORD[((128-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((0-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((80-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm13,6
|
|
vpslld xmm2,xmm13,26
|
|
vmovdqu XMMWORD[(112-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm8
|
|
|
|
vpsrld xmm1,xmm13,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm13,21
|
|
vpaddd xmm6,xmm6,XMMWORD[96+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm13,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm13,7
|
|
vpandn xmm0,xmm13,xmm15
|
|
vpand xmm4,xmm13,xmm14
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm8,xmm9,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm9,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm10,xmm9
|
|
|
|
vpxor xmm8,xmm8,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm9,13
|
|
|
|
vpslld xmm2,xmm9,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm8,xmm1
|
|
|
|
vpsrld xmm1,xmm9,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,10
|
|
vpxor xmm8,xmm10,xmm3
|
|
vpaddd xmm12,xmm12,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm8,xmm8,xmm6
|
|
vpaddd xmm8,xmm8,xmm7
|
|
add rbp,256
|
|
vmovdqu xmm6,XMMWORD[((144-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((16-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((96-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm12,6
|
|
vpslld xmm2,xmm12,26
|
|
vmovdqu XMMWORD[(128-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm15
|
|
|
|
vpsrld xmm1,xmm12,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm12,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm12,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,7
|
|
vpandn xmm0,xmm12,xmm14
|
|
vpand xmm3,xmm12,xmm13
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm15,xmm8,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm8,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm9,xmm8
|
|
|
|
vpxor xmm15,xmm15,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm8,13
|
|
|
|
vpslld xmm2,xmm8,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm15,xmm1
|
|
|
|
vpsrld xmm1,xmm8,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,10
|
|
vpxor xmm15,xmm9,xmm4
|
|
vpaddd xmm11,xmm11,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm15,xmm15,xmm5
|
|
vpaddd xmm15,xmm15,xmm7
|
|
vmovdqu xmm5,XMMWORD[((160-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((32-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((112-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm11,6
|
|
vpslld xmm2,xmm11,26
|
|
vmovdqu XMMWORD[(144-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm14
|
|
|
|
vpsrld xmm1,xmm11,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm11,21
|
|
vpaddd xmm6,xmm6,XMMWORD[((-96))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm11,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,7
|
|
vpandn xmm0,xmm11,xmm13
|
|
vpand xmm4,xmm11,xmm12
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm14,xmm15,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm15,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm8,xmm15
|
|
|
|
vpxor xmm14,xmm14,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm15,13
|
|
|
|
vpslld xmm2,xmm15,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm14,xmm1
|
|
|
|
vpsrld xmm1,xmm15,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,10
|
|
vpxor xmm14,xmm8,xmm3
|
|
vpaddd xmm10,xmm10,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm14,xmm14,xmm6
|
|
vpaddd xmm14,xmm14,xmm7
|
|
vmovdqu xmm6,XMMWORD[((176-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((48-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((128-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm10,6
|
|
vpslld xmm2,xmm10,26
|
|
vmovdqu XMMWORD[(160-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm13
|
|
|
|
vpsrld xmm1,xmm10,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm10,21
|
|
vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm10,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,7
|
|
vpandn xmm0,xmm10,xmm12
|
|
vpand xmm3,xmm10,xmm11
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm13,xmm14,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm14,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm15,xmm14
|
|
|
|
vpxor xmm13,xmm13,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm14,13
|
|
|
|
vpslld xmm2,xmm14,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm13,xmm1
|
|
|
|
vpsrld xmm1,xmm14,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,10
|
|
vpxor xmm13,xmm15,xmm4
|
|
vpaddd xmm9,xmm9,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm13,xmm13,xmm5
|
|
vpaddd xmm13,xmm13,xmm7
|
|
vmovdqu xmm5,XMMWORD[((192-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((64-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((144-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm9,6
|
|
vpslld xmm2,xmm9,26
|
|
vmovdqu XMMWORD[(176-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm12
|
|
|
|
vpsrld xmm1,xmm9,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm9,21
|
|
vpaddd xmm6,xmm6,XMMWORD[((-32))+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm9,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,7
|
|
vpandn xmm0,xmm9,xmm11
|
|
vpand xmm4,xmm9,xmm10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm12,xmm13,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm13,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm14,xmm13
|
|
|
|
vpxor xmm12,xmm12,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm13,13
|
|
|
|
vpslld xmm2,xmm13,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm12,xmm1
|
|
|
|
vpsrld xmm1,xmm13,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm13,10
|
|
vpxor xmm12,xmm14,xmm3
|
|
vpaddd xmm8,xmm8,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm12,xmm12,xmm6
|
|
vpaddd xmm12,xmm12,xmm7
|
|
vmovdqu xmm6,XMMWORD[((208-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((80-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((160-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm8,6
|
|
vpslld xmm2,xmm8,26
|
|
vmovdqu XMMWORD[(192-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm11
|
|
|
|
vpsrld xmm1,xmm8,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm8,21
|
|
vpaddd xmm5,xmm5,XMMWORD[rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm8,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm8,7
|
|
vpandn xmm0,xmm8,xmm10
|
|
vpand xmm3,xmm8,xmm9
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm11,xmm12,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm12,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm13,xmm12
|
|
|
|
vpxor xmm11,xmm11,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm12,13
|
|
|
|
vpslld xmm2,xmm12,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm11,xmm1
|
|
|
|
vpsrld xmm1,xmm12,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm12,10
|
|
vpxor xmm11,xmm13,xmm4
|
|
vpaddd xmm15,xmm15,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm11,xmm11,xmm5
|
|
vpaddd xmm11,xmm11,xmm7
|
|
vmovdqu xmm5,XMMWORD[((224-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((96-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((176-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm15,6
|
|
vpslld xmm2,xmm15,26
|
|
vmovdqu XMMWORD[(208-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm10
|
|
|
|
vpsrld xmm1,xmm15,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm15,21
|
|
vpaddd xmm6,xmm6,XMMWORD[32+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm15,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm15,7
|
|
vpandn xmm0,xmm15,xmm9
|
|
vpand xmm4,xmm15,xmm8
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm10,xmm11,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm11,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm12,xmm11
|
|
|
|
vpxor xmm10,xmm10,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm11,13
|
|
|
|
vpslld xmm2,xmm11,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm10,xmm1
|
|
|
|
vpsrld xmm1,xmm11,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm11,10
|
|
vpxor xmm10,xmm12,xmm3
|
|
vpaddd xmm14,xmm14,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm10,xmm10,xmm6
|
|
vpaddd xmm10,xmm10,xmm7
|
|
vmovdqu xmm6,XMMWORD[((240-128))+rax]
|
|
vpaddd xmm5,xmm5,XMMWORD[((112-128))+rax]
|
|
|
|
vpsrld xmm7,xmm6,3
|
|
vpsrld xmm1,xmm6,7
|
|
vpslld xmm2,xmm6,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm6,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm6,14
|
|
vmovdqu xmm0,XMMWORD[((192-128))+rax]
|
|
vpsrld xmm3,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpxor xmm7,xmm3,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm5,xmm5,xmm7
|
|
vpsrld xmm7,xmm14,6
|
|
vpslld xmm2,xmm14,26
|
|
vmovdqu XMMWORD[(224-128)+rax],xmm5
|
|
vpaddd xmm5,xmm5,xmm9
|
|
|
|
vpsrld xmm1,xmm14,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm14,21
|
|
vpaddd xmm5,xmm5,XMMWORD[64+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm14,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm14,7
|
|
vpandn xmm0,xmm14,xmm8
|
|
vpand xmm3,xmm14,xmm15
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm9,xmm10,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm10,30
|
|
vpxor xmm0,xmm0,xmm3
|
|
vpxor xmm3,xmm11,xmm10
|
|
|
|
vpxor xmm9,xmm9,xmm1
|
|
vpaddd xmm5,xmm5,xmm7
|
|
|
|
vpsrld xmm1,xmm10,13
|
|
|
|
vpslld xmm2,xmm10,19
|
|
vpaddd xmm5,xmm5,xmm0
|
|
vpand xmm4,xmm4,xmm3
|
|
|
|
vpxor xmm7,xmm9,xmm1
|
|
|
|
vpsrld xmm1,xmm10,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm10,10
|
|
vpxor xmm9,xmm11,xmm4
|
|
vpaddd xmm13,xmm13,xmm5
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm9,xmm9,xmm5
|
|
vpaddd xmm9,xmm9,xmm7
|
|
vmovdqu xmm5,XMMWORD[((0-128))+rax]
|
|
vpaddd xmm6,xmm6,XMMWORD[((128-128))+rax]
|
|
|
|
vpsrld xmm7,xmm5,3
|
|
vpsrld xmm1,xmm5,7
|
|
vpslld xmm2,xmm5,25
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm5,18
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm5,14
|
|
vmovdqu xmm0,XMMWORD[((208-128))+rax]
|
|
vpsrld xmm4,xmm0,10
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpsrld xmm1,xmm0,17
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,15
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpxor xmm7,xmm4,xmm1
|
|
vpsrld xmm1,xmm0,19
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm0,13
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpaddd xmm6,xmm6,xmm7
|
|
vpsrld xmm7,xmm13,6
|
|
vpslld xmm2,xmm13,26
|
|
vmovdqu XMMWORD[(240-128)+rax],xmm6
|
|
vpaddd xmm6,xmm6,xmm8
|
|
|
|
vpsrld xmm1,xmm13,11
|
|
vpxor xmm7,xmm7,xmm2
|
|
vpslld xmm2,xmm13,21
|
|
vpaddd xmm6,xmm6,XMMWORD[96+rbp]
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm1,xmm13,25
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm13,7
|
|
vpandn xmm0,xmm13,xmm15
|
|
vpand xmm4,xmm13,xmm14
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
|
|
vpsrld xmm8,xmm9,2
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm1,xmm9,30
|
|
vpxor xmm0,xmm0,xmm4
|
|
vpxor xmm4,xmm10,xmm9
|
|
|
|
vpxor xmm8,xmm8,xmm1
|
|
vpaddd xmm6,xmm6,xmm7
|
|
|
|
vpsrld xmm1,xmm9,13
|
|
|
|
vpslld xmm2,xmm9,19
|
|
vpaddd xmm6,xmm6,xmm0
|
|
vpand xmm3,xmm3,xmm4
|
|
|
|
vpxor xmm7,xmm8,xmm1
|
|
|
|
vpsrld xmm1,xmm9,22
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpslld xmm2,xmm9,10
|
|
vpxor xmm8,xmm10,xmm3
|
|
vpaddd xmm12,xmm12,xmm6
|
|
|
|
vpxor xmm7,xmm7,xmm1
|
|
vpxor xmm7,xmm7,xmm2
|
|
|
|
vpaddd xmm8,xmm8,xmm6
|
|
vpaddd xmm8,xmm8,xmm7
|
|
add rbp,256
|
|
dec ecx
|
|
jnz NEAR $L$oop_16_xx_avx
|
|
|
|
mov ecx,1
|
|
lea rbp,[((K256+128))]
|
|
cmp ecx,DWORD[rbx]
|
|
cmovge r8,rbp
|
|
cmp ecx,DWORD[4+rbx]
|
|
cmovge r9,rbp
|
|
cmp ecx,DWORD[8+rbx]
|
|
cmovge r10,rbp
|
|
cmp ecx,DWORD[12+rbx]
|
|
cmovge r11,rbp
|
|
vmovdqa xmm7,XMMWORD[rbx]
|
|
vpxor xmm0,xmm0,xmm0
|
|
vmovdqa xmm6,xmm7
|
|
vpcmpgtd xmm6,xmm6,xmm0
|
|
vpaddd xmm7,xmm7,xmm6
|
|
|
|
vmovdqu xmm0,XMMWORD[((0-128))+rdi]
|
|
vpand xmm8,xmm8,xmm6
|
|
vmovdqu xmm1,XMMWORD[((32-128))+rdi]
|
|
vpand xmm9,xmm9,xmm6
|
|
vmovdqu xmm2,XMMWORD[((64-128))+rdi]
|
|
vpand xmm10,xmm10,xmm6
|
|
vmovdqu xmm5,XMMWORD[((96-128))+rdi]
|
|
vpand xmm11,xmm11,xmm6
|
|
vpaddd xmm8,xmm8,xmm0
|
|
vmovdqu xmm0,XMMWORD[((128-128))+rdi]
|
|
vpand xmm12,xmm12,xmm6
|
|
vpaddd xmm9,xmm9,xmm1
|
|
vmovdqu xmm1,XMMWORD[((160-128))+rdi]
|
|
vpand xmm13,xmm13,xmm6
|
|
vpaddd xmm10,xmm10,xmm2
|
|
vmovdqu xmm2,XMMWORD[((192-128))+rdi]
|
|
vpand xmm14,xmm14,xmm6
|
|
vpaddd xmm11,xmm11,xmm5
|
|
vmovdqu xmm5,XMMWORD[((224-128))+rdi]
|
|
vpand xmm15,xmm15,xmm6
|
|
vpaddd xmm12,xmm12,xmm0
|
|
vpaddd xmm13,xmm13,xmm1
|
|
vmovdqu XMMWORD[(0-128)+rdi],xmm8
|
|
vpaddd xmm14,xmm14,xmm2
|
|
vmovdqu XMMWORD[(32-128)+rdi],xmm9
|
|
vpaddd xmm15,xmm15,xmm5
|
|
vmovdqu XMMWORD[(64-128)+rdi],xmm10
|
|
vmovdqu XMMWORD[(96-128)+rdi],xmm11
|
|
vmovdqu XMMWORD[(128-128)+rdi],xmm12
|
|
vmovdqu XMMWORD[(160-128)+rdi],xmm13
|
|
vmovdqu XMMWORD[(192-128)+rdi],xmm14
|
|
vmovdqu XMMWORD[(224-128)+rdi],xmm15
|
|
|
|
vmovdqu XMMWORD[rbx],xmm7
|
|
vmovdqu xmm6,XMMWORD[$L$pbswap]
|
|
dec edx
|
|
jnz NEAR $L$oop_avx
|
|
|
|
mov edx,DWORD[280+rsp]
|
|
lea rdi,[16+rdi]
|
|
lea rsi,[64+rsi]
|
|
dec edx
|
|
jnz NEAR $L$oop_grande_avx
|
|
|
|
$L$done_avx:
|
|
mov rax,QWORD[272+rsp]
|
|
|
|
vzeroupper
|
|
movaps xmm6,XMMWORD[((-184))+rax]
|
|
movaps xmm7,XMMWORD[((-168))+rax]
|
|
movaps xmm8,XMMWORD[((-152))+rax]
|
|
movaps xmm9,XMMWORD[((-136))+rax]
|
|
movaps xmm10,XMMWORD[((-120))+rax]
|
|
movaps xmm11,XMMWORD[((-104))+rax]
|
|
movaps xmm12,XMMWORD[((-88))+rax]
|
|
movaps xmm13,XMMWORD[((-72))+rax]
|
|
movaps xmm14,XMMWORD[((-56))+rax]
|
|
movaps xmm15,XMMWORD[((-40))+rax]
|
|
mov rbp,QWORD[((-16))+rax]
|
|
|
|
mov rbx,QWORD[((-8))+rax]
|
|
|
|
lea rsp,[rax]
|
|
|
|
$L$epilogue_avx:
|
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
$L$SEH_end_sha256_multi_block_avx:
|
|
|
|
ALIGN 32
|
|
sha256_multi_block_avx2:
|
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
|
mov QWORD[16+rsp],rsi
|
|
mov rax,rsp
|
|
$L$SEH_begin_sha256_multi_block_avx2:
|
|
mov rdi,rcx
|
|
mov rsi,rdx
|
|
mov rdx,r8
|
|
|
|
|
|
|
|
_avx2_shortcut:
|
|
mov rax,rsp
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
push r12
|
|
|
|
push r13
|
|
|
|
push r14
|
|
|
|
push r15
|
|
|
|
lea rsp,[((-168))+rsp]
|
|
movaps XMMWORD[rsp],xmm6
|
|
movaps XMMWORD[16+rsp],xmm7
|
|
movaps XMMWORD[32+rsp],xmm8
|
|
movaps XMMWORD[48+rsp],xmm9
|
|
movaps XMMWORD[64+rsp],xmm10
|
|
movaps XMMWORD[80+rsp],xmm11
|
|
movaps XMMWORD[(-120)+rax],xmm12
|
|
movaps XMMWORD[(-104)+rax],xmm13
|
|
movaps XMMWORD[(-88)+rax],xmm14
|
|
movaps XMMWORD[(-72)+rax],xmm15
|
|
sub rsp,576
|
|
and rsp,-256
|
|
mov QWORD[544+rsp],rax
|
|
|
|
$L$body_avx2:
|
|
lea rbp,[((K256+128))]
|
|
lea rdi,[128+rdi]
|
|
|
|
$L$oop_grande_avx2:
|
|
mov DWORD[552+rsp],edx
|
|
xor edx,edx
|
|
lea rbx,[512+rsp]
|
|
|
|
mov r12,QWORD[rsi]
|
|
|
|
mov ecx,DWORD[8+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[rbx],ecx
|
|
cmovle r12,rbp
|
|
|
|
mov r13,QWORD[16+rsi]
|
|
|
|
mov ecx,DWORD[24+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[4+rbx],ecx
|
|
cmovle r13,rbp
|
|
|
|
mov r14,QWORD[32+rsi]
|
|
|
|
mov ecx,DWORD[40+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[8+rbx],ecx
|
|
cmovle r14,rbp
|
|
|
|
mov r15,QWORD[48+rsi]
|
|
|
|
mov ecx,DWORD[56+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[12+rbx],ecx
|
|
cmovle r15,rbp
|
|
|
|
mov r8,QWORD[64+rsi]
|
|
|
|
mov ecx,DWORD[72+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[16+rbx],ecx
|
|
cmovle r8,rbp
|
|
|
|
mov r9,QWORD[80+rsi]
|
|
|
|
mov ecx,DWORD[88+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[20+rbx],ecx
|
|
cmovle r9,rbp
|
|
|
|
mov r10,QWORD[96+rsi]
|
|
|
|
mov ecx,DWORD[104+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[24+rbx],ecx
|
|
cmovle r10,rbp
|
|
|
|
mov r11,QWORD[112+rsi]
|
|
|
|
mov ecx,DWORD[120+rsi]
|
|
cmp ecx,edx
|
|
cmovg edx,ecx
|
|
test ecx,ecx
|
|
mov DWORD[28+rbx],ecx
|
|
cmovle r11,rbp
|
|
vmovdqu ymm8,YMMWORD[((0-128))+rdi]
|
|
lea rax,[128+rsp]
|
|
vmovdqu ymm9,YMMWORD[((32-128))+rdi]
|
|
lea rbx,[((256+128))+rsp]
|
|
vmovdqu ymm10,YMMWORD[((64-128))+rdi]
|
|
vmovdqu ymm11,YMMWORD[((96-128))+rdi]
|
|
vmovdqu ymm12,YMMWORD[((128-128))+rdi]
|
|
vmovdqu ymm13,YMMWORD[((160-128))+rdi]
|
|
vmovdqu ymm14,YMMWORD[((192-128))+rdi]
|
|
vmovdqu ymm15,YMMWORD[((224-128))+rdi]
|
|
vmovdqu ymm6,YMMWORD[$L$pbswap]
|
|
jmp NEAR $L$oop_avx2
|
|
|
|
ALIGN 32
|
|
$L$oop_avx2:
|
|
vpxor ymm4,ymm10,ymm9
|
|
vmovd xmm5,DWORD[r12]
|
|
vmovd xmm0,DWORD[r8]
|
|
vmovd xmm1,DWORD[r13]
|
|
vmovd xmm2,DWORD[r9]
|
|
vpinsrd xmm5,xmm5,DWORD[r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm12,6
|
|
vpslld ymm2,ymm12,26
|
|
vmovdqu YMMWORD[(0-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm15
|
|
|
|
vpsrld ymm1,ymm12,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm12,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm12,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,7
|
|
vpandn ymm0,ymm12,ymm14
|
|
vpand ymm3,ymm12,ymm13
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm15,ymm8,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm8,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm9,ymm8
|
|
|
|
vpxor ymm15,ymm15,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm8,13
|
|
|
|
vpslld ymm2,ymm8,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm15,ymm1
|
|
|
|
vpsrld ymm1,ymm8,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,10
|
|
vpxor ymm15,ymm9,ymm4
|
|
vpaddd ymm11,ymm11,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm15,ymm15,ymm5
|
|
vpaddd ymm15,ymm15,ymm7
|
|
vmovd xmm5,DWORD[4+r12]
|
|
vmovd xmm0,DWORD[4+r8]
|
|
vmovd xmm1,DWORD[4+r13]
|
|
vmovd xmm2,DWORD[4+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[4+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[4+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[4+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[4+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm11,6
|
|
vpslld ymm2,ymm11,26
|
|
vmovdqu YMMWORD[(32-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm14
|
|
|
|
vpsrld ymm1,ymm11,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm11,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-96))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm11,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,7
|
|
vpandn ymm0,ymm11,ymm13
|
|
vpand ymm4,ymm11,ymm12
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm14,ymm15,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm15,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm8,ymm15
|
|
|
|
vpxor ymm14,ymm14,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm15,13
|
|
|
|
vpslld ymm2,ymm15,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm14,ymm1
|
|
|
|
vpsrld ymm1,ymm15,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,10
|
|
vpxor ymm14,ymm8,ymm3
|
|
vpaddd ymm10,ymm10,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm14,ymm14,ymm5
|
|
vpaddd ymm14,ymm14,ymm7
|
|
vmovd xmm5,DWORD[8+r12]
|
|
vmovd xmm0,DWORD[8+r8]
|
|
vmovd xmm1,DWORD[8+r13]
|
|
vmovd xmm2,DWORD[8+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[8+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[8+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[8+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[8+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm10,6
|
|
vpslld ymm2,ymm10,26
|
|
vmovdqu YMMWORD[(64-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm13
|
|
|
|
vpsrld ymm1,ymm10,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm10,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm10,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,7
|
|
vpandn ymm0,ymm10,ymm12
|
|
vpand ymm3,ymm10,ymm11
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm13,ymm14,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm14,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm15,ymm14
|
|
|
|
vpxor ymm13,ymm13,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm14,13
|
|
|
|
vpslld ymm2,ymm14,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm13,ymm1
|
|
|
|
vpsrld ymm1,ymm14,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,10
|
|
vpxor ymm13,ymm15,ymm4
|
|
vpaddd ymm9,ymm9,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm13,ymm13,ymm5
|
|
vpaddd ymm13,ymm13,ymm7
|
|
vmovd xmm5,DWORD[12+r12]
|
|
vmovd xmm0,DWORD[12+r8]
|
|
vmovd xmm1,DWORD[12+r13]
|
|
vmovd xmm2,DWORD[12+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[12+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[12+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[12+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[12+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm9,6
|
|
vpslld ymm2,ymm9,26
|
|
vmovdqu YMMWORD[(96-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm12
|
|
|
|
vpsrld ymm1,ymm9,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm9,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-32))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm9,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm9,7
|
|
vpandn ymm0,ymm9,ymm11
|
|
vpand ymm4,ymm9,ymm10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm12,ymm13,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm13,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm14,ymm13
|
|
|
|
vpxor ymm12,ymm12,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm13,13
|
|
|
|
vpslld ymm2,ymm13,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm12,ymm1
|
|
|
|
vpsrld ymm1,ymm13,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm13,10
|
|
vpxor ymm12,ymm14,ymm3
|
|
vpaddd ymm8,ymm8,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm12,ymm12,ymm5
|
|
vpaddd ymm12,ymm12,ymm7
|
|
vmovd xmm5,DWORD[16+r12]
|
|
vmovd xmm0,DWORD[16+r8]
|
|
vmovd xmm1,DWORD[16+r13]
|
|
vmovd xmm2,DWORD[16+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[16+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[16+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[16+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[16+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm8,6
|
|
vpslld ymm2,ymm8,26
|
|
vmovdqu YMMWORD[(128-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm11
|
|
|
|
vpsrld ymm1,ymm8,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm8,21
|
|
vpaddd ymm5,ymm5,YMMWORD[rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm8,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,7
|
|
vpandn ymm0,ymm8,ymm10
|
|
vpand ymm3,ymm8,ymm9
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm11,ymm12,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm12,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm13,ymm12
|
|
|
|
vpxor ymm11,ymm11,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm12,13
|
|
|
|
vpslld ymm2,ymm12,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm11,ymm1
|
|
|
|
vpsrld ymm1,ymm12,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,10
|
|
vpxor ymm11,ymm13,ymm4
|
|
vpaddd ymm15,ymm15,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm11,ymm11,ymm5
|
|
vpaddd ymm11,ymm11,ymm7
|
|
vmovd xmm5,DWORD[20+r12]
|
|
vmovd xmm0,DWORD[20+r8]
|
|
vmovd xmm1,DWORD[20+r13]
|
|
vmovd xmm2,DWORD[20+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[20+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[20+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[20+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[20+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm15,6
|
|
vpslld ymm2,ymm15,26
|
|
vmovdqu YMMWORD[(160-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm10
|
|
|
|
vpsrld ymm1,ymm15,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm15,21
|
|
vpaddd ymm5,ymm5,YMMWORD[32+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm15,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,7
|
|
vpandn ymm0,ymm15,ymm9
|
|
vpand ymm4,ymm15,ymm8
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm10,ymm11,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm11,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm12,ymm11
|
|
|
|
vpxor ymm10,ymm10,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm11,13
|
|
|
|
vpslld ymm2,ymm11,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm10,ymm1
|
|
|
|
vpsrld ymm1,ymm11,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,10
|
|
vpxor ymm10,ymm12,ymm3
|
|
vpaddd ymm14,ymm14,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm10,ymm10,ymm5
|
|
vpaddd ymm10,ymm10,ymm7
|
|
vmovd xmm5,DWORD[24+r12]
|
|
vmovd xmm0,DWORD[24+r8]
|
|
vmovd xmm1,DWORD[24+r13]
|
|
vmovd xmm2,DWORD[24+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[24+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[24+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[24+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[24+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm14,6
|
|
vpslld ymm2,ymm14,26
|
|
vmovdqu YMMWORD[(192-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm9
|
|
|
|
vpsrld ymm1,ymm14,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm14,21
|
|
vpaddd ymm5,ymm5,YMMWORD[64+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm14,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,7
|
|
vpandn ymm0,ymm14,ymm8
|
|
vpand ymm3,ymm14,ymm15
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm9,ymm10,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm10,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm11,ymm10
|
|
|
|
vpxor ymm9,ymm9,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm10,13
|
|
|
|
vpslld ymm2,ymm10,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm9,ymm1
|
|
|
|
vpsrld ymm1,ymm10,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,10
|
|
vpxor ymm9,ymm11,ymm4
|
|
vpaddd ymm13,ymm13,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm9,ymm9,ymm5
|
|
vpaddd ymm9,ymm9,ymm7
|
|
vmovd xmm5,DWORD[28+r12]
|
|
vmovd xmm0,DWORD[28+r8]
|
|
vmovd xmm1,DWORD[28+r13]
|
|
vmovd xmm2,DWORD[28+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[28+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[28+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[28+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[28+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm13,6
|
|
vpslld ymm2,ymm13,26
|
|
vmovdqu YMMWORD[(224-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm8
|
|
|
|
vpsrld ymm1,ymm13,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm13,21
|
|
vpaddd ymm5,ymm5,YMMWORD[96+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm13,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm13,7
|
|
vpandn ymm0,ymm13,ymm15
|
|
vpand ymm4,ymm13,ymm14
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm8,ymm9,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm9,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm10,ymm9
|
|
|
|
vpxor ymm8,ymm8,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm9,13
|
|
|
|
vpslld ymm2,ymm9,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm8,ymm1
|
|
|
|
vpsrld ymm1,ymm9,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm9,10
|
|
vpxor ymm8,ymm10,ymm3
|
|
vpaddd ymm12,ymm12,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm8,ymm8,ymm5
|
|
vpaddd ymm8,ymm8,ymm7
|
|
add rbp,256
|
|
vmovd xmm5,DWORD[32+r12]
|
|
vmovd xmm0,DWORD[32+r8]
|
|
vmovd xmm1,DWORD[32+r13]
|
|
vmovd xmm2,DWORD[32+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[32+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[32+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[32+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[32+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm12,6
|
|
vpslld ymm2,ymm12,26
|
|
vmovdqu YMMWORD[(256-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm15
|
|
|
|
vpsrld ymm1,ymm12,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm12,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm12,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,7
|
|
vpandn ymm0,ymm12,ymm14
|
|
vpand ymm3,ymm12,ymm13
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm15,ymm8,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm8,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm9,ymm8
|
|
|
|
vpxor ymm15,ymm15,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm8,13
|
|
|
|
vpslld ymm2,ymm8,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm15,ymm1
|
|
|
|
vpsrld ymm1,ymm8,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,10
|
|
vpxor ymm15,ymm9,ymm4
|
|
vpaddd ymm11,ymm11,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm15,ymm15,ymm5
|
|
vpaddd ymm15,ymm15,ymm7
|
|
vmovd xmm5,DWORD[36+r12]
|
|
vmovd xmm0,DWORD[36+r8]
|
|
vmovd xmm1,DWORD[36+r13]
|
|
vmovd xmm2,DWORD[36+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[36+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[36+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[36+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[36+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm11,6
|
|
vpslld ymm2,ymm11,26
|
|
vmovdqu YMMWORD[(288-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm14
|
|
|
|
vpsrld ymm1,ymm11,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm11,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-96))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm11,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,7
|
|
vpandn ymm0,ymm11,ymm13
|
|
vpand ymm4,ymm11,ymm12
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm14,ymm15,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm15,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm8,ymm15
|
|
|
|
vpxor ymm14,ymm14,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm15,13
|
|
|
|
vpslld ymm2,ymm15,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm14,ymm1
|
|
|
|
vpsrld ymm1,ymm15,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,10
|
|
vpxor ymm14,ymm8,ymm3
|
|
vpaddd ymm10,ymm10,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm14,ymm14,ymm5
|
|
vpaddd ymm14,ymm14,ymm7
|
|
vmovd xmm5,DWORD[40+r12]
|
|
vmovd xmm0,DWORD[40+r8]
|
|
vmovd xmm1,DWORD[40+r13]
|
|
vmovd xmm2,DWORD[40+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[40+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[40+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[40+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[40+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm10,6
|
|
vpslld ymm2,ymm10,26
|
|
vmovdqu YMMWORD[(320-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm13
|
|
|
|
vpsrld ymm1,ymm10,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm10,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm10,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,7
|
|
vpandn ymm0,ymm10,ymm12
|
|
vpand ymm3,ymm10,ymm11
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm13,ymm14,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm14,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm15,ymm14
|
|
|
|
vpxor ymm13,ymm13,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm14,13
|
|
|
|
vpslld ymm2,ymm14,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm13,ymm1
|
|
|
|
vpsrld ymm1,ymm14,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,10
|
|
vpxor ymm13,ymm15,ymm4
|
|
vpaddd ymm9,ymm9,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm13,ymm13,ymm5
|
|
vpaddd ymm13,ymm13,ymm7
|
|
vmovd xmm5,DWORD[44+r12]
|
|
vmovd xmm0,DWORD[44+r8]
|
|
vmovd xmm1,DWORD[44+r13]
|
|
vmovd xmm2,DWORD[44+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[44+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[44+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[44+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[44+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm9,6
|
|
vpslld ymm2,ymm9,26
|
|
vmovdqu YMMWORD[(352-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm12
|
|
|
|
vpsrld ymm1,ymm9,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm9,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-32))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm9,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm9,7
|
|
vpandn ymm0,ymm9,ymm11
|
|
vpand ymm4,ymm9,ymm10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm12,ymm13,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm13,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm14,ymm13
|
|
|
|
vpxor ymm12,ymm12,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm13,13
|
|
|
|
vpslld ymm2,ymm13,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm12,ymm1
|
|
|
|
vpsrld ymm1,ymm13,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm13,10
|
|
vpxor ymm12,ymm14,ymm3
|
|
vpaddd ymm8,ymm8,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm12,ymm12,ymm5
|
|
vpaddd ymm12,ymm12,ymm7
|
|
vmovd xmm5,DWORD[48+r12]
|
|
vmovd xmm0,DWORD[48+r8]
|
|
vmovd xmm1,DWORD[48+r13]
|
|
vmovd xmm2,DWORD[48+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[48+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[48+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[48+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[48+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm8,6
|
|
vpslld ymm2,ymm8,26
|
|
vmovdqu YMMWORD[(384-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm11
|
|
|
|
vpsrld ymm1,ymm8,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm8,21
|
|
vpaddd ymm5,ymm5,YMMWORD[rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm8,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,7
|
|
vpandn ymm0,ymm8,ymm10
|
|
vpand ymm3,ymm8,ymm9
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm11,ymm12,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm12,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm13,ymm12
|
|
|
|
vpxor ymm11,ymm11,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm12,13
|
|
|
|
vpslld ymm2,ymm12,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm11,ymm1
|
|
|
|
vpsrld ymm1,ymm12,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,10
|
|
vpxor ymm11,ymm13,ymm4
|
|
vpaddd ymm15,ymm15,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm11,ymm11,ymm5
|
|
vpaddd ymm11,ymm11,ymm7
|
|
vmovd xmm5,DWORD[52+r12]
|
|
vmovd xmm0,DWORD[52+r8]
|
|
vmovd xmm1,DWORD[52+r13]
|
|
vmovd xmm2,DWORD[52+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[52+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[52+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[52+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[52+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm15,6
|
|
vpslld ymm2,ymm15,26
|
|
vmovdqu YMMWORD[(416-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm10
|
|
|
|
vpsrld ymm1,ymm15,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm15,21
|
|
vpaddd ymm5,ymm5,YMMWORD[32+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm15,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,7
|
|
vpandn ymm0,ymm15,ymm9
|
|
vpand ymm4,ymm15,ymm8
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm10,ymm11,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm11,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm12,ymm11
|
|
|
|
vpxor ymm10,ymm10,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm11,13
|
|
|
|
vpslld ymm2,ymm11,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm10,ymm1
|
|
|
|
vpsrld ymm1,ymm11,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,10
|
|
vpxor ymm10,ymm12,ymm3
|
|
vpaddd ymm14,ymm14,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm10,ymm10,ymm5
|
|
vpaddd ymm10,ymm10,ymm7
|
|
vmovd xmm5,DWORD[56+r12]
|
|
vmovd xmm0,DWORD[56+r8]
|
|
vmovd xmm1,DWORD[56+r13]
|
|
vmovd xmm2,DWORD[56+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[56+r14],1
|
|
vpinsrd xmm0,xmm0,DWORD[56+r10],1
|
|
vpinsrd xmm1,xmm1,DWORD[56+r15],1
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[56+r11],1
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm14,6
|
|
vpslld ymm2,ymm14,26
|
|
vmovdqu YMMWORD[(448-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm9
|
|
|
|
vpsrld ymm1,ymm14,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm14,21
|
|
vpaddd ymm5,ymm5,YMMWORD[64+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm14,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,7
|
|
vpandn ymm0,ymm14,ymm8
|
|
vpand ymm3,ymm14,ymm15
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm9,ymm10,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm10,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm11,ymm10
|
|
|
|
vpxor ymm9,ymm9,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm10,13
|
|
|
|
vpslld ymm2,ymm10,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm9,ymm1
|
|
|
|
vpsrld ymm1,ymm10,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,10
|
|
vpxor ymm9,ymm11,ymm4
|
|
vpaddd ymm13,ymm13,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm9,ymm9,ymm5
|
|
vpaddd ymm9,ymm9,ymm7
|
|
vmovd xmm5,DWORD[60+r12]
|
|
lea r12,[64+r12]
|
|
vmovd xmm0,DWORD[60+r8]
|
|
lea r8,[64+r8]
|
|
vmovd xmm1,DWORD[60+r13]
|
|
lea r13,[64+r13]
|
|
vmovd xmm2,DWORD[60+r9]
|
|
lea r9,[64+r9]
|
|
vpinsrd xmm5,xmm5,DWORD[60+r14],1
|
|
lea r14,[64+r14]
|
|
vpinsrd xmm0,xmm0,DWORD[60+r10],1
|
|
lea r10,[64+r10]
|
|
vpinsrd xmm1,xmm1,DWORD[60+r15],1
|
|
lea r15,[64+r15]
|
|
vpunpckldq ymm5,ymm5,ymm1
|
|
vpinsrd xmm2,xmm2,DWORD[60+r11],1
|
|
lea r11,[64+r11]
|
|
vpunpckldq ymm0,ymm0,ymm2
|
|
vinserti128 ymm5,ymm5,xmm0,1
|
|
vpshufb ymm5,ymm5,ymm6
|
|
vpsrld ymm7,ymm13,6
|
|
vpslld ymm2,ymm13,26
|
|
vmovdqu YMMWORD[(480-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm8
|
|
|
|
vpsrld ymm1,ymm13,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm13,21
|
|
vpaddd ymm5,ymm5,YMMWORD[96+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm13,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
prefetcht0 [63+r12]
|
|
vpslld ymm2,ymm13,7
|
|
vpandn ymm0,ymm13,ymm15
|
|
vpand ymm4,ymm13,ymm14
|
|
prefetcht0 [63+r13]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm8,ymm9,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
prefetcht0 [63+r14]
|
|
vpslld ymm1,ymm9,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm10,ymm9
|
|
prefetcht0 [63+r15]
|
|
vpxor ymm8,ymm8,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm9,13
|
|
prefetcht0 [63+r8]
|
|
vpslld ymm2,ymm9,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
prefetcht0 [63+r9]
|
|
vpxor ymm7,ymm8,ymm1
|
|
|
|
vpsrld ymm1,ymm9,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
prefetcht0 [63+r10]
|
|
vpslld ymm2,ymm9,10
|
|
vpxor ymm8,ymm10,ymm3
|
|
vpaddd ymm12,ymm12,ymm5
|
|
prefetcht0 [63+r11]
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm8,ymm8,ymm5
|
|
vpaddd ymm8,ymm8,ymm7
|
|
add rbp,256
|
|
vmovdqu ymm5,YMMWORD[((0-128))+rax]
|
|
mov ecx,3
|
|
jmp NEAR $L$oop_16_xx_avx2
|
|
ALIGN 32
|
|
$L$oop_16_xx_avx2:
|
|
vmovdqu ymm6,YMMWORD[((32-128))+rax]
|
|
vpaddd ymm5,ymm5,YMMWORD[((288-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((448-256-128))+rbx]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm12,6
|
|
vpslld ymm2,ymm12,26
|
|
vmovdqu YMMWORD[(0-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm15
|
|
|
|
vpsrld ymm1,ymm12,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm12,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm12,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,7
|
|
vpandn ymm0,ymm12,ymm14
|
|
vpand ymm3,ymm12,ymm13
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm15,ymm8,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm8,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm9,ymm8
|
|
|
|
vpxor ymm15,ymm15,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm8,13
|
|
|
|
vpslld ymm2,ymm8,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm15,ymm1
|
|
|
|
vpsrld ymm1,ymm8,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,10
|
|
vpxor ymm15,ymm9,ymm4
|
|
vpaddd ymm11,ymm11,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm15,ymm15,ymm5
|
|
vpaddd ymm15,ymm15,ymm7
|
|
vmovdqu ymm5,YMMWORD[((64-128))+rax]
|
|
vpaddd ymm6,ymm6,YMMWORD[((320-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((480-256-128))+rbx]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm11,6
|
|
vpslld ymm2,ymm11,26
|
|
vmovdqu YMMWORD[(32-128)+rax],ymm6
|
|
vpaddd ymm6,ymm6,ymm14
|
|
|
|
vpsrld ymm1,ymm11,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm11,21
|
|
vpaddd ymm6,ymm6,YMMWORD[((-96))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm11,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,7
|
|
vpandn ymm0,ymm11,ymm13
|
|
vpand ymm4,ymm11,ymm12
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm14,ymm15,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm15,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm8,ymm15
|
|
|
|
vpxor ymm14,ymm14,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm15,13
|
|
|
|
vpslld ymm2,ymm15,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm14,ymm1
|
|
|
|
vpsrld ymm1,ymm15,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,10
|
|
vpxor ymm14,ymm8,ymm3
|
|
vpaddd ymm10,ymm10,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm14,ymm14,ymm6
|
|
vpaddd ymm14,ymm14,ymm7
|
|
vmovdqu ymm6,YMMWORD[((96-128))+rax]
|
|
vpaddd ymm5,ymm5,YMMWORD[((352-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((0-128))+rax]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm10,6
|
|
vpslld ymm2,ymm10,26
|
|
vmovdqu YMMWORD[(64-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm13
|
|
|
|
vpsrld ymm1,ymm10,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm10,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm10,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,7
|
|
vpandn ymm0,ymm10,ymm12
|
|
vpand ymm3,ymm10,ymm11
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm13,ymm14,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm14,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm15,ymm14
|
|
|
|
vpxor ymm13,ymm13,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm14,13
|
|
|
|
vpslld ymm2,ymm14,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm13,ymm1
|
|
|
|
vpsrld ymm1,ymm14,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,10
|
|
vpxor ymm13,ymm15,ymm4
|
|
vpaddd ymm9,ymm9,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm13,ymm13,ymm5
|
|
vpaddd ymm13,ymm13,ymm7
|
|
vmovdqu ymm5,YMMWORD[((128-128))+rax]
|
|
vpaddd ymm6,ymm6,YMMWORD[((384-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((32-128))+rax]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm9,6
|
|
vpslld ymm2,ymm9,26
|
|
vmovdqu YMMWORD[(96-128)+rax],ymm6
|
|
vpaddd ymm6,ymm6,ymm12
|
|
|
|
vpsrld ymm1,ymm9,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm9,21
|
|
vpaddd ymm6,ymm6,YMMWORD[((-32))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm9,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm9,7
|
|
vpandn ymm0,ymm9,ymm11
|
|
vpand ymm4,ymm9,ymm10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm12,ymm13,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm13,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm14,ymm13
|
|
|
|
vpxor ymm12,ymm12,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm13,13
|
|
|
|
vpslld ymm2,ymm13,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm12,ymm1
|
|
|
|
vpsrld ymm1,ymm13,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm13,10
|
|
vpxor ymm12,ymm14,ymm3
|
|
vpaddd ymm8,ymm8,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm12,ymm12,ymm6
|
|
vpaddd ymm12,ymm12,ymm7
|
|
vmovdqu ymm6,YMMWORD[((160-128))+rax]
|
|
vpaddd ymm5,ymm5,YMMWORD[((416-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((64-128))+rax]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm8,6
|
|
vpslld ymm2,ymm8,26
|
|
vmovdqu YMMWORD[(128-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm11
|
|
|
|
vpsrld ymm1,ymm8,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm8,21
|
|
vpaddd ymm5,ymm5,YMMWORD[rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm8,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,7
|
|
vpandn ymm0,ymm8,ymm10
|
|
vpand ymm3,ymm8,ymm9
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm11,ymm12,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm12,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm13,ymm12
|
|
|
|
vpxor ymm11,ymm11,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm12,13
|
|
|
|
vpslld ymm2,ymm12,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm11,ymm1
|
|
|
|
vpsrld ymm1,ymm12,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,10
|
|
vpxor ymm11,ymm13,ymm4
|
|
vpaddd ymm15,ymm15,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm11,ymm11,ymm5
|
|
vpaddd ymm11,ymm11,ymm7
|
|
vmovdqu ymm5,YMMWORD[((192-128))+rax]
|
|
vpaddd ymm6,ymm6,YMMWORD[((448-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((96-128))+rax]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm15,6
|
|
vpslld ymm2,ymm15,26
|
|
vmovdqu YMMWORD[(160-128)+rax],ymm6
|
|
vpaddd ymm6,ymm6,ymm10
|
|
|
|
vpsrld ymm1,ymm15,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm15,21
|
|
vpaddd ymm6,ymm6,YMMWORD[32+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm15,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,7
|
|
vpandn ymm0,ymm15,ymm9
|
|
vpand ymm4,ymm15,ymm8
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm10,ymm11,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm11,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm12,ymm11
|
|
|
|
vpxor ymm10,ymm10,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm11,13
|
|
|
|
vpslld ymm2,ymm11,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm10,ymm1
|
|
|
|
vpsrld ymm1,ymm11,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,10
|
|
vpxor ymm10,ymm12,ymm3
|
|
vpaddd ymm14,ymm14,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm10,ymm10,ymm6
|
|
vpaddd ymm10,ymm10,ymm7
|
|
vmovdqu ymm6,YMMWORD[((224-128))+rax]
|
|
vpaddd ymm5,ymm5,YMMWORD[((480-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((128-128))+rax]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm14,6
|
|
vpslld ymm2,ymm14,26
|
|
vmovdqu YMMWORD[(192-128)+rax],ymm5
|
|
vpaddd ymm5,ymm5,ymm9
|
|
|
|
vpsrld ymm1,ymm14,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm14,21
|
|
vpaddd ymm5,ymm5,YMMWORD[64+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm14,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,7
|
|
vpandn ymm0,ymm14,ymm8
|
|
vpand ymm3,ymm14,ymm15
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm9,ymm10,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm10,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm11,ymm10
|
|
|
|
vpxor ymm9,ymm9,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm10,13
|
|
|
|
vpslld ymm2,ymm10,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm9,ymm1
|
|
|
|
vpsrld ymm1,ymm10,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,10
|
|
vpxor ymm9,ymm11,ymm4
|
|
vpaddd ymm13,ymm13,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm9,ymm9,ymm5
|
|
vpaddd ymm9,ymm9,ymm7
|
|
vmovdqu ymm5,YMMWORD[((256-256-128))+rbx]
|
|
vpaddd ymm6,ymm6,YMMWORD[((0-128))+rax]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((160-128))+rax]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm13,6
|
|
vpslld ymm2,ymm13,26
|
|
vmovdqu YMMWORD[(224-128)+rax],ymm6
|
|
vpaddd ymm6,ymm6,ymm8
|
|
|
|
vpsrld ymm1,ymm13,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm13,21
|
|
vpaddd ymm6,ymm6,YMMWORD[96+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm13,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm13,7
|
|
vpandn ymm0,ymm13,ymm15
|
|
vpand ymm4,ymm13,ymm14
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm8,ymm9,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm9,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm10,ymm9
|
|
|
|
vpxor ymm8,ymm8,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm9,13
|
|
|
|
vpslld ymm2,ymm9,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm8,ymm1
|
|
|
|
vpsrld ymm1,ymm9,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm9,10
|
|
vpxor ymm8,ymm10,ymm3
|
|
vpaddd ymm12,ymm12,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm8,ymm8,ymm6
|
|
vpaddd ymm8,ymm8,ymm7
|
|
add rbp,256
|
|
vmovdqu ymm6,YMMWORD[((288-256-128))+rbx]
|
|
vpaddd ymm5,ymm5,YMMWORD[((32-128))+rax]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((192-128))+rax]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm12,6
|
|
vpslld ymm2,ymm12,26
|
|
vmovdqu YMMWORD[(256-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm15
|
|
|
|
vpsrld ymm1,ymm12,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm12,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm12,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,7
|
|
vpandn ymm0,ymm12,ymm14
|
|
vpand ymm3,ymm12,ymm13
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm15,ymm8,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm8,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm9,ymm8
|
|
|
|
vpxor ymm15,ymm15,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm8,13
|
|
|
|
vpslld ymm2,ymm8,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm15,ymm1
|
|
|
|
vpsrld ymm1,ymm8,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,10
|
|
vpxor ymm15,ymm9,ymm4
|
|
vpaddd ymm11,ymm11,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm15,ymm15,ymm5
|
|
vpaddd ymm15,ymm15,ymm7
|
|
vmovdqu ymm5,YMMWORD[((320-256-128))+rbx]
|
|
vpaddd ymm6,ymm6,YMMWORD[((64-128))+rax]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((224-128))+rax]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm11,6
|
|
vpslld ymm2,ymm11,26
|
|
vmovdqu YMMWORD[(288-256-128)+rbx],ymm6
|
|
vpaddd ymm6,ymm6,ymm14
|
|
|
|
vpsrld ymm1,ymm11,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm11,21
|
|
vpaddd ymm6,ymm6,YMMWORD[((-96))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm11,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,7
|
|
vpandn ymm0,ymm11,ymm13
|
|
vpand ymm4,ymm11,ymm12
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm14,ymm15,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm15,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm8,ymm15
|
|
|
|
vpxor ymm14,ymm14,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm15,13
|
|
|
|
vpslld ymm2,ymm15,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm14,ymm1
|
|
|
|
vpsrld ymm1,ymm15,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,10
|
|
vpxor ymm14,ymm8,ymm3
|
|
vpaddd ymm10,ymm10,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm14,ymm14,ymm6
|
|
vpaddd ymm14,ymm14,ymm7
|
|
vmovdqu ymm6,YMMWORD[((352-256-128))+rbx]
|
|
vpaddd ymm5,ymm5,YMMWORD[((96-128))+rax]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((256-256-128))+rbx]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm10,6
|
|
vpslld ymm2,ymm10,26
|
|
vmovdqu YMMWORD[(320-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm13
|
|
|
|
vpsrld ymm1,ymm10,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm10,21
|
|
vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm10,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,7
|
|
vpandn ymm0,ymm10,ymm12
|
|
vpand ymm3,ymm10,ymm11
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm13,ymm14,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm14,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm15,ymm14
|
|
|
|
vpxor ymm13,ymm13,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm14,13
|
|
|
|
vpslld ymm2,ymm14,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm13,ymm1
|
|
|
|
vpsrld ymm1,ymm14,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,10
|
|
vpxor ymm13,ymm15,ymm4
|
|
vpaddd ymm9,ymm9,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm13,ymm13,ymm5
|
|
vpaddd ymm13,ymm13,ymm7
|
|
vmovdqu ymm5,YMMWORD[((384-256-128))+rbx]
|
|
vpaddd ymm6,ymm6,YMMWORD[((128-128))+rax]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((288-256-128))+rbx]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm9,6
|
|
vpslld ymm2,ymm9,26
|
|
vmovdqu YMMWORD[(352-256-128)+rbx],ymm6
|
|
vpaddd ymm6,ymm6,ymm12
|
|
|
|
vpsrld ymm1,ymm9,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm9,21
|
|
vpaddd ymm6,ymm6,YMMWORD[((-32))+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm9,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm9,7
|
|
vpandn ymm0,ymm9,ymm11
|
|
vpand ymm4,ymm9,ymm10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm12,ymm13,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm13,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm14,ymm13
|
|
|
|
vpxor ymm12,ymm12,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm13,13
|
|
|
|
vpslld ymm2,ymm13,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm12,ymm1
|
|
|
|
vpsrld ymm1,ymm13,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm13,10
|
|
vpxor ymm12,ymm14,ymm3
|
|
vpaddd ymm8,ymm8,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm12,ymm12,ymm6
|
|
vpaddd ymm12,ymm12,ymm7
|
|
vmovdqu ymm6,YMMWORD[((416-256-128))+rbx]
|
|
vpaddd ymm5,ymm5,YMMWORD[((160-128))+rax]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((320-256-128))+rbx]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm8,6
|
|
vpslld ymm2,ymm8,26
|
|
vmovdqu YMMWORD[(384-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm11
|
|
|
|
vpsrld ymm1,ymm8,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm8,21
|
|
vpaddd ymm5,ymm5,YMMWORD[rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm8,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm8,7
|
|
vpandn ymm0,ymm8,ymm10
|
|
vpand ymm3,ymm8,ymm9
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm11,ymm12,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm12,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm13,ymm12
|
|
|
|
vpxor ymm11,ymm11,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm12,13
|
|
|
|
vpslld ymm2,ymm12,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm11,ymm1
|
|
|
|
vpsrld ymm1,ymm12,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm12,10
|
|
vpxor ymm11,ymm13,ymm4
|
|
vpaddd ymm15,ymm15,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm11,ymm11,ymm5
|
|
vpaddd ymm11,ymm11,ymm7
|
|
vmovdqu ymm5,YMMWORD[((448-256-128))+rbx]
|
|
vpaddd ymm6,ymm6,YMMWORD[((192-128))+rax]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((352-256-128))+rbx]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm15,6
|
|
vpslld ymm2,ymm15,26
|
|
vmovdqu YMMWORD[(416-256-128)+rbx],ymm6
|
|
vpaddd ymm6,ymm6,ymm10
|
|
|
|
vpsrld ymm1,ymm15,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm15,21
|
|
vpaddd ymm6,ymm6,YMMWORD[32+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm15,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm15,7
|
|
vpandn ymm0,ymm15,ymm9
|
|
vpand ymm4,ymm15,ymm8
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm10,ymm11,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm11,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm12,ymm11
|
|
|
|
vpxor ymm10,ymm10,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm11,13
|
|
|
|
vpslld ymm2,ymm11,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm10,ymm1
|
|
|
|
vpsrld ymm1,ymm11,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm11,10
|
|
vpxor ymm10,ymm12,ymm3
|
|
vpaddd ymm14,ymm14,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm10,ymm10,ymm6
|
|
vpaddd ymm10,ymm10,ymm7
|
|
vmovdqu ymm6,YMMWORD[((480-256-128))+rbx]
|
|
vpaddd ymm5,ymm5,YMMWORD[((224-128))+rax]
|
|
|
|
vpsrld ymm7,ymm6,3
|
|
vpsrld ymm1,ymm6,7
|
|
vpslld ymm2,ymm6,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm6,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm6,14
|
|
vmovdqu ymm0,YMMWORD[((384-256-128))+rbx]
|
|
vpsrld ymm3,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpxor ymm7,ymm3,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm5,ymm5,ymm7
|
|
vpsrld ymm7,ymm14,6
|
|
vpslld ymm2,ymm14,26
|
|
vmovdqu YMMWORD[(448-256-128)+rbx],ymm5
|
|
vpaddd ymm5,ymm5,ymm9
|
|
|
|
vpsrld ymm1,ymm14,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm14,21
|
|
vpaddd ymm5,ymm5,YMMWORD[64+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm14,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm14,7
|
|
vpandn ymm0,ymm14,ymm8
|
|
vpand ymm3,ymm14,ymm15
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm9,ymm10,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm10,30
|
|
vpxor ymm0,ymm0,ymm3
|
|
vpxor ymm3,ymm11,ymm10
|
|
|
|
vpxor ymm9,ymm9,ymm1
|
|
vpaddd ymm5,ymm5,ymm7
|
|
|
|
vpsrld ymm1,ymm10,13
|
|
|
|
vpslld ymm2,ymm10,19
|
|
vpaddd ymm5,ymm5,ymm0
|
|
vpand ymm4,ymm4,ymm3
|
|
|
|
vpxor ymm7,ymm9,ymm1
|
|
|
|
vpsrld ymm1,ymm10,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm10,10
|
|
vpxor ymm9,ymm11,ymm4
|
|
vpaddd ymm13,ymm13,ymm5
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm9,ymm9,ymm5
|
|
vpaddd ymm9,ymm9,ymm7
|
|
vmovdqu ymm5,YMMWORD[((0-128))+rax]
|
|
vpaddd ymm6,ymm6,YMMWORD[((256-256-128))+rbx]
|
|
|
|
vpsrld ymm7,ymm5,3
|
|
vpsrld ymm1,ymm5,7
|
|
vpslld ymm2,ymm5,25
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm5,18
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm5,14
|
|
vmovdqu ymm0,YMMWORD[((416-256-128))+rbx]
|
|
vpsrld ymm4,ymm0,10
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpsrld ymm1,ymm0,17
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,15
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpxor ymm7,ymm4,ymm1
|
|
vpsrld ymm1,ymm0,19
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm0,13
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpaddd ymm6,ymm6,ymm7
|
|
vpsrld ymm7,ymm13,6
|
|
vpslld ymm2,ymm13,26
|
|
vmovdqu YMMWORD[(480-256-128)+rbx],ymm6
|
|
vpaddd ymm6,ymm6,ymm8
|
|
|
|
vpsrld ymm1,ymm13,11
|
|
vpxor ymm7,ymm7,ymm2
|
|
vpslld ymm2,ymm13,21
|
|
vpaddd ymm6,ymm6,YMMWORD[96+rbp]
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm1,ymm13,25
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm13,7
|
|
vpandn ymm0,ymm13,ymm15
|
|
vpand ymm4,ymm13,ymm14
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
|
|
vpsrld ymm8,ymm9,2
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm1,ymm9,30
|
|
vpxor ymm0,ymm0,ymm4
|
|
vpxor ymm4,ymm10,ymm9
|
|
|
|
vpxor ymm8,ymm8,ymm1
|
|
vpaddd ymm6,ymm6,ymm7
|
|
|
|
vpsrld ymm1,ymm9,13
|
|
|
|
vpslld ymm2,ymm9,19
|
|
vpaddd ymm6,ymm6,ymm0
|
|
vpand ymm3,ymm3,ymm4
|
|
|
|
vpxor ymm7,ymm8,ymm1
|
|
|
|
vpsrld ymm1,ymm9,22
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpslld ymm2,ymm9,10
|
|
vpxor ymm8,ymm10,ymm3
|
|
vpaddd ymm12,ymm12,ymm6
|
|
|
|
vpxor ymm7,ymm7,ymm1
|
|
vpxor ymm7,ymm7,ymm2
|
|
|
|
vpaddd ymm8,ymm8,ymm6
|
|
vpaddd ymm8,ymm8,ymm7
|
|
add rbp,256
|
|
dec ecx
|
|
jnz NEAR $L$oop_16_xx_avx2
|
|
|
|
mov ecx,1
|
|
lea rbx,[512+rsp]
|
|
lea rbp,[((K256+128))]
|
|
cmp ecx,DWORD[rbx]
|
|
cmovge r12,rbp
|
|
cmp ecx,DWORD[4+rbx]
|
|
cmovge r13,rbp
|
|
cmp ecx,DWORD[8+rbx]
|
|
cmovge r14,rbp
|
|
cmp ecx,DWORD[12+rbx]
|
|
cmovge r15,rbp
|
|
cmp ecx,DWORD[16+rbx]
|
|
cmovge r8,rbp
|
|
cmp ecx,DWORD[20+rbx]
|
|
cmovge r9,rbp
|
|
cmp ecx,DWORD[24+rbx]
|
|
cmovge r10,rbp
|
|
cmp ecx,DWORD[28+rbx]
|
|
cmovge r11,rbp
|
|
vmovdqa ymm7,YMMWORD[rbx]
|
|
vpxor ymm0,ymm0,ymm0
|
|
vmovdqa ymm6,ymm7
|
|
vpcmpgtd ymm6,ymm6,ymm0
|
|
vpaddd ymm7,ymm7,ymm6
|
|
|
|
vmovdqu ymm0,YMMWORD[((0-128))+rdi]
|
|
vpand ymm8,ymm8,ymm6
|
|
vmovdqu ymm1,YMMWORD[((32-128))+rdi]
|
|
vpand ymm9,ymm9,ymm6
|
|
vmovdqu ymm2,YMMWORD[((64-128))+rdi]
|
|
vpand ymm10,ymm10,ymm6
|
|
vmovdqu ymm5,YMMWORD[((96-128))+rdi]
|
|
vpand ymm11,ymm11,ymm6
|
|
vpaddd ymm8,ymm8,ymm0
|
|
vmovdqu ymm0,YMMWORD[((128-128))+rdi]
|
|
vpand ymm12,ymm12,ymm6
|
|
vpaddd ymm9,ymm9,ymm1
|
|
vmovdqu ymm1,YMMWORD[((160-128))+rdi]
|
|
vpand ymm13,ymm13,ymm6
|
|
vpaddd ymm10,ymm10,ymm2
|
|
vmovdqu ymm2,YMMWORD[((192-128))+rdi]
|
|
vpand ymm14,ymm14,ymm6
|
|
vpaddd ymm11,ymm11,ymm5
|
|
vmovdqu ymm5,YMMWORD[((224-128))+rdi]
|
|
vpand ymm15,ymm15,ymm6
|
|
vpaddd ymm12,ymm12,ymm0
|
|
vpaddd ymm13,ymm13,ymm1
|
|
vmovdqu YMMWORD[(0-128)+rdi],ymm8
|
|
vpaddd ymm14,ymm14,ymm2
|
|
vmovdqu YMMWORD[(32-128)+rdi],ymm9
|
|
vpaddd ymm15,ymm15,ymm5
|
|
vmovdqu YMMWORD[(64-128)+rdi],ymm10
|
|
vmovdqu YMMWORD[(96-128)+rdi],ymm11
|
|
vmovdqu YMMWORD[(128-128)+rdi],ymm12
|
|
vmovdqu YMMWORD[(160-128)+rdi],ymm13
|
|
vmovdqu YMMWORD[(192-128)+rdi],ymm14
|
|
vmovdqu YMMWORD[(224-128)+rdi],ymm15
|
|
|
|
vmovdqu YMMWORD[rbx],ymm7
|
|
lea rbx,[((256+128))+rsp]
|
|
vmovdqu ymm6,YMMWORD[$L$pbswap]
|
|
dec edx
|
|
jnz NEAR $L$oop_avx2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$L$done_avx2:
|
|
mov rax,QWORD[544+rsp]
|
|
|
|
vzeroupper
|
|
movaps xmm6,XMMWORD[((-216))+rax]
|
|
movaps xmm7,XMMWORD[((-200))+rax]
|
|
movaps xmm8,XMMWORD[((-184))+rax]
|
|
movaps xmm9,XMMWORD[((-168))+rax]
|
|
movaps xmm10,XMMWORD[((-152))+rax]
|
|
movaps xmm11,XMMWORD[((-136))+rax]
|
|
movaps xmm12,XMMWORD[((-120))+rax]
|
|
movaps xmm13,XMMWORD[((-104))+rax]
|
|
movaps xmm14,XMMWORD[((-88))+rax]
|
|
movaps xmm15,XMMWORD[((-72))+rax]
|
|
mov r15,QWORD[((-48))+rax]
|
|
|
|
mov r14,QWORD[((-40))+rax]
|
|
|
|
mov r13,QWORD[((-32))+rax]
|
|
|
|
mov r12,QWORD[((-24))+rax]
|
|
|
|
mov rbp,QWORD[((-16))+rax]
|
|
|
|
mov rbx,QWORD[((-8))+rax]
|
|
|
|
lea rsp,[rax]
|
|
|
|
$L$epilogue_avx2:
|
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
$L$SEH_end_sha256_multi_block_avx2:
|
|
ALIGN   256
; K256 -- SHA-256 round constants K[0]..K[63] in round order (decimal).
; Every constant is replicated across 8 consecutive dwords (32 bytes), so
; one table row fills a whole YMM register; the AVX2 round code above adds
; rows with operands such as YMMWORD[rbp] / YMMWORD[32+rbp] after pointing
; rbp at K256+128.
K256:
        TIMES 8 DD 1116352408           ; K[0]
        TIMES 8 DD 1899447441           ; K[1]
        TIMES 8 DD 3049323471           ; K[2]
        TIMES 8 DD 3921009573           ; K[3]
        TIMES 8 DD 961987163            ; K[4]
        TIMES 8 DD 1508970993           ; K[5]
        TIMES 8 DD 2453635748           ; K[6]
        TIMES 8 DD 2870763221           ; K[7]
        TIMES 8 DD 3624381080           ; K[8]
        TIMES 8 DD 310598401            ; K[9]
        TIMES 8 DD 607225278            ; K[10]
        TIMES 8 DD 1426881987           ; K[11]
        TIMES 8 DD 1925078388           ; K[12]
        TIMES 8 DD 2162078206           ; K[13]
        TIMES 8 DD 2614888103           ; K[14]
        TIMES 8 DD 3248222580           ; K[15]
        TIMES 8 DD 3835390401           ; K[16]
        TIMES 8 DD 4022224774           ; K[17]
        TIMES 8 DD 264347078            ; K[18]
        TIMES 8 DD 604807628            ; K[19]
        TIMES 8 DD 770255983            ; K[20]
        TIMES 8 DD 1249150122           ; K[21]
        TIMES 8 DD 1555081692           ; K[22]
        TIMES 8 DD 1996064986           ; K[23]
        TIMES 8 DD 2554220882           ; K[24]
        TIMES 8 DD 2821834349           ; K[25]
        TIMES 8 DD 2952996808           ; K[26]
        TIMES 8 DD 3210313671           ; K[27]
        TIMES 8 DD 3336571891           ; K[28]
        TIMES 8 DD 3584528711           ; K[29]
        TIMES 8 DD 113926993            ; K[30]
        TIMES 8 DD 338241895            ; K[31]
        TIMES 8 DD 666307205            ; K[32]
        TIMES 8 DD 773529912            ; K[33]
        TIMES 8 DD 1294757372           ; K[34]
        TIMES 8 DD 1396182291           ; K[35]
        TIMES 8 DD 1695183700           ; K[36]
        TIMES 8 DD 1986661051           ; K[37]
        TIMES 8 DD 2177026350           ; K[38]
        TIMES 8 DD 2456956037           ; K[39]
        TIMES 8 DD 2730485921           ; K[40]
        TIMES 8 DD 2820302411           ; K[41]
        TIMES 8 DD 3259730800           ; K[42]
        TIMES 8 DD 3345764771           ; K[43]
        TIMES 8 DD 3516065817           ; K[44]
        TIMES 8 DD 3600352804           ; K[45]
        TIMES 8 DD 4094571909           ; K[46]
        TIMES 8 DD 275423344            ; K[47]
        TIMES 8 DD 430227734            ; K[48]
        TIMES 8 DD 506948616            ; K[49]
        TIMES 8 DD 659060556            ; K[50]
        TIMES 8 DD 883997877            ; K[51]
        TIMES 8 DD 958139571            ; K[52]
        TIMES 8 DD 1322822218           ; K[53]
        TIMES 8 DD 1537002063           ; K[54]
        TIMES 8 DD 1747873779           ; K[55]
        TIMES 8 DD 1955562222           ; K[56]
        TIMES 8 DD 2024104815           ; K[57]
        TIMES 8 DD 2227730452           ; K[58]
        TIMES 8 DD 2361852424           ; K[59]
        TIMES 8 DD 2428436474           ; K[60]
        TIMES 8 DD 2756734187           ; K[61]
        TIMES 8 DD 3204031479           ; K[62]
        TIMES 8 DD 3329325298           ; K[63]
|
|
; $L$pbswap -- 32-byte constant loaded into ymm6 by the AVX2 loop
; (vmovdqu ymm6,YMMWORD[$L$pbswap]).  The same 16-byte pattern is stored
; twice, once per 128-bit lane.  Each dword lists byte indices in reversed
; order within a 4-byte group -- presumably a vpshufb mask that byte-swaps
; every 32-bit message word (confirm against the load sites).
$L$pbswap:
        TIMES 2 DD 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
|
|
; K256_shaext -- the 64 SHA-256 round constants stored once each (no lane
; replication), four dwords per row, in round order.
; NOTE(review): by its name this is the table for the SHA-extension code
; path; that code is outside this part of the file -- confirm there.
K256_shaext:
        DD      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5  ; K[0..3]
        DD      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5  ; K[4..7]
        DD      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3  ; K[8..11]
        DD      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174  ; K[12..15]
        DD      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc  ; K[16..19]
        DD      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da  ; K[20..23]
        DD      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7  ; K[24..27]
        DD      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967  ; K[28..31]
        DD      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13  ; K[32..35]
        DD      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85  ; K[36..39]
        DD      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3  ; K[40..43]
        DD      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070  ; K[44..47]
        DD      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5  ; K[48..51]
        DD      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3  ; K[52..55]
        DD      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208  ; K[56..59]
        DD      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2  ; K[60..63]
|
|
; NUL-terminated ASCII identification string embedded in the image.
; Written as a quoted literal; it assembles to the same bytes as the
; numeric DB list it replaces.
        DB      "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>",0
|
|
EXTERN __imp_RtlVirtualUnwind
|
|
|
|
ALIGN   16
;-----------------------------------------------------------------------
; se_handler -- Win64 language-specific exception handler registered in
; .xdata for the sha256_multi_block, _shaext and _avx entry points.
;
; In:   r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT
;       HandlerData[0] / HandlerData[1] = image-relative offsets of the
;       function's "body" and "epilogue" labels (see the .xdata entries).
; Out:  eax = 1
;
; If the faulting Rip lies in [body, epilogue) the routine recovers the
; caller's stack pointer stored in the frame at [272+Rsp], then restores
; rbx, rbp and xmm6-xmm15 into the CONTEXT.  In every case it restores
; rdi/rsi from the home slots, copies the CONTEXT to the dispatcher's
; record and calls RtlVirtualUnwind.
;
; Field names in the comments follow the Windows x64 CONTEXT and
; DISPATCHER_CONTEXT layouts; the code itself only uses raw offsets.
;-----------------------------------------------------------------------
se_handler:
        ; Preserve every non-volatile GPR plus flags (9 pushes), then
        ; reserve 64 bytes: 32 of shadow space + 4 stack arguments.
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64

        mov     rsi,QWORD[8+r9]         ; disp->ImageBase
        mov     r11,QWORD[56+r9]        ; disp->HandlerData
        mov     rbx,QWORD[248+r8]       ; context->Rip
        mov     rax,QWORD[120+r8]       ; context->Rax (entry rsp while in prologue)

        ; Rip below the body label: the frame is not set up yet.
        mov     r10d,DWORD[r11]         ; HandlerData[0]
        lea     r10,[rsi+r10]
        cmp     rbx,r10
        jb      NEAR $L$in_prologue

        mov     rax,QWORD[152+r8]       ; context->Rsp

        ; Rip at or past the epilogue label: frame already torn down.
        mov     r10d,DWORD[4+r11]       ; HandlerData[1]
        lea     r10,[rsi+r10]
        cmp     rbx,r10
        jae     NEAR $L$in_prologue

        mov     rax,QWORD[272+rax]      ; caller's rsp saved in the frame

        mov     rbx,QWORD[((-8))+rax]
        mov     QWORD[144+r8],rbx       ; context->Rbx
        mov     rbp,QWORD[((-16))+rax]
        mov     QWORD[160+r8],rbp       ; context->Rbp

        ; Copy the ten saved XMM registers (20 qwords) into the CONTEXT.
        lea     rsi,[((-24-160))+rax]   ; saved xmm6
        lea     rdi,[512+r8]            ; context->Xmm6
        mov     ecx,20
        cld
        rep movsq

$L$in_prologue:
        ; rax = stack pointer as it was on function entry.
        mov     QWORD[152+r8],rax       ; context->Rsp
        mov     rdi,QWORD[8+rax]        ; rdi/rsi were spilled to the home
        mov     rsi,QWORD[16+rax]       ; slots by the WIN64 prologue
        mov     QWORD[168+r8],rsi       ; context->Rsi
        mov     QWORD[176+r8],rdi       ; context->Rdi

        ; Copy the adjusted CONTEXT (154 qwords) to disp->ContextRecord.
        mov     rdi,QWORD[40+r9]
        mov     rsi,r8
        mov     ecx,154
        cld
        rep movsq

        ; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
        ;                  ContextRecord, &HandlerData,
        ;                  &EstablisherFrame, NULL)
        mov     rsi,r9
        xor     rcx,rcx
        mov     rdx,QWORD[8+rsi]
        mov     r8,QWORD[rsi]
        mov     r9,QWORD[16+rsi]
        mov     r10,QWORD[40+rsi]
        mov     QWORD[32+rsp],r10
        lea     r11,[56+rsi]
        mov     QWORD[40+rsp],r11
        lea     r12,[24+rsi]
        mov     QWORD[48+rsp],r12
        mov     QWORD[56+rsp],rcx
        call    QWORD[__imp_RtlVirtualUnwind]

        mov     eax,1                   ; ExceptionContinueSearch
        add     rsp,64
        popfq
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        pop     rdi
        pop     rsi
        DB      0F3h,0C3h               ;repret
|
|
|
|
|
|
ALIGN   16
;-----------------------------------------------------------------------
; avx2_handler -- Win64 language-specific exception handler registered in
; .xdata for sha256_multi_block_avx2.
;
; In:   r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT
;       HandlerData[0] / HandlerData[1] = image-relative offsets of
;       $L$body_avx2 and $L$epilogue_avx2.
; Out:  does not return directly; always continues at $L$in_prologue
;       inside se_handler, which finishes the unwind and returns eax = 1.
;
; When the faulting Rip lies in [body, epilogue) the routine recovers the
; caller's stack pointer from the AVX2 frame and restores rbx, rbp,
; r12-r15 and xmm6-xmm15 into the CONTEXT before joining the common tail.
;
; Field names in the comments follow the Windows x64 CONTEXT and
; DISPATCHER_CONTEXT layouts; the code itself only uses raw offsets.
;-----------------------------------------------------------------------
avx2_handler:
        ; Same save area as se_handler: the shared tail at $L$in_prologue
        ; pops exactly this layout.
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64

        mov     rax,QWORD[120+r8]       ; context->Rax
        mov     rbx,QWORD[248+r8]       ; context->Rip

        mov     rsi,QWORD[8+r9]         ; disp->ImageBase
        mov     r11,QWORD[56+r9]        ; disp->HandlerData

        mov     r10d,DWORD[r11]         ; HandlerData[0]: body label
        lea     r10,[r10*1+rsi]
        cmp     rbx,r10
        jb      NEAR $L$in_prologue     ; Rip < body: frame not built yet

        mov     rax,QWORD[152+r8]       ; context->Rsp

        mov     r10d,DWORD[4+r11]       ; HandlerData[1]: epilogue label
        lea     r10,[r10*1+rsi]
        cmp     rbx,r10
        jae     NEAR $L$in_prologue     ; Rip >= epilogue: frame torn down

        ; The AVX2 frame keeps the caller's rsp at [544+rsp] (that is
        ; where the $L$done_avx2 epilogue reloads it from), so it has to
        ; be fetched relative to context->Rsp (rax).  Offset 544 from the
        ; CONTEXT record itself (r8) lies inside the XMM register area
        ; that this handler fills from [512+r8] below.
        mov     rax,QWORD[544+rax]

        mov     rbx,QWORD[((-8))+rax]
        mov     rbp,QWORD[((-16))+rax]
        mov     r12,QWORD[((-24))+rax]
        mov     r13,QWORD[((-32))+rax]
        mov     r14,QWORD[((-40))+rax]
        mov     r15,QWORD[((-48))+rax]
        mov     QWORD[144+r8],rbx       ; context->Rbx
        mov     QWORD[160+r8],rbp       ; context->Rbp
        mov     QWORD[216+r8],r12       ; context->R12
        mov     QWORD[224+r8],r13       ; context->R13
        mov     QWORD[232+r8],r14       ; context->R14
        mov     QWORD[240+r8],r15       ; context->R15

        ; Copy the ten saved XMM registers (20 qwords) into the CONTEXT.
        lea     rsi,[((-56-160))+rax]   ; saved xmm6 (matches the epilogue's -216)
        lea     rdi,[512+r8]            ; context->Xmm6
        mov     ecx,20
        DD      0xa548f3fc              ; bytes FC F3 48 A5 = cld; rep movsq

        jmp     NEAR $L$in_prologue
|
|
|
|
section .pdata rdata align=4
ALIGN   4
; Function table: one entry per code path, each made of three
; image-relative addresses -- first instruction, end of function, and the
; matching unwind-info record in .xdata.
        DD      $L$SEH_begin_sha256_multi_block wrt ..imagebase, $L$SEH_end_sha256_multi_block wrt ..imagebase, $L$SEH_info_sha256_multi_block wrt ..imagebase
        DD      $L$SEH_begin_sha256_multi_block_shaext wrt ..imagebase, $L$SEH_end_sha256_multi_block_shaext wrt ..imagebase, $L$SEH_info_sha256_multi_block_shaext wrt ..imagebase
        DD      $L$SEH_begin_sha256_multi_block_avx wrt ..imagebase, $L$SEH_end_sha256_multi_block_avx wrt ..imagebase, $L$SEH_info_sha256_multi_block_avx wrt ..imagebase
        DD      $L$SEH_begin_sha256_multi_block_avx2 wrt ..imagebase, $L$SEH_end_sha256_multi_block_avx2 wrt ..imagebase, $L$SEH_info_sha256_multi_block_avx2 wrt ..imagebase
|
|
section .xdata rdata align=8
ALIGN   8
; Unwind-info records referenced from .pdata.  Each record is:
;   DB 9,0,0,0   header: first byte 9 = version 1 with the
;                exception-handler flag set; the remaining header bytes
;                are zero (no unwind codes)
;   DD handler   image-relative address of the language-specific handler
;   DD body,epi  HandlerData[0..1]: the labels the handler compares the
;                faulting Rip against
$L$SEH_info_sha256_multi_block:
        DB      9,0,0,0
        DD      se_handler wrt ..imagebase
        DD      $L$body wrt ..imagebase, $L$epilogue wrt ..imagebase

$L$SEH_info_sha256_multi_block_shaext:
        DB      9,0,0,0
        DD      se_handler wrt ..imagebase
        DD      $L$body_shaext wrt ..imagebase, $L$epilogue_shaext wrt ..imagebase

$L$SEH_info_sha256_multi_block_avx:
        DB      9,0,0,0
        DD      se_handler wrt ..imagebase
        DD      $L$body_avx wrt ..imagebase, $L$epilogue_avx wrt ..imagebase

; The AVX2 path saves more registers, so it gets its own handler.
$L$SEH_info_sha256_multi_block_avx2:
        DB      9,0,0,0
        DD      avx2_handler wrt ..imagebase
        DD      $L$body_avx2 wrt ..imagebase, $L$epilogue_avx2 wrt ..imagebase
|