; CloverBootloader/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm

; 1032 lines
; 27 KiB
; NASM

; Make every memory reference without an explicit base register RIP-relative.
default rel
; The size keywords below are defined as empty, so an operand written as
; YMMWORD[rsi] assembles as plain [rsi]; the access width is taken from the
; instruction and its register operands.
%define XMMWORD
%define YMMWORD
%define ZMMWORD
; Capability vector defined outside this file. Only the dword at byte offset 8
; is read here (by ossl_rsaz_avx512ifma_eligible).
EXTERN OPENSSL_ia32cap_P
global ossl_rsaz_avx512ifma_eligible
;-----------------------------------------------------------------------
; ossl_rsaz_avx512ifma_eligible
;
; Reports whether all capability bits required by the routines in this
; file are present.
;
; In:    none (reads the dword at OPENSSL_ia32cap_P+8)
; Out:   eax = 0x80230000 when bits 16, 17, 21 and 31 of that dword are
;              all set, otherwise 0
; Clobb: ecx, flags
;
; NOTE(review): bits 16/17/21/31 presumably correspond to AVX512F,
; AVX512DQ, AVX512IFMA and AVX512VL of CPUID leaf 7 EBX - confirm against
; the OPENSSL_ia32cap documentation.
;-----------------------------------------------------------------------
%define RSAZ_IFMA_REQUIRED_CAPS 0x80230000
ALIGN 32
ossl_rsaz_avx512ifma_eligible:
        xor     eax,eax                                 ; default answer: not eligible
        mov     ecx,DWORD[((OPENSSL_ia32cap_P+8))]
        and     ecx,RSAZ_IFMA_REQUIRED_CAPS             ; keep only the required bits
        cmp     ecx,RSAZ_IFMA_REQUIRED_CAPS             ; ZF=1 only if none is missing
        cmove   eax,ecx                                 ; eligible: return the (non-zero) mask
        DB      0F3h,0C3h                               ;repret
section .text code align=64
global ossl_rsaz_amm52x20_x1_256
;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x1_256
;
; Multiply-and-reduce of two 20-digit numbers whose digits are stored one
; per 64-bit word and kept to 52 bits (mask 0xfffffffffffff). Judging by the
; "amm" in the name this is presumably an "almost Montgomery multiplication"
; - confirm against the C caller.
;
; ABI:   Microsoft x64 at entry; the prologue moves the arguments into
;        rdi, rsi, rdx, rcx, r8 and the body uses those registers.
; In:    rcx       -> rdi : destination, 5 x 32 bytes are written
;        rdx       -> rsi : first operand, 5 x 32 bytes are read
;        r8        -> rdx -> r11 : second operand, 20 qwords read one by one
;        r9        -> rcx : 5 x 32 bytes, multiplied by the per-step factor
;                           (presumably the modulus)
;        [rsp+40]  -> r8  : 64-bit constant multiplied with the running low
;                           digit to obtain the per-step factor
;                           (presumably the Montgomery constant k0)
; Out:   20 qwords at the destination, each masked to 52 bits.
; Saved: rbx, rbp, r12-r15 (pushed), rdi, rsi (stored in the caller's
;        home slots at [rsp+8] / [rsp+16]).
; Uses:  rax, rcx, rdx, r8-r14, rbx, ymm0, ymm1, ymm3, ymm4, ymm16-ymm19,
;        ymm24-ymm28, k1-k5, flags.
;-----------------------------------------------------------------------
ALIGN 32
ossl_rsaz_amm52x20_x1_256:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
; rax = entry rsp. The unwind handler reads this value from the saved
; context while the instruction pointer is still before the body label.
mov rax,rsp
$L$SEH_begin_ossl_rsaz_amm52x20_x1_256:
; Shuffle the Microsoft x64 argument registers into rdi, rsi, rdx, rcx, r8.
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8,QWORD[40+rsp]
; Bytes F3 0F 1E FA = endbr64.
DB 243,15,30,250
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$rsaz_amm52x20_x1_256_body:
; ymm0 stays zero for the whole function; ymm1, ymm16..ymm19 are the
; 20-lane accumulator (4 qword lanes each), cleared here.
vpxord ymm0,ymm0,ymm0
vmovdqa64 ymm1,ymm0
vmovdqa64 ymm16,ymm0
vmovdqa64 ymm17,ymm0
vmovdqa64 ymm18,ymm0
vmovdqa64 ymm19,ymm0
; r9  = scalar accumulator for the lowest digit
; r11 = cursor over the second operand (rdx is needed by mulx)
; rax = 52-bit digit mask
; ebx = loop counter: 5 iterations x 4 unrolled steps = 20 digits
xor r9d,r9d
mov r11,rdx
mov rax,0xfffffffffffff
mov ebx,5
ALIGN 32
$L$loop5:
; ---- unrolled step 1 of 4: multiplier digit QWORD[r11] ----
; ymm3 = current digit of the second operand, broadcast to all lanes.
mov r13,QWORD[r11]
vpbroadcastq ymm3,r13
; r10:r9 = r9 + (first operand digit 0) * (current digit), full 128-bit product.
mov rdx,QWORD[rsi]
mulx r12,r13,r13
add r9,r13
mov r10,r12
adc r10,0
; r13 = (r8 * r9) & mask: the per-step factor, broadcast into ymm4.
mov r13,r8
imul r13,r9
and r13,rax
vpbroadcastq ymm4,r13
; r10:r9 += (digit 0 at [rcx]) * factor, then r9 = (r10:r9) >> 52.
mov rdx,QWORD[rcx]
mulx r12,r13,r13
add r9,r13
adc r10,r12
shr r9,52
sal r10,12
or r9,r10
; Accumulate the low 52 bits of every lane product (vpmadd52luq).
vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
; Shift the whole 20-lane accumulator down by one qword lane; the top lane
; is filled from the zero register ymm0 and the old lowest lane is dropped
; (its contribution was handled in r9 above).
valignq ymm1,ymm16,ymm1,1
valignq ymm16,ymm17,ymm16,1
valignq ymm17,ymm18,ymm17,1
valignq ymm18,ymm19,ymm18,1
valignq ymm19,ymm0,ymm19,1
; Fold the new lowest lane into the scalar accumulator.
vmovq r13,xmm1
add r9,r13
; Accumulate the high 52 bits of every lane product (vpmadd52huq) into the
; already shifted accumulator.
vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
; ---- unrolled step 2 of 4: multiplier digit QWORD[8+r11] (same sequence) ----
mov r13,QWORD[8+r11]
vpbroadcastq ymm3,r13
mov rdx,QWORD[rsi]
mulx r12,r13,r13
add r9,r13
mov r10,r12
adc r10,0
mov r13,r8
imul r13,r9
and r13,rax
vpbroadcastq ymm4,r13
mov rdx,QWORD[rcx]
mulx r12,r13,r13
add r9,r13
adc r10,r12
shr r9,52
sal r10,12
or r9,r10
vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
valignq ymm1,ymm16,ymm1,1
valignq ymm16,ymm17,ymm16,1
valignq ymm17,ymm18,ymm17,1
valignq ymm18,ymm19,ymm18,1
valignq ymm19,ymm0,ymm19,1
vmovq r13,xmm1
add r9,r13
vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
; ---- unrolled step 3 of 4: multiplier digit QWORD[16+r11] (same sequence) ----
mov r13,QWORD[16+r11]
vpbroadcastq ymm3,r13
mov rdx,QWORD[rsi]
mulx r12,r13,r13
add r9,r13
mov r10,r12
adc r10,0
mov r13,r8
imul r13,r9
and r13,rax
vpbroadcastq ymm4,r13
mov rdx,QWORD[rcx]
mulx r12,r13,r13
add r9,r13
adc r10,r12
shr r9,52
sal r10,12
or r9,r10
vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
valignq ymm1,ymm16,ymm1,1
valignq ymm16,ymm17,ymm16,1
valignq ymm17,ymm18,ymm17,1
valignq ymm18,ymm19,ymm18,1
valignq ymm19,ymm0,ymm19,1
vmovq r13,xmm1
add r9,r13
vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
; ---- unrolled step 4 of 4: multiplier digit QWORD[24+r11] (same sequence) ----
mov r13,QWORD[24+r11]
vpbroadcastq ymm3,r13
mov rdx,QWORD[rsi]
mulx r12,r13,r13
add r9,r13
mov r10,r12
adc r10,0
mov r13,r8
imul r13,r9
and r13,rax
vpbroadcastq ymm4,r13
mov rdx,QWORD[rcx]
mulx r12,r13,r13
add r9,r13
adc r10,r12
shr r9,52
sal r10,12
or r9,r10
vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
valignq ymm1,ymm16,ymm1,1
valignq ymm16,ymm17,ymm16,1
valignq ymm17,ymm18,ymm17,1
valignq ymm18,ymm19,ymm18,1
valignq ymm19,ymm0,ymm19,1
vmovq r13,xmm1
add r9,r13
vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
; Advance the second-operand cursor by the four digits just consumed.
lea r11,[32+r11]
dec ebx
jne NEAR $L$loop5
; ---- normalisation: bring every lane back to 52 bits ----
; ymm4 = 52-bit mask in all four lanes (reuses the factor register).
vmovdqa64 ymm4,YMMWORD[$L$mask52x4]
; Put the scalar accumulator r9 into lane 0 of ymm1 (blend mask 3 selects
; the two low dwords).
vpbroadcastq ymm3,r9
vpblendd ymm1,ymm1,ymm3,3
; ymm24..ymm28 = the bits above bit 51 of each lane (the per-lane carries).
vpsrlq ymm24,ymm1,52
vpsrlq ymm25,ymm16,52
vpsrlq ymm26,ymm17,52
vpsrlq ymm27,ymm18,52
vpsrlq ymm28,ymm19,52
; Move every carry up by one lane so it lines up with the next digit; the
; lowest lane receives zero from ymm0. Done from the top register down so
; each source is still unmodified when it is read.
valignq ymm28,ymm28,ymm27,3
valignq ymm27,ymm27,ymm26,3
valignq ymm26,ymm26,ymm25,3
valignq ymm25,ymm25,ymm24,3
valignq ymm24,ymm24,ymm0,3
; Keep the low 52 bits of every lane ...
vpandq ymm1,ymm1,ymm4
vpandq ymm16,ymm16,ymm4
vpandq ymm17,ymm17,ymm4
vpandq ymm18,ymm18,ymm4
vpandq ymm19,ymm19,ymm4
; ... and add the carry coming from the lane below.
vpaddq ymm1,ymm1,ymm24
vpaddq ymm16,ymm16,ymm25
vpaddq ymm17,ymm17,ymm26
vpaddq ymm18,ymm18,ymm27
vpaddq ymm19,ymm19,ymm28
; Predicate 1 (unsigned less-than): bit set where mask < lane, i.e. the
; lane now exceeds 52 bits and produces a carry.
vpcmpuq k1,ymm4,ymm1,1
vpcmpuq k2,ymm4,ymm16,1
vpcmpuq k3,ymm4,ymm17,1
vpcmpuq k4,ymm4,ymm18,1
vpcmpuq k5,ymm4,ymm19,1
kmovb r14d,k1
kmovb r13d,k2
kmovb r12d,k3
kmovb r11d,k4
kmovb r10d,k5
; Predicate 0 (equal): bit set where lane == mask, i.e. an incoming carry
; would ripple through this lane.
vpcmpuq k1,ymm4,ymm1,0
vpcmpuq k2,ymm4,ymm16,0
vpcmpuq k3,ymm4,ymm17,0
vpcmpuq k4,ymm4,ymm18,0
vpcmpuq k5,ymm4,ymm19,0
kmovb r9d,k1
kmovb r8d,k2
kmovb ebx,k3
kmovb ecx,k4
kmovb edx,k5
; Pack the 4-bit "carry out" masks: r14b = lanes 0-7, r12b = lanes 8-15,
; r10b = lanes 16-19.
shl r13b,4
or r14b,r13b
shl r11b,4
or r12b,r11b
; Shift the 20-bit carry-out mask left by one bit across the three bytes,
; so each bit marks the lane that receives the carry.
add r14b,r14b
adc r12b,r12b
adc r10b,r10b
; Pack the "equal to mask" bits the same way: r9b, bl, dl.
shl r8b,4
or r9b,r8b
shl cl,4
or bl,cl
; Adding the two masks lets the carry ripple through runs of "equal" lanes;
; xor with the "equal" mask then leaves a bit for every lane that must be
; incremented.
add r14b,r9b
adc r12b,bl
adc r10b,dl
xor r14b,r9b
xor r12b,bl
xor r10b,dl
; Unpack the increment mask back into k1..k5, four lanes each.
kmovb k1,r14d
shr r14b,4
kmovb k2,r14d
kmovb k3,r12d
shr r12b,4
kmovb k4,r12d
kmovb k5,r10d
; In the selected lanes subtract (2^52 - 1); together with the masking
; below this is an increment modulo 2^52.
vpsubq ymm1{k1},ymm1,ymm4
vpsubq ymm16{k2},ymm16,ymm4
vpsubq ymm17{k3},ymm17,ymm4
vpsubq ymm18{k4},ymm18,ymm4
vpsubq ymm19{k5},ymm19,ymm4
vpandq ymm1,ymm1,ymm4
vpandq ymm16,ymm16,ymm4
vpandq ymm17,ymm17,ymm4
vpandq ymm18,ymm18,ymm4
vpandq ymm19,ymm19,ymm4
; Store the 20 result digits.
vmovdqu64 YMMWORD[rdi],ymm1
vmovdqu64 YMMWORD[32+rdi],ymm16
vmovdqu64 YMMWORD[64+rdi],ymm17
vmovdqu64 YMMWORD[96+rdi],ymm18
vmovdqu64 YMMWORD[128+rdi],ymm19
vzeroupper
; Reload the six registers pushed in the prologue (reverse push order) and
; release their 48 bytes of stack.
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rsp,[48+rsp]
$L$rsaz_amm52x20_x1_256_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_ossl_rsaz_amm52x20_x1_256:
section .data data align=8
; Four copies of the 52-bit digit mask (2^52 - 1). Loaded with the aligned
; form vmovdqa64, so this label must really end up 32-byte aligned; that
; relies on ALIGN raising the alignment of the section declared align=8
; above - confirm with the assembler version in use.
ALIGN 32
$L$mask52x4:
DQ 0xfffffffffffff
DQ 0xfffffffffffff
DQ 0xfffffffffffff
DQ 0xfffffffffffff
section .text code align=64
global ossl_rsaz_amm52x20_x2_256
;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x2_256
;
; Two independent 20-digit (52 bits per 64-bit word) multiply-and-reduce
; operations interleaved in one loop. The second operation uses the same
; pointers at a byte offset of 160 and its own constant at [r8+8].
; Presumably a dual "almost Montgomery multiplication" (per the name) -
; confirm against the C caller.
;
; ABI:   Microsoft x64 at entry; the prologue moves the arguments into
;        rdi, rsi, rdx, rcx, r8 and the body uses those registers.
; In:    rcx       -> rdi : destination, 10 x 32 bytes are written
;        rdx       -> rsi : first operands, 10 x 32 bytes are read
;        r8        -> rdx -> r11 : second operands, qwords [0..19] and
;                           [20..39] are read
;        r9        -> rcx : 10 x 32 bytes, multiplied by the per-step
;                           factors (presumably the two moduli)
;        [rsp+40]  -> r8  : pointer to two 64-bit constants, [r8] for the
;                           first and [r8+8] for the second operation
; Out:   40 qwords at the destination, each masked to 52 bits.
; Saved: rbx, rbp, r12-r15 (pushed), rdi, rsi (stored in the caller's
;        home slots at [rsp+8] / [rsp+16]).
; Uses:  rax, rcx, rdx, r8-r15, rbx, ymm0-ymm4, ymm16-ymm28, k1-k5, flags.
;-----------------------------------------------------------------------
ALIGN 32
ossl_rsaz_amm52x20_x2_256:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
; rax = entry rsp. The unwind handler reads this value from the saved
; context while the instruction pointer is still before the body label.
mov rax,rsp
$L$SEH_begin_ossl_rsaz_amm52x20_x2_256:
; Shuffle the Microsoft x64 argument registers into rdi, rsi, rdx, rcx, r8.
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8,QWORD[40+rsp]
; Bytes F3 0F 1E FA = endbr64.
DB 243,15,30,250
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$rsaz_amm52x20_x2_256_body:
; ymm0 stays zero. ymm1, ymm16..ymm19 accumulate the first operation,
; ymm2, ymm20..ymm23 the second.
vpxord ymm0,ymm0,ymm0
vmovdqa64 ymm1,ymm0
vmovdqa64 ymm16,ymm0
vmovdqa64 ymm17,ymm0
vmovdqa64 ymm18,ymm0
vmovdqa64 ymm19,ymm0
vmovdqa64 ymm2,ymm0
vmovdqa64 ymm20,ymm0
vmovdqa64 ymm21,ymm0
vmovdqa64 ymm22,ymm0
vmovdqa64 ymm23,ymm0
; r9 / r15 = scalar low-digit accumulators of the first / second operation
; r11      = cursor over the second operands (rdx is needed by mulx)
; rax      = 52-bit digit mask
; ebx      = loop counter, one digit of each operation per iteration
xor r9d,r9d
xor r15d,r15d
mov r11,rdx
mov rax,0xfffffffffffff
mov ebx,20
ALIGN 32
$L$loop20:
; ---- first operation: multiplier digit QWORD[r11] ----
; ymm3 = current digit broadcast to all lanes.
mov r13,QWORD[r11]
vpbroadcastq ymm3,r13
; r10:r9 = r9 + (first operand digit 0) * (current digit).
mov rdx,QWORD[rsi]
mulx r12,r13,r13
add r9,r13
mov r10,r12
adc r10,0
; r13 = ([r8] * r9) & mask: the per-step factor, broadcast into ymm4.
mov r13,QWORD[r8]
imul r13,r9
and r13,rax
vpbroadcastq ymm4,r13
; r10:r9 += (digit 0 at [rcx]) * factor, then r9 = (r10:r9) >> 52.
mov rdx,QWORD[rcx]
mulx r12,r13,r13
add r9,r13
adc r10,r12
shr r9,52
sal r10,12
or r9,r10
; Accumulate the low 52 bits of every lane product.
vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
; Shift the 20-lane accumulator down by one lane (top lane filled from
; ymm0) and fold the new lowest lane into r9.
valignq ymm1,ymm16,ymm1,1
valignq ymm16,ymm17,ymm16,1
valignq ymm17,ymm18,ymm17,1
valignq ymm18,ymm19,ymm18,1
valignq ymm19,ymm0,ymm19,1
vmovq r13,xmm1
add r9,r13
; Accumulate the high 52 bits of every lane product.
vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
; ---- second operation: same sequence with every pointer offset by 160
; bytes, constant [8+r8], scalar accumulator r15 and vector accumulator
; ymm2, ymm20..ymm23 ----
mov r13,QWORD[160+r11]
vpbroadcastq ymm3,r13
mov rdx,QWORD[160+rsi]
mulx r12,r13,r13
add r15,r13
mov r10,r12
adc r10,0
mov r13,QWORD[8+r8]
imul r13,r15
and r13,rax
vpbroadcastq ymm4,r13
mov rdx,QWORD[160+rcx]
mulx r12,r13,r13
add r15,r13
adc r10,r12
shr r15,52
sal r10,12
or r15,r10
vpmadd52luq ymm2,ymm3,YMMWORD[160+rsi]
vpmadd52luq ymm20,ymm3,YMMWORD[192+rsi]
vpmadd52luq ymm21,ymm3,YMMWORD[224+rsi]
vpmadd52luq ymm22,ymm3,YMMWORD[256+rsi]
vpmadd52luq ymm23,ymm3,YMMWORD[288+rsi]
vpmadd52luq ymm2,ymm4,YMMWORD[160+rcx]
vpmadd52luq ymm20,ymm4,YMMWORD[192+rcx]
vpmadd52luq ymm21,ymm4,YMMWORD[224+rcx]
vpmadd52luq ymm22,ymm4,YMMWORD[256+rcx]
vpmadd52luq ymm23,ymm4,YMMWORD[288+rcx]
valignq ymm2,ymm20,ymm2,1
valignq ymm20,ymm21,ymm20,1
valignq ymm21,ymm22,ymm21,1
valignq ymm22,ymm23,ymm22,1
valignq ymm23,ymm0,ymm23,1
vmovq r13,xmm2
add r15,r13
vpmadd52huq ymm2,ymm3,YMMWORD[160+rsi]
vpmadd52huq ymm20,ymm3,YMMWORD[192+rsi]
vpmadd52huq ymm21,ymm3,YMMWORD[224+rsi]
vpmadd52huq ymm22,ymm3,YMMWORD[256+rsi]
vpmadd52huq ymm23,ymm3,YMMWORD[288+rsi]
vpmadd52huq ymm2,ymm4,YMMWORD[160+rcx]
vpmadd52huq ymm20,ymm4,YMMWORD[192+rcx]
vpmadd52huq ymm21,ymm4,YMMWORD[224+rcx]
vpmadd52huq ymm22,ymm4,YMMWORD[256+rcx]
vpmadd52huq ymm23,ymm4,YMMWORD[288+rcx]
; Next digit of both second operands.
lea r11,[8+r11]
dec ebx
jne NEAR $L$loop20
; ---- normalisation of the first result (ymm1, ymm16..ymm19) ----
; ymm4 = 52-bit mask in all four lanes.
vmovdqa64 ymm4,YMMWORD[$L$mask52x4]
; Put the scalar accumulator r9 into lane 0 of ymm1.
vpbroadcastq ymm3,r9
vpblendd ymm1,ymm1,ymm3,3
; Per-lane carries (bits above bit 51) ...
vpsrlq ymm24,ymm1,52
vpsrlq ymm25,ymm16,52
vpsrlq ymm26,ymm17,52
vpsrlq ymm27,ymm18,52
vpsrlq ymm28,ymm19,52
; ... moved up by one lane, lowest lane gets zero from ymm0.
valignq ymm28,ymm28,ymm27,3
valignq ymm27,ymm27,ymm26,3
valignq ymm26,ymm26,ymm25,3
valignq ymm25,ymm25,ymm24,3
valignq ymm24,ymm24,ymm0,3
; Keep the low 52 bits and add the carry from the lane below.
vpandq ymm1,ymm1,ymm4
vpandq ymm16,ymm16,ymm4
vpandq ymm17,ymm17,ymm4
vpandq ymm18,ymm18,ymm4
vpandq ymm19,ymm19,ymm4
vpaddq ymm1,ymm1,ymm24
vpaddq ymm16,ymm16,ymm25
vpaddq ymm17,ymm17,ymm26
vpaddq ymm18,ymm18,ymm27
vpaddq ymm19,ymm19,ymm28
; Predicate 1: bit set where mask < lane (lane produces a carry).
vpcmpuq k1,ymm4,ymm1,1
vpcmpuq k2,ymm4,ymm16,1
vpcmpuq k3,ymm4,ymm17,1
vpcmpuq k4,ymm4,ymm18,1
vpcmpuq k5,ymm4,ymm19,1
kmovb r14d,k1
kmovb r13d,k2
kmovb r12d,k3
kmovb r11d,k4
kmovb r10d,k5
; Predicate 0: bit set where lane == mask (lane passes a carry on).
vpcmpuq k1,ymm4,ymm1,0
vpcmpuq k2,ymm4,ymm16,0
vpcmpuq k3,ymm4,ymm17,0
vpcmpuq k4,ymm4,ymm18,0
vpcmpuq k5,ymm4,ymm19,0
; r9, r8, rbx, rcx, rdx (and r11 above) are free to reuse: the loop is done.
kmovb r9d,k1
kmovb r8d,k2
kmovb ebx,k3
kmovb ecx,k4
kmovb edx,k5
; Pack carry-out bits into r14b / r12b / r10b and shift the 20-bit mask
; left by one bit so each bit marks the receiving lane.
shl r13b,4
or r14b,r13b
shl r11b,4
or r12b,r11b
add r14b,r14b
adc r12b,r12b
adc r10b,r10b
; Pack the "equal" bits into r9b / bl / dl, add to ripple carries through
; runs of "equal" lanes, then xor to get the lanes to increment.
shl r8b,4
or r9b,r8b
shl cl,4
or bl,cl
add r14b,r9b
adc r12b,bl
adc r10b,dl
xor r14b,r9b
xor r12b,bl
xor r10b,dl
kmovb k1,r14d
shr r14b,4
kmovb k2,r14d
kmovb k3,r12d
shr r12b,4
kmovb k4,r12d
kmovb k5,r10d
; Masked subtract of (2^52 - 1) followed by masking = increment mod 2^52.
vpsubq ymm1{k1},ymm1,ymm4
vpsubq ymm16{k2},ymm16,ymm4
vpsubq ymm17{k3},ymm17,ymm4
vpsubq ymm18{k4},ymm18,ymm4
vpsubq ymm19{k5},ymm19,ymm4
vpandq ymm1,ymm1,ymm4
vpandq ymm16,ymm16,ymm4
vpandq ymm17,ymm17,ymm4
vpandq ymm18,ymm18,ymm4
vpandq ymm19,ymm19,ymm4
; ---- normalisation of the second result (ymm2, ymm20..ymm23), identical
; sequence with r15 as the scalar low digit ----
vpbroadcastq ymm3,r15
vpblendd ymm2,ymm2,ymm3,3
vpsrlq ymm24,ymm2,52
vpsrlq ymm25,ymm20,52
vpsrlq ymm26,ymm21,52
vpsrlq ymm27,ymm22,52
vpsrlq ymm28,ymm23,52
valignq ymm28,ymm28,ymm27,3
valignq ymm27,ymm27,ymm26,3
valignq ymm26,ymm26,ymm25,3
valignq ymm25,ymm25,ymm24,3
valignq ymm24,ymm24,ymm0,3
vpandq ymm2,ymm2,ymm4
vpandq ymm20,ymm20,ymm4
vpandq ymm21,ymm21,ymm4
vpandq ymm22,ymm22,ymm4
vpandq ymm23,ymm23,ymm4
vpaddq ymm2,ymm2,ymm24
vpaddq ymm20,ymm20,ymm25
vpaddq ymm21,ymm21,ymm26
vpaddq ymm22,ymm22,ymm27
vpaddq ymm23,ymm23,ymm28
vpcmpuq k1,ymm4,ymm2,1
vpcmpuq k2,ymm4,ymm20,1
vpcmpuq k3,ymm4,ymm21,1
vpcmpuq k4,ymm4,ymm22,1
vpcmpuq k5,ymm4,ymm23,1
kmovb r14d,k1
kmovb r13d,k2
kmovb r12d,k3
kmovb r11d,k4
kmovb r10d,k5
vpcmpuq k1,ymm4,ymm2,0
vpcmpuq k2,ymm4,ymm20,0
vpcmpuq k3,ymm4,ymm21,0
vpcmpuq k4,ymm4,ymm22,0
vpcmpuq k5,ymm4,ymm23,0
kmovb r9d,k1
kmovb r8d,k2
kmovb ebx,k3
kmovb ecx,k4
kmovb edx,k5
shl r13b,4
or r14b,r13b
shl r11b,4
or r12b,r11b
add r14b,r14b
adc r12b,r12b
adc r10b,r10b
shl r8b,4
or r9b,r8b
shl cl,4
or bl,cl
add r14b,r9b
adc r12b,bl
adc r10b,dl
xor r14b,r9b
xor r12b,bl
xor r10b,dl
kmovb k1,r14d
shr r14b,4
kmovb k2,r14d
kmovb k3,r12d
shr r12b,4
kmovb k4,r12d
kmovb k5,r10d
vpsubq ymm2{k1},ymm2,ymm4
vpsubq ymm20{k2},ymm20,ymm4
vpsubq ymm21{k3},ymm21,ymm4
vpsubq ymm22{k4},ymm22,ymm4
vpsubq ymm23{k5},ymm23,ymm4
vpandq ymm2,ymm2,ymm4
vpandq ymm20,ymm20,ymm4
vpandq ymm21,ymm21,ymm4
vpandq ymm22,ymm22,ymm4
vpandq ymm23,ymm23,ymm4
; Store both results back to back: first at [rdi], second at [rdi+160].
vmovdqu64 YMMWORD[rdi],ymm1
vmovdqu64 YMMWORD[32+rdi],ymm16
vmovdqu64 YMMWORD[64+rdi],ymm17
vmovdqu64 YMMWORD[96+rdi],ymm18
vmovdqu64 YMMWORD[128+rdi],ymm19
vmovdqu64 YMMWORD[160+rdi],ymm2
vmovdqu64 YMMWORD[192+rdi],ymm20
vmovdqu64 YMMWORD[224+rdi],ymm21
vmovdqu64 YMMWORD[256+rdi],ymm22
vmovdqu64 YMMWORD[288+rdi],ymm23
vzeroupper
; Reload the six registers pushed in the prologue (reverse push order) and
; release their 48 bytes of stack.
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rsp,[48+rsp]
$L$rsaz_amm52x20_x2_256_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_ossl_rsaz_amm52x20_x2_256:
section .text code align=64
ALIGN 32
global ossl_extract_multiplier_2x20_win5
;-----------------------------------------------------------------------
; ossl_extract_multiplier_2x20_win5
;
; Copies one 160-byte (20 qword) element out of a 32-entry table. Every
; entry is read and the wanted one is kept with a masked blend, so neither
; the addresses touched nor the branches taken depend on the index.
;
; ABI:   Microsoft x64 at entry; the prologue moves the arguments into
;        rdi, rsi, rdx, rcx and the body uses those registers.
; In:    rcx -> rdi : destination, 5 x 32 bytes are written
;        rdx -> rsi : table base; entries are 320 bytes apart and the loop
;                     walks 10240 / 320 = 32 of them
;        r8  -> rdx : index of the wanted entry; compared for equality with
;                     a counter running 0..31, so any other value leaves the
;                     output all zero
;        r9  -> rcx : selects the 160-byte half inside each entry
;                     (byte offset rcx * 160)
; Out:   20 qwords at the destination.
; Saved: rdi, rsi (stored in the caller's home slots, reloaded on exit).
; Clobb: rax, rcx, rdx, ymm0-ymm4, ymm16-ymm23, k1, flags.
;
; NOTE(review): the full 64-bit r8 and r9 are consumed. If the C prototype
; declares these arguments as 32-bit ints, the caller must make sure the
; upper halves are zero - confirm against the caller.
;
; Change: vzeroupper is now issued before returning, as the two
; multiplication routines in this file already do, so the caller is not left
; with dirty upper YMM state (ymm0-ymm4 are written with 256-bit operations).
;-----------------------------------------------------------------------
ossl_extract_multiplier_2x20_win5:
        mov     QWORD[8+rsp],rdi                        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ossl_extract_multiplier_2x20_win5:
        ; Shuffle the Microsoft x64 argument registers into rdi, rsi, rdx, rcx.
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        DB      243,15,30,250                           ; bytes F3 0F 1E FA = endbr64
        ; rsi += rcx * 160  (rcx*5, then << 5)
        lea     rax,[rcx*4+rcx]
        sal     rax,5
        add     rsi,rax
        vmovdqa64 ymm23,YMMWORD[$L$ones]                ; counter increment
        vpbroadcastq ymm22,rdx                          ; wanted index in every lane
        lea     rax,[10240+rsi]                         ; end pointer: 32 entries x 320 bytes
        ; ymm0..ymm4 = result (starts at zero), ymm21 = running entry counter.
        vpxor   xmm4,xmm4,xmm4
        vmovdqa64 ymm3,ymm4
        vmovdqa64 ymm2,ymm4
        vmovdqa64 ymm1,ymm4
        vmovdqa64 ymm0,ymm4
        vmovdqa64 ymm21,ymm4
ALIGN 32
$L$loop:
        vpcmpq  k1,ymm22,ymm21,0                        ; k1 = all ones iff counter == index
        add     rsi,320
        vpaddq  ymm21,ymm21,ymm23                       ; counter += 1
        ; Load the current entry (rsi already points at the next one).
        vmovdqu64 ymm16,YMMWORD[((-320))+rsi]
        vmovdqu64 ymm17,YMMWORD[((-288))+rsi]
        vmovdqu64 ymm18,YMMWORD[((-256))+rsi]
        vmovdqu64 ymm19,YMMWORD[((-224))+rsi]
        vmovdqu64 ymm20,YMMWORD[((-192))+rsi]
        ; Keep the previous result unless k1 selects the freshly loaded entry.
        vpblendmq ymm0{k1},ymm0,ymm16
        vpblendmq ymm1{k1},ymm1,ymm17
        vpblendmq ymm2{k1},ymm2,ymm18
        vpblendmq ymm3{k1},ymm3,ymm19
        vpblendmq ymm4{k1},ymm4,ymm20
        cmp     rax,rsi
        jne     NEAR $L$loop
        vmovdqu64 YMMWORD[rdi],ymm0
        vmovdqu64 YMMWORD[32+rdi],ymm1
        vmovdqu64 YMMWORD[64+rdi],ymm2
        vmovdqu64 YMMWORD[96+rdi],ymm3
        vmovdqu64 YMMWORD[128+rdi],ymm4
        ; Results are in memory; clear the upper halves of ymm0-ymm15 so the
        ; caller does not inherit dirty upper state (same as the sibling routines).
        vzeroupper
        mov     rdi,QWORD[8+rsp]                        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h                               ;repret
$L$SEH_end_ossl_extract_multiplier_2x20_win5:
section .data data align=8
; Per-lane increment for the entry counter above. Loaded with the aligned
; form vmovdqa64, so the label must be 32-byte aligned.
ALIGN 32
$L$ones:
        DQ      1,1,1,1
EXTERN __imp_RtlVirtualUnwind
; The section directive in effect at this point is the ".data" one used for
; $L$ones, which would place the handler's machine code in a data section
; that is normally mapped non-executable. Switch back to the code section so
; the handler can actually be called.
section .text code align=64
;-----------------------------------------------------------------------
; rsaz_def_handler
;
; Language-specific exception handler referenced from the .xdata records of
; the routines in this file. It repairs the saved register context so that
; unwinding continues as if the interrupted routine had already returned
; its non-volatile registers, then lets RtlVirtualUnwind step to the caller.
;
; ABI:   Microsoft x64 exception-handler signature
; In:    rcx = exception record (unused)
;        rdx = establisher frame (unused)
;        r8  = context record  (offsets used: 120 Rax, 144 Rbx, 152 Rsp,
;              160 Rbp, 168 Rsi, 176 Rdi, 216-240 R12-R15, 248 Rip)
;        r9  = dispatcher context (offsets used: 0 ControlPc, 8 ImageBase,
;              16 FunctionEntry, 24 EstablisherFrame, 40 ContextRecord,
;              56 HandlerData)
;        HandlerData points at two image-relative addresses: the label after
;        the prologue pushes and the label at the start of the epilogue.
; Out:   eax = 1 (ExceptionContinueSearch)
; Stack: 8 pushes + pushfq + 64 bytes of locals keep rsp 16-byte aligned at
;        the call; the 64 bytes hold the 32-byte home area plus four stack
;        arguments for RtlVirtualUnwind.
;-----------------------------------------------------------------------
ALIGN 16
rsaz_def_handler:
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64
        mov     rax,QWORD[120+r8]                       ; rax = context->Rax
        mov     rbx,QWORD[248+r8]                       ; rbx = context->Rip
        mov     rsi,QWORD[8+r9]                         ; rsi = disp->ImageBase
        mov     r11,QWORD[56+r9]                        ; r11 = disp->HandlerData
        ; Rip before the body label: nothing has been pushed yet and Rax still
        ; holds the entry rsp (the routines do "mov rax,rsp" first).
        mov     r10d,DWORD[r11]
        lea     r10,[r10*1+rsi]
        cmp     rbx,r10
        jb      NEAR $L$common_seh_tail
        ; Rip at or after the epilogue label: the pushes were already undone,
        ; so context->Rsp is the entry rsp.
        mov     rax,QWORD[152+r8]                       ; rax = context->Rsp
        mov     r10d,DWORD[4+r11]
        lea     r10,[r10*1+rsi]
        cmp     rbx,r10
        jae     NEAR $L$common_seh_tail
        ; Rip inside the body: six registers sit on the stack. Step over them
        ; and put their saved values back into the context.
        lea     rax,[48+rax]
        mov     rbx,QWORD[((-8))+rax]
        mov     rbp,QWORD[((-16))+rax]
        mov     r12,QWORD[((-24))+rax]
        mov     r13,QWORD[((-32))+rax]
        mov     r14,QWORD[((-40))+rax]
        mov     r15,QWORD[((-48))+rax]
        mov     QWORD[144+r8],rbx
        mov     QWORD[160+r8],rbp
        mov     QWORD[216+r8],r12
        mov     QWORD[224+r8],r13
        mov     QWORD[232+r8],r14
        mov     QWORD[240+r8],r15
$L$common_seh_tail:
        ; rax = rsp as it was on entry to the interrupted routine. rdi/rsi were
        ; stored by its prologue in the home slots just above the return address.
        mov     rdi,QWORD[8+rax]
        mov     rsi,QWORD[16+rax]
        mov     QWORD[152+r8],rax                       ; context->Rsp
        mov     QWORD[168+r8],rsi                       ; context->Rsi
        mov     QWORD[176+r8],rdi                       ; context->Rdi
        ; Copy the repaired context (154 qwords) into disp->ContextRecord.
        mov     rdi,QWORD[40+r9]
        mov     rsi,r8
        mov     ecx,154
        DD      0xa548f3fc                              ; bytes FC F3 48 A5 = cld; rep movsq
        ; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
        ;                  ContextRecord, &HandlerData, &EstablisherFrame, NULL)
        mov     rsi,r9
        xor     rcx,rcx
        mov     rdx,QWORD[8+rsi]
        mov     r8,QWORD[rsi]
        mov     r9,QWORD[16+rsi]
        mov     r10,QWORD[40+rsi]
        lea     r11,[56+rsi]
        lea     r12,[24+rsi]
        mov     QWORD[32+rsp],r10
        mov     QWORD[40+rsp],r11
        mov     QWORD[48+rsp],r12
        mov     QWORD[56+rsp],rcx
        call    QWORD[__imp_RtlVirtualUnwind]
        mov     eax,1                                   ; ExceptionContinueSearch
        add     rsp,64
        popfq
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        pop     rdi
        pop     rsi
        DB      0F3h,0C3h                               ;repret
; Function table: one record of three image-relative addresses per routine -
; start label, end label, and the matching unwind-info record in .xdata.
section .pdata rdata align=4
ALIGN 4
; ossl_rsaz_amm52x20_x1_256
DD $L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
DD $L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
DD $L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
; ossl_rsaz_amm52x20_x2_256
DD $L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
DD $L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
DD $L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
; ossl_extract_multiplier_2x20_win5
DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
DD $L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
DD $L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
; Unwind-info records referenced from .pdata. Each record is:
;   DB 9,0,0,0   header bytes; first byte 9 = version 1 in the low three bits
;                with flag value 1 in the upper bits (presumably
;                UNW_FLAG_EHANDLER - confirm against the UNWIND_INFO layout),
;                followed by zero prologue size / code count / frame register
;   DD handler   image-relative address of rsaz_def_handler
;   DD a, b      handler data: the two image-relative addresses that
;                rsaz_def_handler reads at [HandlerData] and [HandlerData+4]
;                (end of prologue pushes, start of epilogue)
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_ossl_rsaz_amm52x20_x1_256:
DB 9,0,0,0
DD rsaz_def_handler wrt ..imagebase
DD $L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase
$L$SEH_info_ossl_rsaz_amm52x20_x2_256:
DB 9,0,0,0
DD rsaz_def_handler wrt ..imagebase
DD $L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase
; This routine pushes nothing, so both addresses are its start label: the
; handler then always treats the saved Rsp as the entry rsp.
$L$SEH_info_ossl_extract_multiplier_2x20_win5:
DB 9,0,0,0
DD rsaz_def_handler wrt ..imagebase
DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase