default rel %define XMMWORD %define YMMWORD %define ZMMWORD EXTERN OPENSSL_ia32cap_P global ossl_rsaz_avx512ifma_eligible ALIGN 32 ossl_rsaz_avx512ifma_eligible: mov ecx,DWORD[((OPENSSL_ia32cap_P+8))] xor eax,eax and ecx,2149777408 cmp ecx,2149777408 cmove eax,ecx DB 0F3h,0C3h ;repret section .text code align=64 global ossl_rsaz_amm52x20_x1_256 ALIGN 32 ossl_rsaz_amm52x20_x1_256: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ossl_rsaz_amm52x20_x1_256: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] DB 243,15,30,250 push rbx push rbp push r12 push r13 push r14 push r15 $L$rsaz_amm52x20_x1_256_body: vpxord ymm0,ymm0,ymm0 vmovdqa64 ymm1,ymm0 vmovdqa64 ymm16,ymm0 vmovdqa64 ymm17,ymm0 vmovdqa64 ymm18,ymm0 vmovdqa64 ymm19,ymm0 xor r9d,r9d mov r11,rdx mov rax,0xfffffffffffff mov ebx,5 ALIGN 32 $L$loop5: mov r13,QWORD[r11] vpbroadcastq ymm3,r13 mov rdx,QWORD[rsi] mulx r12,r13,r13 add r9,r13 mov r10,r12 adc r10,0 mov r13,r8 imul r13,r9 and r13,rax vpbroadcastq ymm4,r13 mov rdx,QWORD[rcx] mulx r12,r13,r13 add r9,r13 adc r10,r12 shr r9,52 sal r10,12 or r9,r10 vpmadd52luq ymm1,ymm3,YMMWORD[rsi] vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52luq ymm1,ymm4,YMMWORD[rcx] vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] valignq ymm1,ymm16,ymm1,1 valignq ymm16,ymm17,ymm16,1 valignq ymm17,ymm18,ymm17,1 valignq ymm18,ymm19,ymm18,1 valignq ymm19,ymm0,ymm19,1 vmovq r13,xmm1 add r9,r13 vpmadd52huq ymm1,ymm3,YMMWORD[rsi] vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52huq ymm1,ymm4,YMMWORD[rcx] vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] mov r13,QWORD[8+r11] vpbroadcastq ymm3,r13 mov rdx,QWORD[rsi] mulx r12,r13,r13 add r9,r13 mov r10,r12 adc r10,0 mov r13,r8 imul r13,r9 and r13,rax vpbroadcastq ymm4,r13 mov rdx,QWORD[rcx] mulx r12,r13,r13 add r9,r13 adc r10,r12 shr r9,52 sal r10,12 or r9,r10 vpmadd52luq ymm1,ymm3,YMMWORD[rsi] vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52luq ymm1,ymm4,YMMWORD[rcx] vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] valignq ymm1,ymm16,ymm1,1 valignq ymm16,ymm17,ymm16,1 valignq ymm17,ymm18,ymm17,1 valignq ymm18,ymm19,ymm18,1 valignq ymm19,ymm0,ymm19,1 vmovq r13,xmm1 add r9,r13 vpmadd52huq ymm1,ymm3,YMMWORD[rsi] vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52huq ymm1,ymm4,YMMWORD[rcx] vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] mov r13,QWORD[16+r11] vpbroadcastq ymm3,r13 mov rdx,QWORD[rsi] mulx r12,r13,r13 add r9,r13 mov r10,r12 adc r10,0 mov r13,r8 imul r13,r9 and r13,rax vpbroadcastq ymm4,r13 mov rdx,QWORD[rcx] mulx r12,r13,r13 add r9,r13 adc r10,r12 shr r9,52 sal r10,12 or r9,r10 vpmadd52luq ymm1,ymm3,YMMWORD[rsi] vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52luq ymm1,ymm4,YMMWORD[rcx] vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] valignq ymm1,ymm16,ymm1,1 valignq ymm16,ymm17,ymm16,1 valignq ymm17,ymm18,ymm17,1 valignq ymm18,ymm19,ymm18,1 valignq ymm19,ymm0,ymm19,1 vmovq r13,xmm1 add r9,r13 vpmadd52huq ymm1,ymm3,YMMWORD[rsi] vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52huq ymm1,ymm4,YMMWORD[rcx] vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] mov r13,QWORD[24+r11] vpbroadcastq ymm3,r13 mov rdx,QWORD[rsi] mulx r12,r13,r13 add r9,r13 mov r10,r12 adc r10,0 mov r13,r8 imul r13,r9 and r13,rax vpbroadcastq ymm4,r13 mov rdx,QWORD[rcx] mulx r12,r13,r13 add r9,r13 adc r10,r12 shr r9,52 sal r10,12 or r9,r10 vpmadd52luq ymm1,ymm3,YMMWORD[rsi] vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52luq ymm1,ymm4,YMMWORD[rcx] vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] valignq ymm1,ymm16,ymm1,1 valignq ymm16,ymm17,ymm16,1 valignq ymm17,ymm18,ymm17,1 valignq ymm18,ymm19,ymm18,1 valignq ymm19,ymm0,ymm19,1 vmovq r13,xmm1 add r9,r13 vpmadd52huq ymm1,ymm3,YMMWORD[rsi] vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52huq ymm1,ymm4,YMMWORD[rcx] vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] lea r11,[32+r11] dec ebx jne NEAR $L$loop5 vmovdqa64 ymm4,YMMWORD[$L$mask52x4] vpbroadcastq ymm3,r9 vpblendd ymm1,ymm1,ymm3,3 vpsrlq ymm24,ymm1,52 vpsrlq ymm25,ymm16,52 vpsrlq ymm26,ymm17,52 vpsrlq ymm27,ymm18,52 vpsrlq ymm28,ymm19,52 valignq ymm28,ymm28,ymm27,3 valignq ymm27,ymm27,ymm26,3 valignq ymm26,ymm26,ymm25,3 valignq ymm25,ymm25,ymm24,3 valignq ymm24,ymm24,ymm0,3 vpandq ymm1,ymm1,ymm4 vpandq ymm16,ymm16,ymm4 vpandq ymm17,ymm17,ymm4 vpandq ymm18,ymm18,ymm4 vpandq ymm19,ymm19,ymm4 vpaddq ymm1,ymm1,ymm24 vpaddq ymm16,ymm16,ymm25 vpaddq ymm17,ymm17,ymm26 vpaddq ymm18,ymm18,ymm27 vpaddq ymm19,ymm19,ymm28 vpcmpuq k1,ymm4,ymm1,1 vpcmpuq k2,ymm4,ymm16,1 vpcmpuq k3,ymm4,ymm17,1 vpcmpuq k4,ymm4,ymm18,1 vpcmpuq k5,ymm4,ymm19,1 kmovb r14d,k1 kmovb r13d,k2 kmovb r12d,k3 kmovb r11d,k4 kmovb r10d,k5 vpcmpuq k1,ymm4,ymm1,0 vpcmpuq k2,ymm4,ymm16,0 vpcmpuq k3,ymm4,ymm17,0 vpcmpuq k4,ymm4,ymm18,0 vpcmpuq k5,ymm4,ymm19,0 kmovb r9d,k1 kmovb r8d,k2 kmovb ebx,k3 kmovb ecx,k4 kmovb edx,k5 shl r13b,4 or r14b,r13b shl r11b,4 or r12b,r11b add r14b,r14b adc r12b,r12b adc r10b,r10b shl r8b,4 or r9b,r8b shl cl,4 or bl,cl add r14b,r9b adc r12b,bl adc r10b,dl xor r14b,r9b xor r12b,bl xor r10b,dl kmovb k1,r14d shr r14b,4 kmovb k2,r14d kmovb k3,r12d shr r12b,4 kmovb k4,r12d kmovb k5,r10d vpsubq ymm1{k1},ymm1,ymm4 vpsubq ymm16{k2},ymm16,ymm4 vpsubq ymm17{k3},ymm17,ymm4 vpsubq ymm18{k4},ymm18,ymm4 vpsubq ymm19{k5},ymm19,ymm4 vpandq ymm1,ymm1,ymm4 vpandq ymm16,ymm16,ymm4 vpandq ymm17,ymm17,ymm4 vpandq ymm18,ymm18,ymm4 vpandq ymm19,ymm19,ymm4 vmovdqu64 YMMWORD[rdi],ymm1 vmovdqu64 YMMWORD[32+rdi],ymm16 vmovdqu64 YMMWORD[64+rdi],ymm17 vmovdqu64 YMMWORD[96+rdi],ymm18 vmovdqu64 YMMWORD[128+rdi],ymm19 vzeroupper mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbp,QWORD[32+rsp] mov rbx,QWORD[40+rsp] lea rsp,[48+rsp] $L$rsaz_amm52x20_x1_256_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_ossl_rsaz_amm52x20_x1_256: section .data data align=8 ALIGN 32 $L$mask52x4: DQ 0xfffffffffffff DQ 0xfffffffffffff DQ 0xfffffffffffff DQ 0xfffffffffffff section .text code align=64 global ossl_rsaz_amm52x20_x2_256 ALIGN 32 ossl_rsaz_amm52x20_x2_256: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ossl_rsaz_amm52x20_x2_256: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] DB 243,15,30,250 push rbx push rbp push r12 push r13 push r14 push r15 $L$rsaz_amm52x20_x2_256_body: vpxord ymm0,ymm0,ymm0 vmovdqa64 ymm1,ymm0 vmovdqa64 ymm16,ymm0 vmovdqa64 ymm17,ymm0 vmovdqa64 ymm18,ymm0 vmovdqa64 ymm19,ymm0 vmovdqa64 ymm2,ymm0 vmovdqa64 ymm20,ymm0 vmovdqa64 ymm21,ymm0 vmovdqa64 ymm22,ymm0 vmovdqa64 ymm23,ymm0 xor r9d,r9d xor r15d,r15d mov r11,rdx mov rax,0xfffffffffffff mov ebx,20 ALIGN 32 $L$loop20: mov r13,QWORD[r11] vpbroadcastq ymm3,r13 mov rdx,QWORD[rsi] mulx r12,r13,r13 add r9,r13 mov r10,r12 adc r10,0 mov r13,QWORD[r8] imul r13,r9 and r13,rax vpbroadcastq ymm4,r13 mov rdx,QWORD[rcx] mulx r12,r13,r13 add r9,r13 adc r10,r12 shr r9,52 sal r10,12 or r9,r10 vpmadd52luq ymm1,ymm3,YMMWORD[rsi] vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52luq ymm1,ymm4,YMMWORD[rcx] vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] valignq ymm1,ymm16,ymm1,1 valignq ymm16,ymm17,ymm16,1 valignq ymm17,ymm18,ymm17,1 valignq ymm18,ymm19,ymm18,1 valignq ymm19,ymm0,ymm19,1 vmovq r13,xmm1 add r9,r13 vpmadd52huq ymm1,ymm3,YMMWORD[rsi] vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] vpmadd52huq ymm1,ymm4,YMMWORD[rcx] vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] mov r13,QWORD[160+r11] vpbroadcastq ymm3,r13 mov rdx,QWORD[160+rsi] mulx r12,r13,r13 add r15,r13 mov r10,r12 adc r10,0 mov r13,QWORD[8+r8] imul r13,r15 and r13,rax vpbroadcastq ymm4,r13 mov rdx,QWORD[160+rcx] mulx r12,r13,r13 add r15,r13 adc r10,r12 shr r15,52 sal r10,12 or r15,r10 vpmadd52luq ymm2,ymm3,YMMWORD[160+rsi] vpmadd52luq ymm20,ymm3,YMMWORD[192+rsi] vpmadd52luq ymm21,ymm3,YMMWORD[224+rsi] vpmadd52luq ymm22,ymm3,YMMWORD[256+rsi] vpmadd52luq ymm23,ymm3,YMMWORD[288+rsi] vpmadd52luq ymm2,ymm4,YMMWORD[160+rcx] vpmadd52luq ymm20,ymm4,YMMWORD[192+rcx] vpmadd52luq ymm21,ymm4,YMMWORD[224+rcx] vpmadd52luq ymm22,ymm4,YMMWORD[256+rcx] vpmadd52luq ymm23,ymm4,YMMWORD[288+rcx] valignq ymm2,ymm20,ymm2,1 valignq ymm20,ymm21,ymm20,1 valignq ymm21,ymm22,ymm21,1 valignq ymm22,ymm23,ymm22,1 valignq ymm23,ymm0,ymm23,1 vmovq r13,xmm2 add r15,r13 vpmadd52huq ymm2,ymm3,YMMWORD[160+rsi] vpmadd52huq ymm20,ymm3,YMMWORD[192+rsi] vpmadd52huq ymm21,ymm3,YMMWORD[224+rsi] vpmadd52huq ymm22,ymm3,YMMWORD[256+rsi] vpmadd52huq ymm23,ymm3,YMMWORD[288+rsi] vpmadd52huq ymm2,ymm4,YMMWORD[160+rcx] vpmadd52huq ymm20,ymm4,YMMWORD[192+rcx] vpmadd52huq ymm21,ymm4,YMMWORD[224+rcx] vpmadd52huq ymm22,ymm4,YMMWORD[256+rcx] vpmadd52huq ymm23,ymm4,YMMWORD[288+rcx] lea r11,[8+r11] dec ebx jne NEAR $L$loop20 vmovdqa64 ymm4,YMMWORD[$L$mask52x4] vpbroadcastq ymm3,r9 vpblendd ymm1,ymm1,ymm3,3 vpsrlq ymm24,ymm1,52 vpsrlq ymm25,ymm16,52 vpsrlq ymm26,ymm17,52 vpsrlq ymm27,ymm18,52 vpsrlq ymm28,ymm19,52 valignq ymm28,ymm28,ymm27,3 valignq ymm27,ymm27,ymm26,3 valignq ymm26,ymm26,ymm25,3 valignq ymm25,ymm25,ymm24,3 valignq ymm24,ymm24,ymm0,3 vpandq ymm1,ymm1,ymm4 vpandq ymm16,ymm16,ymm4 vpandq ymm17,ymm17,ymm4 vpandq ymm18,ymm18,ymm4 vpandq ymm19,ymm19,ymm4 vpaddq ymm1,ymm1,ymm24 vpaddq ymm16,ymm16,ymm25 vpaddq ymm17,ymm17,ymm26 vpaddq ymm18,ymm18,ymm27 vpaddq ymm19,ymm19,ymm28 vpcmpuq k1,ymm4,ymm1,1 vpcmpuq k2,ymm4,ymm16,1 vpcmpuq k3,ymm4,ymm17,1 vpcmpuq k4,ymm4,ymm18,1 vpcmpuq k5,ymm4,ymm19,1 kmovb r14d,k1 kmovb r13d,k2 kmovb r12d,k3 kmovb r11d,k4 kmovb r10d,k5 vpcmpuq k1,ymm4,ymm1,0 vpcmpuq k2,ymm4,ymm16,0 vpcmpuq k3,ymm4,ymm17,0 vpcmpuq k4,ymm4,ymm18,0 vpcmpuq k5,ymm4,ymm19,0 kmovb r9d,k1 kmovb r8d,k2 kmovb ebx,k3 kmovb ecx,k4 kmovb edx,k5 shl r13b,4 or r14b,r13b shl r11b,4 or r12b,r11b add r14b,r14b adc r12b,r12b adc r10b,r10b shl r8b,4 or r9b,r8b shl cl,4 or bl,cl add r14b,r9b adc r12b,bl adc r10b,dl xor r14b,r9b xor r12b,bl xor r10b,dl kmovb k1,r14d shr r14b,4 kmovb k2,r14d kmovb k3,r12d shr r12b,4 kmovb k4,r12d kmovb k5,r10d vpsubq ymm1{k1},ymm1,ymm4 vpsubq ymm16{k2},ymm16,ymm4 vpsubq ymm17{k3},ymm17,ymm4 vpsubq ymm18{k4},ymm18,ymm4 vpsubq ymm19{k5},ymm19,ymm4 vpandq ymm1,ymm1,ymm4 vpandq ymm16,ymm16,ymm4 vpandq ymm17,ymm17,ymm4 vpandq ymm18,ymm18,ymm4 vpandq ymm19,ymm19,ymm4 vpbroadcastq ymm3,r15 vpblendd ymm2,ymm2,ymm3,3 vpsrlq ymm24,ymm2,52 vpsrlq ymm25,ymm20,52 vpsrlq ymm26,ymm21,52 vpsrlq ymm27,ymm22,52 vpsrlq ymm28,ymm23,52 valignq ymm28,ymm28,ymm27,3 valignq ymm27,ymm27,ymm26,3 valignq ymm26,ymm26,ymm25,3 valignq ymm25,ymm25,ymm24,3 valignq ymm24,ymm24,ymm0,3 vpandq ymm2,ymm2,ymm4 vpandq ymm20,ymm20,ymm4 vpandq ymm21,ymm21,ymm4 vpandq ymm22,ymm22,ymm4 vpandq ymm23,ymm23,ymm4 vpaddq ymm2,ymm2,ymm24 vpaddq ymm20,ymm20,ymm25 vpaddq ymm21,ymm21,ymm26 vpaddq ymm22,ymm22,ymm27 vpaddq ymm23,ymm23,ymm28 vpcmpuq k1,ymm4,ymm2,1 vpcmpuq k2,ymm4,ymm20,1 vpcmpuq k3,ymm4,ymm21,1 vpcmpuq k4,ymm4,ymm22,1 vpcmpuq k5,ymm4,ymm23,1 kmovb r14d,k1 kmovb r13d,k2 kmovb r12d,k3 kmovb r11d,k4 kmovb r10d,k5 vpcmpuq k1,ymm4,ymm2,0 vpcmpuq k2,ymm4,ymm20,0 vpcmpuq k3,ymm4,ymm21,0 vpcmpuq k4,ymm4,ymm22,0 vpcmpuq k5,ymm4,ymm23,0 kmovb r9d,k1 kmovb r8d,k2 kmovb ebx,k3 kmovb ecx,k4 kmovb edx,k5 shl r13b,4 or r14b,r13b shl r11b,4 or r12b,r11b add r14b,r14b adc r12b,r12b adc r10b,r10b shl r8b,4 or r9b,r8b shl cl,4 or bl,cl add r14b,r9b adc r12b,bl adc r10b,dl xor r14b,r9b xor r12b,bl xor r10b,dl kmovb k1,r14d shr r14b,4 kmovb k2,r14d kmovb k3,r12d shr r12b,4 kmovb k4,r12d kmovb k5,r10d vpsubq ymm2{k1},ymm2,ymm4 vpsubq ymm20{k2},ymm20,ymm4 vpsubq ymm21{k3},ymm21,ymm4 vpsubq ymm22{k4},ymm22,ymm4 vpsubq ymm23{k5},ymm23,ymm4 vpandq ymm2,ymm2,ymm4 vpandq ymm20,ymm20,ymm4 vpandq ymm21,ymm21,ymm4 vpandq ymm22,ymm22,ymm4 vpandq ymm23,ymm23,ymm4 vmovdqu64 YMMWORD[rdi],ymm1 vmovdqu64 YMMWORD[32+rdi],ymm16 vmovdqu64 YMMWORD[64+rdi],ymm17 vmovdqu64 YMMWORD[96+rdi],ymm18 vmovdqu64 YMMWORD[128+rdi],ymm19 vmovdqu64 YMMWORD[160+rdi],ymm2 vmovdqu64 YMMWORD[192+rdi],ymm20 vmovdqu64 YMMWORD[224+rdi],ymm21 vmovdqu64 YMMWORD[256+rdi],ymm22 vmovdqu64 YMMWORD[288+rdi],ymm23 vzeroupper mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbp,QWORD[32+rsp] mov rbx,QWORD[40+rsp] lea rsp,[48+rsp] $L$rsaz_amm52x20_x2_256_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_ossl_rsaz_amm52x20_x2_256: section .text code align=64 ALIGN 32 global ossl_extract_multiplier_2x20_win5 ossl_extract_multiplier_2x20_win5: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ossl_extract_multiplier_2x20_win5: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 DB 243,15,30,250 lea rax,[rcx*4+rcx] sal rax,5 add rsi,rax vmovdqa64 ymm23,YMMWORD[$L$ones] vpbroadcastq ymm22,rdx lea rax,[10240+rsi] vpxor xmm4,xmm4,xmm4 vmovdqa64 ymm3,ymm4 vmovdqa64 ymm2,ymm4 vmovdqa64 ymm1,ymm4 vmovdqa64 ymm0,ymm4 vmovdqa64 ymm21,ymm4 ALIGN 32 $L$loop: vpcmpq k1,ymm22,ymm21,0 add rsi,320 vpaddq ymm21,ymm21,ymm23 vmovdqu64 ymm16,YMMWORD[((-320))+rsi] vmovdqu64 ymm17,YMMWORD[((-288))+rsi] vmovdqu64 ymm18,YMMWORD[((-256))+rsi] vmovdqu64 ymm19,YMMWORD[((-224))+rsi] vmovdqu64 ymm20,YMMWORD[((-192))+rsi] vpblendmq ymm0{k1},ymm0,ymm16 vpblendmq ymm1{k1},ymm1,ymm17 vpblendmq ymm2{k1},ymm2,ymm18 vpblendmq ymm3{k1},ymm3,ymm19 vpblendmq ymm4{k1},ymm4,ymm20 cmp rax,rsi jne NEAR $L$loop vmovdqu64 YMMWORD[rdi],ymm0 vmovdqu64 YMMWORD[32+rdi],ymm1 vmovdqu64 YMMWORD[64+rdi],ymm2 vmovdqu64 YMMWORD[96+rdi],ymm3 vmovdqu64 YMMWORD[128+rdi],ymm4 mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_ossl_extract_multiplier_2x20_win5: section .data data align=8 ALIGN 32 $L$ones: DQ 1,1,1,1 EXTERN __imp_RtlVirtualUnwind ALIGN 16 rsaz_def_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 $L$common_seh_tail: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi DB 0F3h,0C3h ;repret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase DD $L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase DD $L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase DD $L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase DD $L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase DD $L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase DD $L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase DD $L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_ossl_rsaz_amm52x20_x1_256: DB 9,0,0,0 DD rsaz_def_handler wrt ..imagebase DD $L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase $L$SEH_info_ossl_rsaz_amm52x20_x2_256: DB 9,0,0,0 DD rsaz_def_handler wrt ..imagebase DD $L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase $L$SEH_info_ossl_extract_multiplier_2x20_win5: DB 9,0,0,0 DD rsaz_def_handler wrt ..imagebase DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase