# Mirror of https://github.com/CloverHackyColor/CloverBootloader.git
# (synced 2024-12-04 13:23:26 +01:00)
# x86-64 AT&T-syntax assembly, apparently generated from OpenSSL's
# rsaz-avx512.pl perlasm module (RSAZ AVX512-IFMA Montgomery routines).
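
# ossl_rsaz_avx512ifma_eligible() tests the cached CPUID bits in
# OPENSSL_ia32cap_P (third dword) for AVX512F, AVX512DQ, AVX512IFMA and
# AVX512VL ($2149777408 = 0x80230000) and returns non-zero only when all
# four extensions are present.  Throughout this file, ".byte 243,15,30,250"
# encodes endbr64 and ".byte 0xf3,0xc3" encodes "rep ret", spelled as raw
# bytes so the file assembles with older toolchains.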
.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,@function
.align	32
ossl_rsaz_avx512ifma_eligible:
	movl	OPENSSL_ia32cap_P+8(%rip),%ecx
	xorl	%eax,%eax
	andl	$2149777408,%ecx
	cmpl	$2149777408,%ecx
	cmovel	%ecx,%eax
	.byte	0xf3,0xc3
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
.text
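
# ossl_rsaz_amm52x20_x1_256: Almost Montgomery Multiplication of 1024-bit
# operands held in a redundant 20x52-bit digit form.  Presumed C prototype
# (SysV ABI, inferred from the register usage below):
#   void ossl_rsaz_amm52x20_x1_256(uint64_t *res,      /* %rdi */
#                                  const uint64_t *a,  /* %rsi */
#                                  const uint64_t *b,  /* %rdx -> %r11 */
#                                  const uint64_t *m,  /* %rcx */
#                                  uint64_t k0);       /* %r8, -1/m mod 2^52 */
# The 20 result digits accumulate in %ymm1,%ymm16..%ymm19 (four digits per
# register); %r9 tracks the scalar low digit between iterations.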
.globl	ossl_rsaz_amm52x20_x1_256
.type	ossl_rsaz_amm52x20_x1_256,@function
.align	32
ossl_rsaz_amm52x20_x1_256:
.cfi_startproc
	.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lrsaz_amm52x20_x1_256_body:
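
# Initialize: clear the vector accumulator (%ymm1,%ymm16..%ymm19 hold the
# 20 result digits) and the scalar low-digit accumulator %r9; %r11 = b,
# %rax = 2^52-1 digit mask.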
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

	xorl	%r9d,%r9d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	movl	$5,%ebx
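
# Main loop: 5 passes, each consuming four digits of b (4x unrolled).
# Each digit step follows the word-by-word Montgomery recipe: derive the
# reduction multiplier Y from the low accumulator digit, then compute
# R = (R + a*b_i + m*Y) >> 52 digit-wise with IFMA instructions.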
.align	32
.Lloop5:
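# First of the four unrolled digit steps: fetch the next digit b_i,
# broadcast it for the IFMA lanes (%ymm3), accumulate the 128-bit product
# a[0]*b_i into %r9:%r10 with mulx, and derive the reduction multiplier
# Y = k0*acc0 mod 2^52 in %r13.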
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13
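
# Add m[0]*Y so the low 52 bits of acc0 cancel, then realign the scalar
# digit: acc0 = (acc0 >> 52) | (high64 << 12).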
	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9
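
# Vector part: accumulate the low 52-bit halves of a[j]*b_i and m[j]*Y
# into the 20-digit accumulator (%ymm1,%ymm16..%ymm19).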
	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19
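
# Shift the accumulator right by one 52-bit digit: valignq funnels each
# register with its upper neighbour, and the digit falling out of %ymm1
# is folded into the scalar acc0.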
	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9
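
# Accumulate the high 52-bit halves of the same products; together with
# the low halves above this completes R = (R + a*b_i + m*Y) >> 52.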
	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
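# Same digit step for b[1].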
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
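# Same digit step for b[2].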
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
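# Same digit step for b[3].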
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	leaq	32(%r11),%r11
	decl	%ebx
	jne	.Lloop5
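
# All 20 digits of b consumed.  Merge the scalar low digit back into the
# vector accumulator and normalize every digit to 52 bits.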
	vmovdqa64	.Lmask52x4(%rip),%ymm4

	vpbroadcastq	%r9,%ymm3
	vpblendd	$3,%ymm3,%ymm1,%ymm1
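
# Carry propagation, step 1: pull out the bits above 52 in each digit,
# move them up one digit position (valignq $3), mask each digit back to
# 52 bits, and add the carries in.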
	vpsrlq	$52,%ymm1,%ymm24
	vpsrlq	$52,%ymm16,%ymm25
	vpsrlq	$52,%ymm17,%ymm26
	vpsrlq	$52,%ymm18,%ymm27
	vpsrlq	$52,%ymm19,%ymm28

	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	vpaddq	%ymm24,%ymm1,%ymm1
	vpaddq	%ymm25,%ymm16,%ymm16
	vpaddq	%ymm26,%ymm17,%ymm17
	vpaddq	%ymm27,%ymm18,%ymm18
	vpaddq	%ymm28,%ymm19,%ymm19
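
# Step 2: a digit may still exceed 2^52-1.  Build per-digit "overflow"
# (digit > mask) and "saturated" (digit == mask) bitmaps in GPRs, shift
# the overflow map up one digit, and let byte adds ripple the carry
# through saturated runs; the xor leaves exactly the digits that must be
# bumped by one.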
	vpcmpuq	$1,%ymm1,%ymm4,%k1
	vpcmpuq	$1,%ymm16,%ymm4,%k2
	vpcmpuq	$1,%ymm17,%ymm4,%k3
	vpcmpuq	$1,%ymm18,%ymm4,%k4
	vpcmpuq	$1,%ymm19,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,%ymm1,%ymm4,%k1
	vpcmpuq	$0,%ymm16,%ymm4,%k2
	vpcmpuq	$0,%ymm17,%ymm4,%k3
	vpcmpuq	$0,%ymm18,%ymm4,%k4
	vpcmpuq	$0,%ymm19,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5
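
# Apply the increments: digit - (2^52-1) == digit + 1 - 2^52, so after
# the final re-mask this bumps exactly the marked digits by one.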
	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
	vpsubq	%ymm4,%ymm19,%ymm19{%k5}

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19
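
# Store the 20-digit result.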
	vmovdqu64	%ymm1,(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lrsaz_amm52x20_x1_256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
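
# 4 x (2^52 - 1): per-lane digit mask used by the normalization code.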
.data
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text
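
# ossl_rsaz_amm52x20_x2_256: same algorithm, but two independent 20-digit
# multiplications interleaved (used, e.g., for the two CRT halves of an
# RSA-2048 private-key operation).  Operand sets are laid out back to back
# (second set at byte offset 160); %r8 here points at a k0[2] array.
# Second accumulator set: %ymm2,%ymm20..%ymm23 plus scalar digit %r15.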
.globl	ossl_rsaz_amm52x20_x2_256
.type	ossl_rsaz_amm52x20_x2_256,@function
.align	32
ossl_rsaz_amm52x20_x2_256:
.cfi_startproc
	.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lrsaz_amm52x20_x2_256_body:
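
# Clear both accumulator sets and both scalar low digits (%r9, %r15).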
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22
	vmovdqa64	%ymm0,%ymm23

	xorl	%r9d,%r9d
	xorl	%r15d,%r15d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	movl	$20,%ebx
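
# .Lloop20: 20 iterations, one digit of each of the two multiplications
# per pass (no unrolling; note the leaq 8(%r11) at the loop tail).  The
# first operand's digit step mirrors the x1 code above.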
.align	32
.Lloop20:
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
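# Same digit step for the second operand set (offsets +160 bytes, k0[1],
# accumulator %ymm2,%ymm20..%ymm23 / %r15).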
	movq	160(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	160(%rsi),%ymm3,%ymm2
	vpmadd52luq	192(%rsi),%ymm3,%ymm20
	vpmadd52luq	224(%rsi),%ymm3,%ymm21
	vpmadd52luq	256(%rsi),%ymm3,%ymm22
	vpmadd52luq	288(%rsi),%ymm3,%ymm23

	vpmadd52luq	160(%rcx),%ymm4,%ymm2
	vpmadd52luq	192(%rcx),%ymm4,%ymm20
	vpmadd52luq	224(%rcx),%ymm4,%ymm21
	vpmadd52luq	256(%rcx),%ymm4,%ymm22
	vpmadd52luq	288(%rcx),%ymm4,%ymm23

	valignq	$1,%ymm2,%ymm20,%ymm2
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm23,%ymm22
	valignq	$1,%ymm23,%ymm0,%ymm23

	vmovq	%xmm2,%r13
	addq	%r13,%r15

	vpmadd52huq	160(%rsi),%ymm3,%ymm2
	vpmadd52huq	192(%rsi),%ymm3,%ymm20
	vpmadd52huq	224(%rsi),%ymm3,%ymm21
	vpmadd52huq	256(%rsi),%ymm3,%ymm22
	vpmadd52huq	288(%rsi),%ymm3,%ymm23

	vpmadd52huq	160(%rcx),%ymm4,%ymm2
	vpmadd52huq	192(%rcx),%ymm4,%ymm20
	vpmadd52huq	224(%rcx),%ymm4,%ymm21
	vpmadd52huq	256(%rcx),%ymm4,%ymm22
	vpmadd52huq	288(%rcx),%ymm4,%ymm23
	leaq	8(%r11),%r11
	decl	%ebx
	jne	.Lloop20
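
# Normalize the first result, exactly as in the x1 routine.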
	vmovdqa64	.Lmask52x4(%rip),%ymm4

	vpbroadcastq	%r9,%ymm3
	vpblendd	$3,%ymm3,%ymm1,%ymm1

	vpsrlq	$52,%ymm1,%ymm24
	vpsrlq	$52,%ymm16,%ymm25
	vpsrlq	$52,%ymm17,%ymm26
	vpsrlq	$52,%ymm18,%ymm27
	vpsrlq	$52,%ymm19,%ymm28

	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	vpaddq	%ymm24,%ymm1,%ymm1
	vpaddq	%ymm25,%ymm16,%ymm16
	vpaddq	%ymm26,%ymm17,%ymm17
	vpaddq	%ymm27,%ymm18,%ymm18
	vpaddq	%ymm28,%ymm19,%ymm19

	vpcmpuq	$1,%ymm1,%ymm4,%k1
	vpcmpuq	$1,%ymm16,%ymm4,%k2
	vpcmpuq	$1,%ymm17,%ymm4,%k3
	vpcmpuq	$1,%ymm18,%ymm4,%k4
	vpcmpuq	$1,%ymm19,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,%ymm1,%ymm4,%k1
	vpcmpuq	$0,%ymm16,%ymm4,%k2
	vpcmpuq	$0,%ymm17,%ymm4,%k3
	vpcmpuq	$0,%ymm18,%ymm4,%k4
	vpcmpuq	$0,%ymm19,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
	vpsubq	%ymm4,%ymm19,%ymm19{%k5}

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19
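
# Normalize the second result the same way.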
	vpbroadcastq	%r15,%ymm3
	vpblendd	$3,%ymm3,%ymm2,%ymm2

	vpsrlq	$52,%ymm2,%ymm24
	vpsrlq	$52,%ymm20,%ymm25
	vpsrlq	$52,%ymm21,%ymm26
	vpsrlq	$52,%ymm22,%ymm27
	vpsrlq	$52,%ymm23,%ymm28

	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24

	vpandq	%ymm4,%ymm2,%ymm2
	vpandq	%ymm4,%ymm20,%ymm20
	vpandq	%ymm4,%ymm21,%ymm21
	vpandq	%ymm4,%ymm22,%ymm22
	vpandq	%ymm4,%ymm23,%ymm23

	vpaddq	%ymm24,%ymm2,%ymm2
	vpaddq	%ymm25,%ymm20,%ymm20
	vpaddq	%ymm26,%ymm21,%ymm21
	vpaddq	%ymm27,%ymm22,%ymm22
	vpaddq	%ymm28,%ymm23,%ymm23

	vpcmpuq	$1,%ymm2,%ymm4,%k1
	vpcmpuq	$1,%ymm20,%ymm4,%k2
	vpcmpuq	$1,%ymm21,%ymm4,%k3
	vpcmpuq	$1,%ymm22,%ymm4,%k4
	vpcmpuq	$1,%ymm23,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,%ymm2,%ymm4,%k1
	vpcmpuq	$0,%ymm20,%ymm4,%k2
	vpcmpuq	$0,%ymm21,%ymm4,%k3
	vpcmpuq	$0,%ymm22,%ymm4,%k4
	vpcmpuq	$0,%ymm23,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	%ymm4,%ymm2,%ymm2{%k1}
	vpsubq	%ymm4,%ymm20,%ymm20{%k2}
	vpsubq	%ymm4,%ymm21,%ymm21{%k3}
	vpsubq	%ymm4,%ymm22,%ymm22{%k4}
	vpsubq	%ymm4,%ymm23,%ymm23{%k5}

	vpandq	%ymm4,%ymm2,%ymm2
	vpandq	%ymm4,%ymm20,%ymm20
	vpandq	%ymm4,%ymm21,%ymm21
	vpandq	%ymm4,%ymm22,%ymm22
	vpandq	%ymm4,%ymm23,%ymm23
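
# Store both 20-digit results, 160 bytes apart.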
	vmovdqu64	%ymm1,(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vmovdqu64	%ymm2,160(%rdi)
	vmovdqu64	%ymm20,192(%rdi)
	vmovdqu64	%ymm21,224(%rdi)
	vmovdqu64	%ymm22,256(%rdi)
	vmovdqu64	%ymm23,288(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lrsaz_amm52x20_x2_256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
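
# ossl_extract_multiplier_2x20_win5: constant-time table lookup for the
# 5-bit-window exponentiation.  It scans all 32 table entries (320 bytes
# = 2x20 digits each; the loop bound is 10240 = 32*320) and blends in the
# entry whose position equals the index in %rdx, so the memory access
# pattern is independent of the secret index.  %rcx appears to pre-offset
# the table base by %rcx*160 bytes.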
.text

.align	32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
	.byte	243,15,30,250
	leaq	(%rcx,%rcx,4),%rax
	salq	$5,%rax
	addq	%rax,%rsi

	vmovdqa64	.Lones(%rip),%ymm23
	vpbroadcastq	%rdx,%ymm22
	leaq	10240(%rsi),%rax

	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqa64	%ymm4,%ymm3
	vmovdqa64	%ymm4,%ymm2
	vmovdqa64	%ymm4,%ymm1
	vmovdqa64	%ymm4,%ymm0
	vmovdqa64	%ymm4,%ymm21
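
# Compare the running counter (%ymm21) with the wanted index (%ymm22),
# blend the current entry into the output registers when they match,
# and advance to the next entry.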
.align	32
.Lloop:
	vpcmpq	$0,%ymm21,%ymm22,%k1
	addq	$320,%rsi
	vpaddq	%ymm23,%ymm21,%ymm21
	vmovdqu64	-320(%rsi),%ymm16
	vmovdqu64	-288(%rsi),%ymm17
	vmovdqu64	-256(%rsi),%ymm18
	vmovdqu64	-224(%rsi),%ymm19
	vmovdqu64	-192(%rsi),%ymm20
	vpblendmq	%ymm16,%ymm0,%ymm0{%k1}
	vpblendmq	%ymm17,%ymm1,%ymm1{%k1}
	vpblendmq	%ymm18,%ymm2,%ymm2{%k1}
	vpblendmq	%ymm19,%ymm3,%ymm3{%k1}
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	cmpq	%rsi,%rax
	jne	.Lloop
	vmovdqu64	%ymm0,(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
.data
.align	32
.Lones:
.quad	1,1,1,1
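
# GNU property note marking the object as IBT- and SHSTK-compatible
# (GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002, value 3 = IBT|SHSTK).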
.section ".note.gnu.property", "a"
.p2align	3
.long	1f - 0f
.long	4f - 1f
.long	5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte	0x47
.byte	0x4e
.byte	0x55
.byte	0
1:
.p2align	3
.long	0xc0000002
.long	3f - 2f
2:
.long	3
3:
.p2align	3
4: