#include <machine/asm.h>
.text	

# ossl_rsaz_amm52x40_x1_ifma256
#
# Multiply-accumulate kernel over forty 64-bit words, built on the 52-bit
# multiply-add instructions vpmadd52luq/vpmadd52huq with 256-bit registers.
# Presumably an almost-Montgomery multiplication in radix 2^52, as the
# symbol name suggests - TODO confirm against the C prototype.
#
# Arguments as the code reads them (System V AMD64 registers):
#   rdi  destination; 320 bytes are stored at offsets 0..319
#   rsi  vector operand A, 320 bytes, multiplied by each word of B
#   rdx  operand B, read one 64-bit word per step (40 words in total)
#   rcx  vector operand M, 320 bytes, multiplied by the per-step factor
#   r8   64-bit scalar; the per-step factor is (r8 * r9) mod 2^52
# A, B and M are shorthand for this commentary only (presumably a, b and
# the modulus, with r8 = k0) - verify against the caller.
#
# Preserves rbx, rbp, r12-r15 (saved on the stack below).
# Overwrites rax, rcx, rdx, r8-r11, ymm0-ymm12, ymm23-ymm29, k1-k7, flags.
.globl	ossl_rsaz_amm52x40_x1_ifma256
.type	ossl_rsaz_amm52x40_x1_ifma256,@function
.align	32
ossl_rsaz_amm52x40_x1_ifma256:
.cfi_startproc	
# Raw encoding of endbr64.
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	# Clear the ten accumulator registers ymm3-ymm12 (40 lanes of 64 bits).
	# Register ymm0 stays zero for the whole loop.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm6
	vmovdqa64	%ymm0,%ymm7
	vmovdqa64	%ymm0,%ymm8
	vmovdqa64	%ymm0,%ymm9
	vmovdqa64	%ymm0,%ymm10
	vmovdqa64	%ymm0,%ymm11
	vmovdqa64	%ymm0,%ymm12

	# Scalar accumulator r9 carries word 0 of the running sum.
	xorl	%r9d,%r9d

	# Keep the B pointer in r11 because rdx is the implicit mulx operand.
	# Register rax holds the 52-bit mask 2^52-1.
	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax


	# Ten passes of four unrolled steps cover the 40 words of B.
	movl	$10,%ebx

.align	32
.Lloop10:
	# Step 1 of 4: B word at offset 0.
	movq	0(%r11),%r13

	# Broadcast the word to ymm1, then r10:r9 = r9 + A[0] * word (128-bit).
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	# Factor r13 = (r8 * r9) mod 2^52.
	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	# Broadcast the factor to ymm2, then r10:r9 += M[0] * factor.
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	# Drop the low 52 bits: r9 = (r10:r9) >> 52.
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	# Add the low 52 bits of each A[j] * word product into lane j.
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	# Add the low 52 bits of each M[j] * factor product into lane j.
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12


	# Shift the 40-lane accumulator down by one lane; zero enters at the top.
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	# Fold the new lane 0 into the scalar accumulator.
	vmovq	%xmm3,%r13
	addq	%r13,%r9

	# Add the high 52 bits of each A[j] * word product (after the shift).
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	# Add the high 52 bits of each M[j] * factor product.
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# Step 2 of 4: B word at offset 8; same sequence as step 1.
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12


	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# Step 3 of 4: B word at offset 16; same sequence as step 1.
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12


	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# Step 4 of 4: B word at offset 24; same sequence as step 1.
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12


	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# Advance B by the four words just consumed and count the pass.
	leaq	32(%r11),%r11
	decl	%ebx
	jne	.Lloop10

	# Replace lane 0 of the accumulator with the scalar accumulator r9.
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3



	# Normalize to 52-bit digits. First take the bits above bit 51 of
	# every lane as that lane's carry.
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm23
	vpsrlq	$52,%ymm7,%ymm24
	vpsrlq	$52,%ymm8,%ymm25
	vpsrlq	$52,%ymm9,%ymm26
	vpsrlq	$52,%ymm10,%ymm27
	vpsrlq	$52,%ymm11,%ymm28
	vpsrlq	$52,%ymm12,%ymm29


	# Move every carry up to the next lane; zero enters lane 0.
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0


	# Keep the low 52 bits of every lane.
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12


	# Add the shifted carries.
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm23,%ymm6,%ymm6
	vpaddq	%ymm24,%ymm7,%ymm7
	vpaddq	%ymm25,%ymm8,%ymm8
	vpaddq	%ymm26,%ymm9,%ymm9
	vpaddq	%ymm27,%ymm10,%ymm10
	vpaddq	%ymm28,%ymm11,%ymm11
	vpaddq	%ymm29,%ymm12,%ymm12



	# Lanes above 2^52-1 generate a carry. Gather the 40 compare bits,
	# four per register, into r14b, r13b, r12b, r11b, r10b (low to high).
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b

	# Shift that 40-bit mask left by one so each bit targets the next lane.
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b


	# Lanes equal to 2^52-1 propagate an incoming carry. Gather those
	# compare bits into r9b, r8b, dl, cl, bl (low to high).
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl

	# Adding the two masks ripples carries through runs of propagating
	# lanes; the xor below then marks the lanes that receive a carry.
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b

	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b

	# Load the 4-bit per-register masks for ymm3-ymm9 into k1-k7.
	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	# Masked subtract of 2^52-1, followed by the 52-bit mask, adds one
	# modulo 2^52 to every selected lane.
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9

	# Remaining registers ymm10-ymm12 reuse k1-k3.
	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3

	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}

	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	# Store the 40 result words to the destination.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm4,32(%rdi)
	vmovdqu64	%ymm5,64(%rdi)
	vmovdqu64	%ymm6,96(%rdi)
	vmovdqu64	%ymm7,128(%rdi)
	vmovdqu64	%ymm8,160(%rdi)
	vmovdqu64	%ymm9,192(%rdi)
	vmovdqu64	%ymm10,224(%rdi)
	vmovdqu64	%ymm11,256(%rdi)
	vmovdqu64	%ymm12,288(%rdi)

	# Clear the upper vector state, reload the six saved registers from
	# the stack and release the 48 bytes they occupied.
	vzeroupper
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:

	# Return, encoded as raw bytes (rep ret).
	.byte	0xf3,0xc3
.cfi_endproc	
.size	ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
# Four 64-bit lanes of 2^52-1. The routines in this file use it as the
# 52-bit digit mask and as the comparison/subtraction operand during
# carry normalization.
.section	.rodata
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text	

# ossl_rsaz_amm52x40_x2_ifma256
#
# Two independent multiply-accumulate problems of forty 64-bit words each,
# interleaved in one loop and built on vpmadd52luq/vpmadd52huq.
# Presumably two almost-Montgomery multiplications in radix 2^52, as the
# symbol name suggests - TODO confirm against the C prototype.
#
# Arguments as the code reads them (System V AMD64 registers):
#   rdi  destination; 640 bytes are stored (problem 1 at 0, problem 2 at 320)
#   rsi  vector operands A, 2 x 320 bytes
#   rdx  operands B, 2 x 40 words, one word of each consumed per iteration
#   rcx  vector operands M, 2 x 320 bytes
#   r8   pointer to two 64-bit scalars (offset 0 for problem 1, 8 for
#        problem 2); the per-step factor is (scalar * accumulator) mod 2^52
# A, B and M are shorthand for this commentary only (presumably a, b and
# the moduli, with r8 pointing at k0[2]) - verify against the caller.
#
# Preserves rbx, rbp, r12-r15 (saved on the stack below).
# Overwrites rax, rcx, rdx, r8-r11, ymm0-ymm29, k1-k7, flags.
.globl	ossl_rsaz_amm52x40_x2_ifma256
.type	ossl_rsaz_amm52x40_x2_ifma256,@function
.align	32
ossl_rsaz_amm52x40_x2_ifma256:
.cfi_startproc	
# Raw encoding of endbr64.
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	# Clear the problem 1 accumulator ymm3-ymm12; ymm0 stays zero in the loop.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm6
	vmovdqa64	%ymm0,%ymm7
	vmovdqa64	%ymm0,%ymm8
	vmovdqa64	%ymm0,%ymm9
	vmovdqa64	%ymm0,%ymm10
	vmovdqa64	%ymm0,%ymm11
	vmovdqa64	%ymm0,%ymm12

	# Clear the problem 2 accumulator ymm13-ymm22.
	vmovdqa64	%ymm0,%ymm13
	vmovdqa64	%ymm0,%ymm14
	vmovdqa64	%ymm0,%ymm15
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22


	# Scalar accumulators for word 0: r9 for problem 1, r15 for problem 2.
	xorl	%r9d,%r9d
	xorl	%r15d,%r15d

	# Keep the B pointer in r11 because rdx is the implicit mulx operand.
	# Register rax holds the 52-bit mask 2^52-1.
	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	# Forty iterations, one B word of each problem per iteration.
	movl	$40,%ebx

.align	32
.Lloop40:
	# Problem 1: B word at offset 0 of the current position.
	movq	0(%r11),%r13

	# Broadcast the word to ymm1, then r10:r9 = r9 + A[0] * word (128-bit).
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	# Factor r13 = (scalar at r8 + 0, times r9) mod 2^52.
	movq	(%r8),%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	# Broadcast the factor to ymm2, then r10:r9 += M[0] * factor.
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	# Drop the low 52 bits: r9 = (r10:r9) >> 52.
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	# Add the low 52 bits of each A[j] * word product into lane j.
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	# Add the low 52 bits of each M[j] * factor product into lane j.
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12


	# Shift the 40-lane accumulator down by one lane; zero enters at the top.
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	# Fold the new lane 0 into the scalar accumulator.
	vmovq	%xmm3,%r13
	addq	%r13,%r9

	# Add the high 52 bits of each A[j] * word product (after the shift).
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	# Add the high 52 bits of each M[j] * factor product.
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# Problem 2: same sequence on the data 320 bytes further on, with
	# scalar accumulator r15, scalar at r8 + 8 and lanes ymm13-ymm22.
	movq	320(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	320(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	320(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	320(%rsi),%ymm1,%ymm13
	vpmadd52luq	352(%rsi),%ymm1,%ymm14
	vpmadd52luq	384(%rsi),%ymm1,%ymm15
	vpmadd52luq	416(%rsi),%ymm1,%ymm16
	vpmadd52luq	448(%rsi),%ymm1,%ymm17
	vpmadd52luq	480(%rsi),%ymm1,%ymm18
	vpmadd52luq	512(%rsi),%ymm1,%ymm19
	vpmadd52luq	544(%rsi),%ymm1,%ymm20
	vpmadd52luq	576(%rsi),%ymm1,%ymm21
	vpmadd52luq	608(%rsi),%ymm1,%ymm22

	vpmadd52luq	320(%rcx),%ymm2,%ymm13
	vpmadd52luq	352(%rcx),%ymm2,%ymm14
	vpmadd52luq	384(%rcx),%ymm2,%ymm15
	vpmadd52luq	416(%rcx),%ymm2,%ymm16
	vpmadd52luq	448(%rcx),%ymm2,%ymm17
	vpmadd52luq	480(%rcx),%ymm2,%ymm18
	vpmadd52luq	512(%rcx),%ymm2,%ymm19
	vpmadd52luq	544(%rcx),%ymm2,%ymm20
	vpmadd52luq	576(%rcx),%ymm2,%ymm21
	vpmadd52luq	608(%rcx),%ymm2,%ymm22


	valignq	$1,%ymm13,%ymm14,%ymm13
	valignq	$1,%ymm14,%ymm15,%ymm14
	valignq	$1,%ymm15,%ymm16,%ymm15
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm20,%ymm19
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm0,%ymm22

	vmovq	%xmm13,%r13
	addq	%r13,%r15

	vpmadd52huq	320(%rsi),%ymm1,%ymm13
	vpmadd52huq	352(%rsi),%ymm1,%ymm14
	vpmadd52huq	384(%rsi),%ymm1,%ymm15
	vpmadd52huq	416(%rsi),%ymm1,%ymm16
	vpmadd52huq	448(%rsi),%ymm1,%ymm17
	vpmadd52huq	480(%rsi),%ymm1,%ymm18
	vpmadd52huq	512(%rsi),%ymm1,%ymm19
	vpmadd52huq	544(%rsi),%ymm1,%ymm20
	vpmadd52huq	576(%rsi),%ymm1,%ymm21
	vpmadd52huq	608(%rsi),%ymm1,%ymm22

	vpmadd52huq	320(%rcx),%ymm2,%ymm13
	vpmadd52huq	352(%rcx),%ymm2,%ymm14
	vpmadd52huq	384(%rcx),%ymm2,%ymm15
	vpmadd52huq	416(%rcx),%ymm2,%ymm16
	vpmadd52huq	448(%rcx),%ymm2,%ymm17
	vpmadd52huq	480(%rcx),%ymm2,%ymm18
	vpmadd52huq	512(%rcx),%ymm2,%ymm19
	vpmadd52huq	544(%rcx),%ymm2,%ymm20
	vpmadd52huq	576(%rcx),%ymm2,%ymm21
	vpmadd52huq	608(%rcx),%ymm2,%ymm22
	# Advance B by one word and count the iteration.
	leaq	8(%r11),%r11
	decl	%ebx
	jne	.Lloop40

	# Problem 1 normalization. Replace lane 0 with the scalar accumulator r9.
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3



	# Take the bits above bit 51 of every lane as that lane's carry.
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm23
	vpsrlq	$52,%ymm7,%ymm24
	vpsrlq	$52,%ymm8,%ymm25
	vpsrlq	$52,%ymm9,%ymm26
	vpsrlq	$52,%ymm10,%ymm27
	vpsrlq	$52,%ymm11,%ymm28
	vpsrlq	$52,%ymm12,%ymm29


	# Move every carry up to the next lane; zero enters lane 0.
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0


	# Keep the low 52 bits of every lane.
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12


	# Add the shifted carries.
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm23,%ymm6,%ymm6
	vpaddq	%ymm24,%ymm7,%ymm7
	vpaddq	%ymm25,%ymm8,%ymm8
	vpaddq	%ymm26,%ymm9,%ymm9
	vpaddq	%ymm27,%ymm10,%ymm10
	vpaddq	%ymm28,%ymm11,%ymm11
	vpaddq	%ymm29,%ymm12,%ymm12



	# Lanes above 2^52-1 generate a carry. Gather the 40 compare bits,
	# four per register, into r14b, r13b, r12b, r11b, r10b (low to high).
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b

	# Shift that 40-bit mask left by one so each bit targets the next lane.
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b


	# Lanes equal to 2^52-1 propagate an incoming carry. Gather those
	# compare bits into r9b, r8b, dl, cl, bl (low to high).
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl

	# Adding the two masks ripples carries through runs of propagating
	# lanes; the xor below then marks the lanes that receive a carry.
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b

	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b

	# Load the 4-bit per-register masks for ymm3-ymm9 into k1-k7.
	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	# Masked subtract of 2^52-1, followed by the 52-bit mask, adds one
	# modulo 2^52 to every selected lane.
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9

	# Remaining registers ymm10-ymm12 reuse k1-k3.
	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3

	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}

	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	# Problem 2 normalization: the same sequence applied to ymm13-ymm22,
	# starting by replacing lane 0 with the scalar accumulator r15.
	vpbroadcastq	%r15,%ymm0
	vpblendd	$3,%ymm0,%ymm13,%ymm13



	vpsrlq	$52,%ymm13,%ymm0
	vpsrlq	$52,%ymm14,%ymm1
	vpsrlq	$52,%ymm15,%ymm2
	vpsrlq	$52,%ymm16,%ymm23
	vpsrlq	$52,%ymm17,%ymm24
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26
	vpsrlq	$52,%ymm20,%ymm27
	vpsrlq	$52,%ymm21,%ymm28
	vpsrlq	$52,%ymm22,%ymm29


	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0


	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22


	vpaddq	%ymm0,%ymm13,%ymm13
	vpaddq	%ymm1,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15
	vpaddq	%ymm23,%ymm16,%ymm16
	vpaddq	%ymm24,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19
	vpaddq	%ymm27,%ymm20,%ymm20
	vpaddq	%ymm28,%ymm21,%ymm21
	vpaddq	%ymm29,%ymm22,%ymm22



	vpcmpuq	$6,.Lmask52x4(%rip),%ymm13,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm14,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm15,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm20,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm21,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm22,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b

	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b


	vpcmpuq	$0,.Lmask52x4(%rip),%ymm13,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm14,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm15,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm20,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm21,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm22,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl

	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b

	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	vpsubq	.Lmask52x4(%rip),%ymm13,%ymm13{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm14,%ymm14{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm15,%ymm15{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k7}

	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3

	vpsubq	.Lmask52x4(%rip),%ymm20,%ymm20{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm21,%ymm21{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm22,%ymm22{%k3}

	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22

	# Store the problem 1 result at offsets 0..319.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm4,32(%rdi)
	vmovdqu64	%ymm5,64(%rdi)
	vmovdqu64	%ymm6,96(%rdi)
	vmovdqu64	%ymm7,128(%rdi)
	vmovdqu64	%ymm8,160(%rdi)
	vmovdqu64	%ymm9,192(%rdi)
	vmovdqu64	%ymm10,224(%rdi)
	vmovdqu64	%ymm11,256(%rdi)
	vmovdqu64	%ymm12,288(%rdi)

	# Store the problem 2 result at offsets 320..639.
	vmovdqu64	%ymm13,320(%rdi)
	vmovdqu64	%ymm14,352(%rdi)
	vmovdqu64	%ymm15,384(%rdi)
	vmovdqu64	%ymm16,416(%rdi)
	vmovdqu64	%ymm17,448(%rdi)
	vmovdqu64	%ymm18,480(%rdi)
	vmovdqu64	%ymm19,512(%rdi)
	vmovdqu64	%ymm20,544(%rdi)
	vmovdqu64	%ymm21,576(%rdi)
	vmovdqu64	%ymm22,608(%rdi)

	# Clear the upper vector state, reload the six saved registers from
	# the stack and release the 48 bytes they occupied.
	vzeroupper
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
	# Return, encoded as raw bytes (rep ret).
	.byte	0xf3,0xc3
.cfi_endproc	
.size	ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
.text	

# ossl_extract_multiplier_2x40_win5
#
# Copies one 640-byte result out of a 32-entry table (20480 bytes, 640
# bytes per entry) by reading every entry and keeping the matching one
# through masked blends, so the set of addresses touched does not depend
# on the indices. Presumably this is for constant-time table lookup -
# verify against the caller.
#
# Arguments as the code reads them (System V AMD64 registers):
#   rdi  destination, 640 bytes
#   rsi  table base
#   rdx  entry index selecting bytes 0..319 of the result
#   rcx  entry index selecting bytes 320..639 of the result
#
# Edge cases visible in the code:
#   - An rdx that matches no entry (outside 0..31) yields zeros in the
#     first half. An rcx that matches no entry leaves the second half
#     equal to the first-half selection, because the registers are not
#     cleared between the two passes.
#   - NOTE(review): the indices are compared as full 64-bit values; confirm
#     that callers pass them zero-extended.
#
# Overwrites rax, rsi, r10, ymm0-ymm5, ymm16-ymm24, k1. Upper halves of
# ymm0-ymm15 are cleared on return, matching the other routines here.
.align	32
.globl	ossl_extract_multiplier_2x40_win5
.type	ossl_extract_multiplier_2x40_win5,@function
ossl_extract_multiplier_2x40_win5:
.cfi_startproc	
# Raw encoding of endbr64.
.byte	243,15,30,250
	# Register ymm24 = per-lane increment, ymm22/ymm23 = broadcast indices,
	# rax = address just past the last table entry.
	vmovdqa64	.Lones(%rip),%ymm24
	vpbroadcastq	%rdx,%ymm22
	vpbroadcastq	%rcx,%ymm23
	leaq	20480(%rsi),%rax


	# Remember the table base for the second pass.
	movq	%rsi,%r10


	# Clear the ten result registers and the running entry counter ymm21.
	vpxor	%xmm0,%xmm0,%xmm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vpxorq	%ymm21,%ymm21,%ymm21
.align	32
.Lloop_0:
	# Pass 1: k1 is all ones only when the counter equals the rdx index;
	# bytes 0..319 of the entry are then blended into the result registers.
	vpcmpq	$0,%ymm21,%ymm22,%k1
	vmovdqu64	0(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	32(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	64(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	96(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	128(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	160(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	192(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	224(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	256(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	288(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	# Next entry: bump the counter and the table pointer.
	vpaddq	%ymm24,%ymm21,%ymm21
	addq	$640,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop_0
	# Store the first half of the result.
	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)
	# Rewind to the table base and restart the entry counter.
	movq	%r10,%rsi
	vpxorq	%ymm21,%ymm21,%ymm21
.align	32
.Lloop_320:
	# Pass 2: same scan against the rcx index, reading bytes 320..639 of
	# every entry.
	vpcmpq	$0,%ymm21,%ymm23,%k1
	vmovdqu64	320(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	352(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	384(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	416(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	448(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	480(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	512(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	544(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	576(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	608(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	vpaddq	%ymm24,%ymm21,%ymm21
	addq	$640,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop_320
	# Store the second half of the result.
	vmovdqu64	%ymm0,320(%rdi)
	vmovdqu64	%ymm1,352(%rdi)
	vmovdqu64	%ymm2,384(%rdi)
	vmovdqu64	%ymm3,416(%rdi)
	vmovdqu64	%ymm4,448(%rdi)
	vmovdqu64	%ymm5,480(%rdi)
	vmovdqu64	%ymm16,512(%rdi)
	vmovdqu64	%ymm17,544(%rdi)
	vmovdqu64	%ymm18,576(%rdi)
	vmovdqu64	%ymm19,608(%rdi)

	# Clear the dirty upper halves of ymm0-ymm5 before handing control
	# back, as the two multiplication routines in this file already do.
	vzeroupper
	# Return, encoded as raw bytes (rep ret).
	.byte	0xf3,0xc3
.cfi_endproc	
.size	ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
# Vector constants, 32-byte aligned so .Lones can be loaded with vmovdqa64.
#   .Lones   per-lane increment for the entry counter in
#            ossl_extract_multiplier_2x40_win5
#   .Lzeros  zero fill for the lowest lane of the valignq carry shift in
#            the two multiplication routines
.section	.rodata
.align	32
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0
	# ELF note record: name size, descriptor size and note type 5, then the
	# owner name, then one property (type 0xc0000002, 4-byte value 3) padded
	# to 8 bytes. Presumably NT_GNU_PROPERTY_TYPE_0 carrying
	# GNU_PROPERTY_X86_FEATURE_1_AND with the IBT and SHSTK bits set -
	# verify against the x86-64 psABI.
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4:
