#include "arm_asm.h"
// Copyright 2022-2025  The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
//
// ChaCha20 for ARMv8 via SVE
//
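// NOTE: the two remarks below mention $output and $flavour, which are not
// defined anywhere in this file; they appear to be carried over from the
// script that generated this assembly and describe that script's
// command-line arguments rather than anything in the code that follows.
// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file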
#include "arm_arch.h"

.arch	armv8-a


.hidden	OPENSSL_armcap_P

.text

// Read-only constant pool for ChaCha20_ctr32_sve.
//
// The pool is aligned with ".align 5" (the argument is a power of two on
// AArch64, i.e. 32 bytes) and is described to the linker as the object
// symbol _chacha_sve_consts.  The code addresses the data through the two
// local labels below rather than through that symbol.
.section	.rodata
.align	5
.type	_chacha_sve_consts,%object
_chacha_sve_consts:
// ChaCha20 constant words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
// (the ASCII text "expand 32-byte k" read as little-endian 32-bit words),
// packed two per 64-bit quad.  The function fetches them with a 64-bit
// ldp into x23/x24 and then uses the low word (w23/w24) and the high word
// (lsr #32) separately, so emitting them as .quad gives the same register
// contents whatever the target byte order is.
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Seed values for a per-lane index vector.  On the path taken when the
// ARMV8_SVE2 bit is not set in OPENSSL_armcap_P, the function loads the
// first two words into w9/w10 and executes "index z31.s,w9,w10", so that
// 32-bit lane i of z31 holds 0x02010003 + i*0x04040404.
// NOTE(review): judging by the label name and the byte pattern this is
// presumably a TBL byte-shuffle mask that rotates each 32-bit word by 8
// bits; the consumer of z31 on that path, and any user of the last two
// words, is outside the code visible here -- confirm before changing.
.Lrot8:
.word	0x02010003,0x04040404,0x02010003,0x04040404
// Record the pool size: distance from _chacha_sve_consts to this point.
.size	_chacha_sve_consts,.-_chacha_sve_consts

// Switch back to the section that was active before the .section above.
.previous

.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align	5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
.inst	0x04a0e3e5	//cntw x5, ALL, MUL #1
	cmp	x2,x5,lsl #6
	b.lt	.Lreturn
	mov	x7,0
	adrp	x6,OPENSSL_armcap_P
	ldr	w6,[x6,#:lo12:OPENSSL_armcap_P]
	tst	w6,#ARMV8_SVE2
	b.eq	1f
	mov	x7,1
	b	2f
1:
	cmp	x5,4
	b.le	.Lreturn
	adrp	x6,.Lrot8
	add	x6,x6,#:lo12:.Lrot8
	ldp	w9,w10,[x6]
.inst	0x04aa4d3f	//index z31.s,w9,w10
2:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,-192]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x16,x17,[sp,64]
	stp	x18,x19,[sp,80]
	stp	x20,x21,[sp,96]
	stp	x22,x23,[sp,112]
	stp	x24,x25,[sp,128]
	stp	x26,x27,[sp,144]
	stp	x28,x29,[sp,160]
	str	x30,[sp,176]

	adrp	x6,.Lchacha20_consts
	add	x6,x6,#:lo12:.Lchacha20_consts
	ldp	x23,x24,[x6]
	ldp	x25,x26,[x3]
	ldp	x27,x28,[x3, 16]
	ldp	x29,x30,[x4]
.inst	0x2599e3e0	//ptrues p0.s,ALL
#ifdef	__AARCH64EB__
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x29,x29,#32
	ror	x30,x30,#32
#endif
	cbz	x7, 1f
.align	5
100:
	subs	x7,x2,x5,lsl #6
	b.lt	110f
	mov	x2,x7
	b.eq	101f
	cmp	x2,64
	b.lt	101f
	mixin=1
	lsr	x8,x23,#32
.inst	0x05a03ae0	//dup z0.s,w23
.inst	0x05a03af9	//dup z25.s,w23
.if	mixin == 1
	mov	w7,w23
.endif
.inst	0x05a03904	//dup z4.s,w8
.inst	0x05a0391a	//dup z26.s,w8
	lsr	x10,x24,#32
.inst	0x05a03b08	//dup z8.s,w24
.inst	0x05a03b1b	//dup z27.s,w24
.if	mixin == 1
	mov	w9,w24
.endif
.inst	0x05a0394c	//dup z12.s,w10
.inst	0x05a0395c	//dup z28.s,w10
	lsr	x12,x25,#32
.inst	0x05a03b21	//dup z1.s,w25
.inst	0x05a03b3d	//dup z29.s,w25
.if	mixin == 1
	mov	w11,w25
.endif
.inst	0x05a03985	//dup z5.s,w12
.inst	0x05a0399e	//dup z30.s,w12
	lsr	x14,x26,#32
.inst	0x05a03b49	//dup z9.s,w26
.inst	0x05a03b55	//dup z21.s,w26
.if	mixin == 1
	mov	w13,w26
.endif
.inst	0x05a039cd	//dup z13.s,w14
.inst	0x05a039d6	//dup z22.s,w14
	lsr	x16,x27,#32
.inst	0x05a03b62	//dup z2.s,w27
.inst	0x05a03b77	//dup z23.s,w27
.if	mixin == 1
	mov	w15,w27
.endif
.inst	0x05a03a06	//dup z6.s,w16
.inst	0x05a03a18	//dup z24.s,w16
	lsr	x18,x28,#32
.inst	0x05a03b8a	//dup z10.s,w28
.inst	0x05a03b91	//dup z17.s,w28
.if	mixin == 1
	mov	w17,w28
.endif
.inst	0x05a03a4e	//dup z14.s,w18
.inst	0x05a03a52	//dup z18.s,w18
	lsr	x22,x30,#32
.inst	0x05a03bcb	//dup z11.s,w30
.inst	0x05a03bd4	//dup z20.s,w30
.if	mixin == 1
	mov	w21,w30
.endif
.inst	0x05a03acf	//dup z15.s,w22
.inst	0x05a03adf	//dup z31.s,w22
.if	mixin == 1
	add	w20,w29,#1
	mov	w19,w29
.inst	0x04a14690	//index z16.s,w20,1
.inst	0x04a14683	//index z3.s,w20,1
.else
.inst	0x04a147b0	//index z16.s,w29,1
.inst	0x04a147a3	//index z3.s,w29,1
.endif
	lsr	x20,x29,#32
.inst	0x05a03a87	//dup z7.s,w20
.inst	0x05a03a93	//dup z19.s,w20
	mov	x6,#10
10:
.align	5
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04703403	//xar z3.s,z3.s,z0.s,16
.if	mixin == 1
	ror	w19,w19,16
.endif
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04703487	//xar z7.s,z7.s,z4.s,16
.if	mixin == 1
	ror	w20,w20,16
.endif
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x0470350b	//xar z11.s,z11.s,z8.s,16
.if	mixin == 1
	ror	w21,w21,16
.endif
.if	mixin == 1
	eor	w22,w22,w10
.endif
.inst	0x0470358f	//xar z15.s,z15.s,z12.s,16
.if	mixin == 1
	ror	w22,w22,16
.endif
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x046c3441	//xar z1.s,z1.s,z2.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x046c34c5	//xar z5.s,z5.s,z6.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x046c3549	//xar z9.s,z9.s,z10.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.if	mixin == 1
	eor	w14,w14,w18
.endif
.inst	0x046c35cd	//xar z13.s,z13.s,z14.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04683403	//xar z3.s,z3.s,z0.s,24
.if	mixin == 1
	ror	w19,w19,24
.endif
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04683487	//xar z7.s,z7.s,z4.s,24
.if	mixin == 1
	ror	w20,w20,24
.endif
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x0468350b	//xar z11.s,z11.s,z8.s,24
.if	mixin == 1
	ror	w21,w21,24
.endif
.if	mixin == 1
	eor	w22,w22,w10
.endif
.inst	0x0468358f	//xar z15.s,z15.s,z12.s,24
.if	mixin == 1
	ror	w22,w22,24
.endif
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x04673441	//xar z1.s,z1.s,z2.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x046734c5	//xar z5.s,z5.s,z6.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x04673549	//xar z9.s,z9.s,z10.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.if	mixin == 1
	eor	w14,w14,w18
.endif
.inst	0x046735cd	//xar z13.s,z13.s,z14.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x0470340f	//xar z15.s,z15.s,z0.s,16
.if	mixin == 1
	ror	w22,w22,16
.endif
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04703483	//xar z3.s,z3.s,z4.s,16
.if	mixin == 1
	ror	w19,w19,16
.endif
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04703507	//xar z7.s,z7.s,z8.s,16
.if	mixin == 1
	ror	w20,w20,16
.endif
.if	mixin == 1
	eor	w21,w21,w10
.endif
.inst	0x0470358b	//xar z11.s,z11.s,z12.s,16
.if	mixin == 1
	ror	w21,w21,16
.endif
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x046c3545	//xar z5.s,z5.s,z10.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x046c35c9	//xar z9.s,z9.s,z14.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x046c344d	//xar z13.s,z13.s,z2.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.if	mixin == 1
	eor	w11,w11,w16
.endif
.inst	0x046c34c1	//xar z1.s,z1.s,z6.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x0468340f	//xar z15.s,z15.s,z0.s,24
.if	mixin == 1
	ror	w22,w22,24
.endif
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04683483	//xar z3.s,z3.s,z4.s,24
.if	mixin == 1
	ror	w19,w19,24
.endif
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04683507	//xar z7.s,z7.s,z8.s,24
.if	mixin == 1
	ror	w20,w20,24
.endif
.if	mixin == 1
	eor	w21,w21,w10
.endif
.inst	0x0468358b	//xar z11.s,z11.s,z12.s,24
.if	mixin == 1
	ror	w21,w21,24
.endif
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x04673545	//xar z5.s,z5.s,z10.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x046735c9	//xar z9.s,z9.s,z14.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x0467344d	//xar z13.s,z13.s,z2.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.if	mixin == 1
	eor	w11,w11,w16
.endif
.inst	0x046734c1	//xar z1.s,z1.s,z6.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
	sub	x6,x6,1
	cbnz	x6,10b
.if	mixin == 1
	add	w7,w7,w23
.endif
.inst	0x04b90000	//add z0.s,z0.s,z25.s
.if	mixin == 1
	add	x8,x8,x23,lsr #32
.endif
.inst	0x04ba0084	//add z4.s,z4.s,z26.s
.if	mixin == 1
	add	x7,x7,x8,lsl #32  // pack
.endif
.if	mixin == 1
	add	w9,w9,w24
.endif
.inst	0x04bb0108	//add z8.s,z8.s,z27.s
.if	mixin == 1
	add	x10,x10,x24,lsr #32
.endif
.inst	0x04bc018c	//add z12.s,z12.s,z28.s
.if	mixin == 1
	add	x9,x9,x10,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x8,x10,[x1],#16
.endif
.if	mixin == 1
	add	w11,w11,w25
.endif
.inst	0x04bd0021	//add z1.s,z1.s,z29.s
.if	mixin == 1
	add	x12,x12,x25,lsr #32
.endif
.inst	0x04be00a5	//add z5.s,z5.s,z30.s
.if	mixin == 1
	add	x11,x11,x12,lsl #32  // pack
.endif
.if	mixin == 1
	add	w13,w13,w26
.endif
.inst	0x04b50129	//add z9.s,z9.s,z21.s
.if	mixin == 1
	add	x14,x14,x26,lsr #32
.endif
.inst	0x04b601ad	//add z13.s,z13.s,z22.s
.if	mixin == 1
	add	x13,x13,x14,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x12,x14,[x1],#16
.endif
.if	mixin == 1
	add	w15,w15,w27
.endif
.inst	0x04b70042	//add z2.s,z2.s,z23.s
.if	mixin == 1
	add	x16,x16,x27,lsr #32
.endif
.inst	0x04b800c6	//add z6.s,z6.s,z24.s
.if	mixin == 1
	add	x15,x15,x16,lsl #32  // pack
.endif
.if	mixin == 1
	add	w17,w17,w28
.endif
.inst	0x04b1014a	//add z10.s,z10.s,z17.s
.if	mixin == 1
	add	x18,x18,x28,lsr #32
.endif
.inst	0x04b201ce	//add z14.s,z14.s,z18.s
.if	mixin == 1
	add	x17,x17,x18,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x16,x18,[x1],#16
.endif
.if	mixin == 1
	add	w19,w19,w29
.endif
.inst	0x04b00063	//add z3.s,z3.s,z16.s
.if	mixin == 1
	add	x20,x20,x29,lsr #32
.endif
.inst	0x04b300e7	//add z7.s,z7.s,z19.s
.if	mixin == 1
	add	x19,x19,x20,lsl #32  // pack
.endif
.if	mixin == 1
	add	w21,w21,w30
.endif
.inst	0x04b4016b	//add z11.s,z11.s,z20.s
.if	mixin == 1
	add	x22,x22,x30,lsr #32
.endif
.inst	0x04bf01ef	//add z15.s,z15.s,z31.s
.if	mixin == 1
	add	x21,x21,x22,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x20,x22,[x1],#16
.endif
#ifdef	__AARCH64EB__
	rev	x7,x7
.inst	0x05a48000	//revb z0.s,p0/m,z0.s
.inst	0x05a48084	//revb z4.s,p0/m,z4.s
	rev	x9,x9
.inst	0x05a48108	//revb z8.s,p0/m,z8.s
.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
	rev	x11,x11
.inst	0x05a48021	//revb z1.s,p0/m,z1.s
.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
	rev	x13,x13
.inst	0x05a48129	//revb z9.s,p0/m,z9.s
.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
	rev	x15,x15
.inst	0x05a48042	//revb z2.s,p0/m,z2.s
.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
	rev	x17,x17
.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
	rev	x19,x19
.inst	0x05a48063	//revb z3.s,p0/m,z3.s
.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
	rev	x21,x21
.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
#endif
.if	mixin == 1
	add	x29,x29,#1
.endif
	cmp	x5,4
	b.ne	200f
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.if	mixin == 1
	eor	x11,x11,x12
.endif
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x13,x13,x14
.endif
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23021	//eor z1.d,z1.d,z18.d
.inst	0x04b33042	//eor z2.d,z2.d,z19.d
.inst	0x04b43063	//eor z3.d,z3.d,z20.d
.inst	0x04b53084	//eor z4.d,z4.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13108	//eor z8.d,z8.d,z17.d
.inst	0x04b23129	//eor z9.d,z9.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	b	210f
200:
.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s

.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d

.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s

.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s

.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d

.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x11,x11,x12
.endif
.if	mixin == 1
	eor	x13,x13,x14
.endif
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23084	//eor z4.d,z4.d,z18.d
.inst	0x04b33108	//eor z8.d,z8.d,z19.d
.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
.inst	0x04b53021	//eor z1.d,z1.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b73129	//eor z9.d,z9.d,z23.d
.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13042	//eor z2.d,z2.d,z17.d
.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b53063	//eor z3.d,z3.d,z21.d
.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
210:
.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
	subs	x2,x2,64
	b.gt	100b
	b	110f
101:
	mixin=0
	lsr	x8,x23,#32
.inst	0x05a03ae0	//dup z0.s,w23
.inst	0x05a03af9	//dup z25.s,w23
.if	mixin == 1
	mov	w7,w23
.endif
.inst	0x05a03904	//dup z4.s,w8
.inst	0x05a0391a	//dup z26.s,w8
	lsr	x10,x24,#32
.inst	0x05a03b08	//dup z8.s,w24
.inst	0x05a03b1b	//dup z27.s,w24
.if	mixin == 1
	mov	w9,w24
.endif
.inst	0x05a0394c	//dup z12.s,w10
.inst	0x05a0395c	//dup z28.s,w10
	lsr	x12,x25,#32
.inst	0x05a03b21	//dup z1.s,w25
.inst	0x05a03b3d	//dup z29.s,w25
.if	mixin == 1
	mov	w11,w25
.endif
.inst	0x05a03985	//dup z5.s,w12
.inst	0x05a0399e	//dup z30.s,w12
	lsr	x14,x26,#32
.inst	0x05a03b49	//dup z9.s,w26
.inst	0x05a03b55	//dup z21.s,w26
.if	mixin == 1
	mov	w13,w26
.endif
.inst	0x05a039cd	//dup z13.s,w14
.inst	0x05a039d6	//dup z22.s,w14
	lsr	x16,x27,#32
.inst	0x05a03b62	//dup z2.s,w27
.inst	0x05a03b77	//dup z23.s,w27
.if	mixin == 1
	mov	w15,w27
.endif
.inst	0x05a03a06	//dup z6.s,w16
.inst	0x05a03a18	//dup z24.s,w16
	lsr	x18,x28,#32
.inst	0x05a03b8a	//dup z10.s,w28
.inst	0x05a03b91	//dup z17.s,w28
.if	mixin == 1
	mov	w17,w28
.endif
.inst	0x05a03a4e	//dup z14.s,w18
.inst	0x05a03a52	//dup z18.s,w18
	lsr	x22,x30,#32
.inst	0x05a03bcb	//dup z11.s,w30
.inst	0x05a03bd4	//dup z20.s,w30
.if	mixin == 1
	mov	w21,w30
.endif
.inst	0x05a03acf	//dup z15.s,w22
.inst	0x05a03adf	//dup z31.s,w22
.if	mixin == 1
	add	w20,w29,#1
	mov	w19,w29
.inst	0x04a14690	//index z16.s,w20,1
.inst	0x04a14683	//index z3.s,w20,1
.else
.inst	0x04a147b0	//index z16.s,w29,1
.inst	0x04a147a3	//index z3.s,w29,1
.endif
	lsr	x20,x29,#32
.inst	0x05a03a87	//dup z7.s,w20
.inst	0x05a03a93	//dup z19.s,w20
	mov	x6,#10
10:
.align	5
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04703403	//xar z3.s,z3.s,z0.s,16
.if	mixin == 1
	ror	w19,w19,16
.endif
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04703487	//xar z7.s,z7.s,z4.s,16
.if	mixin == 1
	ror	w20,w20,16
.endif
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x0470350b	//xar z11.s,z11.s,z8.s,16
.if	mixin == 1
	ror	w21,w21,16
.endif
.if	mixin == 1
	eor	w22,w22,w10
.endif
.inst	0x0470358f	//xar z15.s,z15.s,z12.s,16
.if	mixin == 1
	ror	w22,w22,16
.endif
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x046c3441	//xar z1.s,z1.s,z2.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x046c34c5	//xar z5.s,z5.s,z6.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x046c3549	//xar z9.s,z9.s,z10.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.if	mixin == 1
	eor	w14,w14,w18
.endif
.inst	0x046c35cd	//xar z13.s,z13.s,z14.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04683403	//xar z3.s,z3.s,z0.s,24
.if	mixin == 1
	ror	w19,w19,24
.endif
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04683487	//xar z7.s,z7.s,z4.s,24
.if	mixin == 1
	ror	w20,w20,24
.endif
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x0468350b	//xar z11.s,z11.s,z8.s,24
.if	mixin == 1
	ror	w21,w21,24
.endif
.if	mixin == 1
	eor	w22,w22,w10
.endif
.inst	0x0468358f	//xar z15.s,z15.s,z12.s,24
.if	mixin == 1
	ror	w22,w22,24
.endif
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x04673441	//xar z1.s,z1.s,z2.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x046734c5	//xar z5.s,z5.s,z6.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x04673549	//xar z9.s,z9.s,z10.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.if	mixin == 1
	eor	w14,w14,w18
.endif
.inst	0x046735cd	//xar z13.s,z13.s,z14.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x0470340f	//xar z15.s,z15.s,z0.s,16
.if	mixin == 1
	ror	w22,w22,16
.endif
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04703483	//xar z3.s,z3.s,z4.s,16
.if	mixin == 1
	ror	w19,w19,16
.endif
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04703507	//xar z7.s,z7.s,z8.s,16
.if	mixin == 1
	ror	w20,w20,16
.endif
.if	mixin == 1
	eor	w21,w21,w10
.endif
.inst	0x0470358b	//xar z11.s,z11.s,z12.s,16
.if	mixin == 1
	ror	w21,w21,16
.endif
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x046c3545	//xar z5.s,z5.s,z10.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x046c35c9	//xar z9.s,z9.s,z14.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x046c344d	//xar z13.s,z13.s,z2.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.if	mixin == 1
	eor	w11,w11,w16
.endif
.inst	0x046c34c1	//xar z1.s,z1.s,z6.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x0468340f	//xar z15.s,z15.s,z0.s,24
.if	mixin == 1
	ror	w22,w22,24
.endif
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04683483	//xar z3.s,z3.s,z4.s,24
.if	mixin == 1
	ror	w19,w19,24
.endif
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04683507	//xar z7.s,z7.s,z8.s,24
.if	mixin == 1
	ror	w20,w20,24
.endif
.if	mixin == 1
	eor	w21,w21,w10
.endif
.inst	0x0468358b	//xar z11.s,z11.s,z12.s,24
.if	mixin == 1
	ror	w21,w21,24
.endif
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x04673545	//xar z5.s,z5.s,z10.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x046735c9	//xar z9.s,z9.s,z14.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x0467344d	//xar z13.s,z13.s,z2.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.if	mixin == 1
	eor	w11,w11,w16
.endif
.inst	0x046734c1	//xar z1.s,z1.s,z6.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
	sub	x6,x6,1
	cbnz	x6,10b
.if	mixin == 1
	add	w7,w7,w23
.endif
.inst	0x04b90000	//add z0.s,z0.s,z25.s
.if	mixin == 1
	add	x8,x8,x23,lsr #32
.endif
.inst	0x04ba0084	//add z4.s,z4.s,z26.s
.if	mixin == 1
	add	x7,x7,x8,lsl #32  // pack
.endif
.if	mixin == 1
	add	w9,w9,w24
.endif
.inst	0x04bb0108	//add z8.s,z8.s,z27.s
.if	mixin == 1
	add	x10,x10,x24,lsr #32
.endif
.inst	0x04bc018c	//add z12.s,z12.s,z28.s
.if	mixin == 1
	add	x9,x9,x10,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x8,x10,[x1],#16
.endif
.if	mixin == 1
	add	w11,w11,w25
.endif
.inst	0x04bd0021	//add z1.s,z1.s,z29.s
.if	mixin == 1
	add	x12,x12,x25,lsr #32
.endif
.inst	0x04be00a5	//add z5.s,z5.s,z30.s
.if	mixin == 1
	add	x11,x11,x12,lsl #32  // pack
.endif
.if	mixin == 1
	add	w13,w13,w26
.endif
.inst	0x04b50129	//add z9.s,z9.s,z21.s
.if	mixin == 1
	add	x14,x14,x26,lsr #32
.endif
.inst	0x04b601ad	//add z13.s,z13.s,z22.s
.if	mixin == 1
	add	x13,x13,x14,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x12,x14,[x1],#16
.endif
.if	mixin == 1
	add	w15,w15,w27
.endif
.inst	0x04b70042	//add z2.s,z2.s,z23.s
.if	mixin == 1
	add	x16,x16,x27,lsr #32
.endif
.inst	0x04b800c6	//add z6.s,z6.s,z24.s
.if	mixin == 1
	add	x15,x15,x16,lsl #32  // pack
.endif
.if	mixin == 1
	add	w17,w17,w28
.endif
.inst	0x04b1014a	//add z10.s,z10.s,z17.s
.if	mixin == 1
	add	x18,x18,x28,lsr #32
.endif
.inst	0x04b201ce	//add z14.s,z14.s,z18.s
.if	mixin == 1
	add	x17,x17,x18,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x16,x18,[x1],#16
.endif
.if	mixin == 1
	add	w19,w19,w29
.endif
.inst	0x04b00063	//add z3.s,z3.s,z16.s
.if	mixin == 1
	add	x20,x20,x29,lsr #32
.endif
.inst	0x04b300e7	//add z7.s,z7.s,z19.s
.if	mixin == 1
	add	x19,x19,x20,lsl #32  // pack
.endif
.if	mixin == 1
	add	w21,w21,w30
.endif
.inst	0x04b4016b	//add z11.s,z11.s,z20.s
.if	mixin == 1
	add	x22,x22,x30,lsr #32
.endif
.inst	0x04bf01ef	//add z15.s,z15.s,z31.s
.if	mixin == 1
	add	x21,x21,x22,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x20,x22,[x1],#16
.endif
#ifdef	__AARCH64EB__
	rev	x7,x7
.inst	0x05a48000	//revb z0.s,p0/m,z0.s
.inst	0x05a48084	//revb z4.s,p0/m,z4.s
	rev	x9,x9
.inst	0x05a48108	//revb z8.s,p0/m,z8.s
.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
	rev	x11,x11
.inst	0x05a48021	//revb z1.s,p0/m,z1.s
.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
	rev	x13,x13
.inst	0x05a48129	//revb z9.s,p0/m,z9.s
.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
	rev	x15,x15
.inst	0x05a48042	//revb z2.s,p0/m,z2.s
.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
	rev	x17,x17
.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
	rev	x19,x19
.inst	0x05a48063	//revb z3.s,p0/m,z3.s
.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
	rev	x21,x21
.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
#endif
.if	mixin == 1
	add	x29,x29,#1
.endif
	cmp	x5,4
	b.ne	200f
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.if	mixin == 1
	eor	x11,x11,x12
.endif
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x13,x13,x14
.endif
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23021	//eor z1.d,z1.d,z18.d
.inst	0x04b33042	//eor z2.d,z2.d,z19.d
.inst	0x04b43063	//eor z3.d,z3.d,z20.d
.inst	0x04b53084	//eor z4.d,z4.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13108	//eor z8.d,z8.d,z17.d
.inst	0x04b23129	//eor z9.d,z9.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	b	210f
200:
.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s

.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d

.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s

.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s

.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d

.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x11,x11,x12
.endif
.if	mixin == 1
	eor	x13,x13,x14
.endif
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23084	//eor z4.d,z4.d,z18.d
.inst	0x04b33108	//eor z8.d,z8.d,z19.d
.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
.inst	0x04b53021	//eor z1.d,z1.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b73129	//eor z9.d,z9.d,z23.d
.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13042	//eor z2.d,z2.d,z17.d
.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b53063	//eor z3.d,z3.d,z21.d
.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
// Join point of the two output paths above: the path taken when x5 == 4
// arrives through "b 210f", the generic vector-length path falls through
// from its final addvl.
210:
// Advance the 64-bit value in x29 by the number of 32-bit lanes in one
// vector register.
.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
// Local label 110 is where the "b.lt 110f" at the top of the 100: loop lands
// once the remaining length no longer covers a full vector batch.
110:
// Jump over the alternate code path that begins at the "1:" label below.
// NOTE(review): the "2:" target is further down the file, outside this
// excerpt - presumably the common epilogue; confirm.
	b	2f
// Entry of the code path selected by "cbz x7, 1f" at function entry, i.e.
// when the ARMV8_SVE2 bit is not set in OPENSSL_armcap_P. On this path z31
// was filled from the .Lrot8 words and is used as a byte-permute table by
// the tbl instructions in the round loop.
1:
.align	5
// Top of the main loop.
//   x2 = remaining length, x5 = number of 32-bit lanes per vector (cntw).
//   x5 * 64 is the number of bytes one vector batch produces.
// NOTE(review): x2 is presumably the byte length argument - confirm against
// the C prototype.
100:
	// x7 = x2 - x5 * 64; flags decide which variant (if any) runs.
	subs	x7,x2,x5,lsl #6
	// Not enough input left for a full vector batch: leave the loop.
	b.lt	110f
	// Commit the reduced length.
	mov	x2,x7
	// Nothing left after this batch: run the vector-only variant at 101.
	b.eq	101f
	// Fewer than 64 bytes left after this batch: there is no room for the
	// additional scalar block, so also use the vector-only variant.
	cmp	x2,64
	b.lt	101f
	// Assembler symbol: with mixin == 1 every ".if mixin == 1" section
	// below is assembled. Those sections compute one extra block in the
	// general registers w7-w22, interleaved with the vector instructions.
	mixin=1
	// Broadcast the sixteen 32-bit state words held in x23-x30 (two words
	// per register) across all lanes. Word n of the state lives in:
	//   words  0- 3: z0  z4  z8  z12   (x23, x24 - loaded from
	//                                   .Lchacha20_consts)
	//   words  4- 7: z1  z5  z9  z13   (x25, x26 - loaded from [x3])
	//   words  8-11: z2  z6  z10 z14   (x27, x28 - loaded from [x3,16])
	//   words 12-15: z3  z7  z11 z15   (x29, x30 - loaded from [x4])
	// Second copies (z25-z30, z21-z24, z16) keep the initial values for
	// the add-back after the rounds. The scalar block uses w7, w9, ...
	// for the low halves and w8, w10, ... (via lsr) for the high halves.
	lsr	x8,x23,#32
.inst	0x05a03ae0	//dup z0.s,w23
.inst	0x05a03af9	//dup z25.s,w23
.if	mixin == 1
	mov	w7,w23
.endif
.inst	0x05a03904	//dup z4.s,w8
.inst	0x05a0391a	//dup z26.s,w8
	lsr	x10,x24,#32
.inst	0x05a03b08	//dup z8.s,w24
.inst	0x05a03b1b	//dup z27.s,w24
.if	mixin == 1
	mov	w9,w24
.endif
.inst	0x05a0394c	//dup z12.s,w10
.inst	0x05a0395c	//dup z28.s,w10
	lsr	x12,x25,#32
.inst	0x05a03b21	//dup z1.s,w25
.inst	0x05a03b3d	//dup z29.s,w25
.if	mixin == 1
	mov	w11,w25
.endif
.inst	0x05a03985	//dup z5.s,w12
.inst	0x05a0399e	//dup z30.s,w12
	lsr	x14,x26,#32
.inst	0x05a03b49	//dup z9.s,w26
.inst	0x05a03b55	//dup z21.s,w26
.if	mixin == 1
	mov	w13,w26
.endif
.inst	0x05a039cd	//dup z13.s,w14
.inst	0x05a039d6	//dup z22.s,w14
	lsr	x16,x27,#32
.inst	0x05a03b62	//dup z2.s,w27
.inst	0x05a03b77	//dup z23.s,w27
.if	mixin == 1
	mov	w15,w27
.endif
.inst	0x05a03a06	//dup z6.s,w16
.inst	0x05a03a18	//dup z24.s,w16
	// From here on no backup vector is written: the words taken from
	// x28, the high half of x29 and x30 are broadcast again after the
	// round loop instead.
	lsr	x18,x28,#32
.inst	0x05a03b8a	//dup z10.s,w28
.if	mixin == 1
	mov	w17,w28
.endif
.inst	0x05a03a4e	//dup z14.s,w18
	lsr	x22,x30,#32
.inst	0x05a03bcb	//dup z11.s,w30
.if	mixin == 1
	mov	w21,w30
.endif
.inst	0x05a03acf	//dup z15.s,w22
	// Word 12 differs per lane: index writes base, base+1, base+2, ...
	// With the scalar block mixed in, the scalar block takes the value in
	// w29 (copied to w19) and the vector lanes start at w29 + 1.
	// z16 keeps the initial per-lane values for the add-back.
.if	mixin == 1
	add	w20,w29,#1
	mov	w19,w29
.inst	0x04a14690	//index z16.s,w20,1
.inst	0x04a14683	//index z3.s,w20,1
.else
.inst	0x04a147b0	//index z16.s,w29,1
.inst	0x04a147a3	//index z3.s,w29,1
.endif
	// x20 was only a temporary for the index base above; it now receives
	// the high half of x29 (state word 13).
	lsr	x20,x29,#32
.inst	0x05a03a87	//dup z7.s,w20
	// x6 counts iterations of the 10: loop; each iteration performs one
	// column round followed by one diagonal round.
	mov	x6,#10
// Round-loop entry. The alignment directive comes before the label so that
// the label marks the aligned address: the backward branch "cbnz x6,10b"
// then lands directly on the first loop instruction, and any padding the
// assembler inserts is executed only once on the initial fall-through
// rather than on every iteration.
.align	5
10:
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
.inst	0x04a03063	//eor z3.d,z3.d,z0.d
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
.if	mixin == 1
	eor	w22,w22,w10
.endif
.inst	0x05a58063	//revh z3.s,p0/m,z3.s
.if	mixin == 1
	ror	w19,w19,#16
.endif
.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
.if	mixin == 1
	ror	w20,w20,#16
.endif
.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
.if	mixin == 1
	ror	w21,w21,#16
.endif
.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
.if	mixin == 1
	ror	w22,w22,#16
.endif
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
.inst	0x04a23021	//eor z1.d,z1.d,z2.d
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
.if	mixin == 1
	eor	w14,w14,w18
.endif
.inst	0x046c9c31	//lsl z17.s,z1.s,12
.inst	0x046c9cb2	//lsl z18.s,z5.s,12
.inst	0x046c9d33	//lsl z19.s,z9.s,12
.inst	0x046c9db4	//lsl z20.s,z13.s,12
.inst	0x046c9421	//lsr z1.s,z1.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.inst	0x046c94a5	//lsr z5.s,z5.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.inst	0x046c9529	//lsr z9.s,z9.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.inst	0x046c95ad	//lsr z13.s,z13.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.inst	0x04713021	//orr z1.d,z1.d,z17.d
.inst	0x047230a5	//orr z5.d,z5.d,z18.d
.inst	0x04733129	//orr z9.d,z9.d,z19.d
.inst	0x047431ad	//orr z13.d,z13.d,z20.d
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
.inst	0x04a03063	//eor z3.d,z3.d,z0.d
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
.if	mixin == 1
	eor	w22,w22,w10
.endif
.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
.if	mixin == 1
	ror	w19,w19,#24
.endif
.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
.if	mixin == 1
	ror	w20,w20,#24
.endif
.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
.if	mixin == 1
	ror	w21,w21,#24
.endif
.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
.if	mixin == 1
	ror	w22,w22,#24
.endif
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
.inst	0x04a23021	//eor z1.d,z1.d,z2.d
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
.if	mixin == 1
	eor	w14,w14,w18
.endif
.inst	0x04679c31	//lsl z17.s,z1.s,7
.inst	0x04679cb2	//lsl z18.s,z5.s,7
.inst	0x04679d33	//lsl z19.s,z9.s,7
.inst	0x04679db4	//lsl z20.s,z13.s,7
.inst	0x04679421	//lsr z1.s,z1.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
.inst	0x046794a5	//lsr z5.s,z5.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.inst	0x04679529	//lsr z9.s,z9.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.inst	0x046795ad	//lsr z13.s,z13.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.inst	0x04713021	//orr z1.d,z1.d,z17.d
.inst	0x047230a5	//orr z5.d,z5.d,z18.d
.inst	0x04733129	//orr z9.d,z9.d,z19.d
.inst	0x047431ad	//orr z13.d,z13.d,z20.d
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x04a43063	//eor z3.d,z3.d,z4.d
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
.if	mixin == 1
	eor	w21,w21,w10
.endif
.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
.if	mixin == 1
	ror	w22,w22,#16
.endif
.inst	0x05a58063	//revh z3.s,p0/m,z3.s
.if	mixin == 1
	ror	w19,w19,#16
.endif
.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
.if	mixin == 1
	ror	w20,w20,#16
.endif
.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
.if	mixin == 1
	ror	w21,w21,#16
.endif
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x04a63021	//eor z1.d,z1.d,z6.d
.if	mixin == 1
	eor	w11,w11,w16
.endif
.inst	0x046c9cb1	//lsl z17.s,z5.s,12
.inst	0x046c9d32	//lsl z18.s,z9.s,12
.inst	0x046c9db3	//lsl z19.s,z13.s,12
.inst	0x046c9c34	//lsl z20.s,z1.s,12
.inst	0x046c94a5	//lsr z5.s,z5.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.inst	0x046c9529	//lsr z9.s,z9.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.inst	0x046c95ad	//lsr z13.s,z13.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.inst	0x046c9421	//lsr z1.s,z1.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.inst	0x047130a5	//orr z5.d,z5.d,z17.d
.inst	0x04723129	//orr z9.d,z9.d,z18.d
.inst	0x047331ad	//orr z13.d,z13.d,z19.d
.inst	0x04743021	//orr z1.d,z1.d,z20.d
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x04a43063	//eor z3.d,z3.d,z4.d
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
.if	mixin == 1
	eor	w21,w21,w10
.endif
.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
.if	mixin == 1
	ror	w22,w22,#24
.endif
.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
.if	mixin == 1
	ror	w19,w19,#24
.endif
.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
.if	mixin == 1
	ror	w20,w20,#24
.endif
.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
.if	mixin == 1
	ror	w21,w21,#24
.endif
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x04a63021	//eor z1.d,z1.d,z6.d
.if	mixin == 1
	eor	w11,w11,w16
.endif
.inst	0x04679cb1	//lsl z17.s,z5.s,7
.inst	0x04679d32	//lsl z18.s,z9.s,7
.inst	0x04679db3	//lsl z19.s,z13.s,7
.inst	0x04679c34	//lsl z20.s,z1.s,7
.inst	0x046794a5	//lsr z5.s,z5.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.inst	0x04679529	//lsr z9.s,z9.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.inst	0x046795ad	//lsr z13.s,z13.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.inst	0x04679421	//lsr z1.s,z1.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
.inst	0x047130a5	//orr z5.d,z5.d,z17.d
.inst	0x04723129	//orr z9.d,z9.d,z18.d
.inst	0x047331ad	//orr z13.d,z13.d,z19.d
.inst	0x04743021	//orr z1.d,z1.d,z20.d
	// One column round plus one diagonal round done; x6 was initialised
	// to 10, so the loop body runs ten times in total.
	sub	x6,x6,1
	cbnz	x6,10b
	// Re-create the initial values of the state words that had no backup
	// vector during the rounds. x6 is free now and serves as a scratch
	// register for the high halves.
	//   z17 = low half of x28, z18 = high half of x28
	//   z19 = high half of x29
	lsr	x6,x28,#32
.inst	0x05a03b91	//dup z17.s,w28
.inst	0x05a038d2	//dup z18.s,w6
	lsr	x6,x29,#32
.inst	0x05a038d3	//dup z19.s,w6
	// High half of x30; broadcast into z25 further down, after the
	// original content of z25 has been added back into z0.
	lsr	x6,x30,#32
.if	mixin == 1
	add	w7,w7,w23
.endif
.inst	0x04b90000	//add z0.s,z0.s,z25.s
.if	mixin == 1
	add	x8,x8,x23,lsr #32
.endif
.inst	0x04ba0084	//add z4.s,z4.s,z26.s
.if	mixin == 1
	add	x7,x7,x8,lsl #32  // pack
.endif
.if	mixin == 1
	add	w9,w9,w24
.endif
.inst	0x04bb0108	//add z8.s,z8.s,z27.s
.if	mixin == 1
	add	x10,x10,x24,lsr #32
.endif
.inst	0x04bc018c	//add z12.s,z12.s,z28.s
.if	mixin == 1
	add	x9,x9,x10,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x8,x10,[x1],#16
.endif
.if	mixin == 1
	add	w11,w11,w25
.endif
.inst	0x04bd0021	//add z1.s,z1.s,z29.s
.if	mixin == 1
	add	x12,x12,x25,lsr #32
.endif
.inst	0x04be00a5	//add z5.s,z5.s,z30.s
.if	mixin == 1
	add	x11,x11,x12,lsl #32  // pack
.endif
.if	mixin == 1
	add	w13,w13,w26
.endif
.inst	0x04b50129	//add z9.s,z9.s,z21.s
.if	mixin == 1
	add	x14,x14,x26,lsr #32
.endif
.inst	0x04b601ad	//add z13.s,z13.s,z22.s
.if	mixin == 1
	add	x13,x13,x14,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x12,x14,[x1],#16
.endif
.if	mixin == 1
	add	w15,w15,w27
.endif
.inst	0x04b70042	//add z2.s,z2.s,z23.s
.if	mixin == 1
	add	x16,x16,x27,lsr #32
.endif
.inst	0x04b800c6	//add z6.s,z6.s,z24.s
.if	mixin == 1
	add	x15,x15,x16,lsl #32  // pack
.endif
.if	mixin == 1
	add	w17,w17,w28
.endif
.inst	0x04b1014a	//add z10.s,z10.s,z17.s
.if	mixin == 1
	add	x18,x18,x28,lsr #32
.endif
.inst	0x04b201ce	//add z14.s,z14.s,z18.s
.if	mixin == 1
	add	x17,x17,x18,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x16,x18,[x1],#16
.endif
.inst	0x05a03bd4	//dup z20.s,w30
.inst	0x05a038d9	//dup z25.s,w6	// bak[15] not available for SVE
.if	mixin == 1
	add	w19,w19,w29
.endif
.inst	0x04b00063	//add z3.s,z3.s,z16.s
.if	mixin == 1
	add	x20,x20,x29,lsr #32
.endif
.inst	0x04b300e7	//add z7.s,z7.s,z19.s
.if	mixin == 1
	add	x19,x19,x20,lsl #32  // pack
.endif
.if	mixin == 1
	add	w21,w21,w30
.endif
.inst	0x04b4016b	//add z11.s,z11.s,z20.s
.if	mixin == 1
	add	x22,x22,x30,lsr #32
.endif
.inst	0x04b901ef	//add z15.s,z15.s,z25.s
.if	mixin == 1
	add	x21,x21,x22,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x20,x22,[x1],#16
.endif
#ifdef	__AARCH64EB__
	rev	x7,x7
.inst	0x05a48000	//revb z0.s,p0/m,z0.s
.inst	0x05a48084	//revb z4.s,p0/m,z4.s
	rev	x9,x9
.inst	0x05a48108	//revb z8.s,p0/m,z8.s
.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
	rev	x11,x11
.inst	0x05a48021	//revb z1.s,p0/m,z1.s
.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
	rev	x13,x13
.inst	0x05a48129	//revb z9.s,p0/m,z9.s
.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
	rev	x15,x15
.inst	0x05a48042	//revb z2.s,p0/m,z2.s
.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
	rev	x17,x17
.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
	rev	x19,x19
.inst	0x05a48063	//revb z3.s,p0/m,z3.s
.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
	rev	x21,x21
.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
#endif
// The scalar block consumed one counter value (w29 was copied into w19
// during setup), so step x29 past it. The per-lane counter values are
// accounted for separately by the incw at label 210.
.if	mixin == 1
	add	x29,x29,#1
.endif
	// x5 holds the number of 32-bit lanes per vector (cntw at function
	// entry). Exactly 4 lanes means 128-bit vectors: the code that follows
	// then transposes and moves data with ld1/st1 on the v registers.
	// Any other vector length uses the ld1w/st1w based path at 200.
	cmp	x5,4
	b.ne	200f
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.if	mixin == 1
	eor	x11,x11,x12
.endif
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x13,x13,x14
.endif
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23021	//eor z1.d,z1.d,z18.d
.inst	0x04b33042	//eor z2.d,z2.d,z19.d
.inst	0x04b43063	//eor z3.d,z3.d,z20.d
.inst	0x04b53084	//eor z4.d,z4.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13108	//eor z8.d,z8.d,z17.d
.inst	0x04b23129	//eor z9.d,z9.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	b	210f
200:
.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s

.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d

.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s

.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s

.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d

.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x11,x11,x12
.endif
.if	mixin == 1
	eor	x13,x13,x14
.endif
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23084	//eor z4.d,z4.d,z18.d
.inst	0x04b33108	//eor z8.d,z8.d,z19.d
.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
.inst	0x04b53021	//eor z1.d,z1.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b73129	//eor z9.d,z9.d,z23.d
.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13042	//eor z2.d,z2.d,z17.d
.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b53063	//eor z3.d,z3.d,z21.d
.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
// Join point of the two output paths above: the path taken when x5 == 4
// arrives through "b 210f", the generic vector-length path falls through
// from its final addvl.
210:
// Advance the 64-bit value in x29 by the number of 32-bit lanes in one
// vector register.
.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
	// Account for the 64 bytes written by the scalar block (four stp
	// stores of 16 bytes each); the vector batch was already subtracted
	// at the top of the loop.
	subs	x2,x2,64
	// More input left: go round the 100: loop again.
	b.gt	100b
	// Otherwise skip the vector-only variant below.
	b	110f
// Vector-only variant, entered from the length checks at the top of the
// loop. With mixin == 0 none of the ".if mixin == 1" sections that follow
// are assembled, so no scalar block is computed here.
101:
	mixin=0
	lsr	x8,x23,#32
.inst	0x05a03ae0	//dup z0.s,w23
.inst	0x05a03af9	//dup z25.s,w23
.if	mixin == 1
	mov	w7,w23
.endif
.inst	0x05a03904	//dup z4.s,w8
.inst	0x05a0391a	//dup z26.s,w8
	lsr	x10,x24,#32
.inst	0x05a03b08	//dup z8.s,w24
.inst	0x05a03b1b	//dup z27.s,w24
.if	mixin == 1
	mov	w9,w24
.endif
.inst	0x05a0394c	//dup z12.s,w10
.inst	0x05a0395c	//dup z28.s,w10
	lsr	x12,x25,#32
.inst	0x05a03b21	//dup z1.s,w25
.inst	0x05a03b3d	//dup z29.s,w25
.if	mixin == 1
	mov	w11,w25
.endif
.inst	0x05a03985	//dup z5.s,w12
.inst	0x05a0399e	//dup z30.s,w12
	lsr	x14,x26,#32
.inst	0x05a03b49	//dup z9.s,w26
.inst	0x05a03b55	//dup z21.s,w26
.if	mixin == 1
	mov	w13,w26
.endif
.inst	0x05a039cd	//dup z13.s,w14
.inst	0x05a039d6	//dup z22.s,w14
	lsr	x16,x27,#32
.inst	0x05a03b62	//dup z2.s,w27
.inst	0x05a03b77	//dup z23.s,w27
.if	mixin == 1
	mov	w15,w27
.endif
.inst	0x05a03a06	//dup z6.s,w16
.inst	0x05a03a18	//dup z24.s,w16
	lsr	x18,x28,#32
.inst	0x05a03b8a	//dup z10.s,w28
.if	mixin == 1
	mov	w17,w28
.endif
.inst	0x05a03a4e	//dup z14.s,w18
	lsr	x22,x30,#32
.inst	0x05a03bcb	//dup z11.s,w30
.if	mixin == 1
	mov	w21,w30
.endif
.inst	0x05a03acf	//dup z15.s,w22
.if	mixin == 1
	add	w20,w29,#1
	mov	w19,w29
.inst	0x04a14690	//index z16.s,w20,1
.inst	0x04a14683	//index z3.s,w20,1
.else
.inst	0x04a147b0	//index z16.s,w29,1
.inst	0x04a147a3	//index z3.s,w29,1
.endif
	lsr	x20,x29,#32
.inst	0x05a03a87	//dup z7.s,w20
	mov	x6,#10
// Round-loop entry. The alignment directive comes before the label so that
// the label marks the aligned address: the backward branch to "10b" at the
// bottom of the loop then lands directly on the first loop instruction, and
// any padding the assembler inserts is executed only once on the initial
// fall-through rather than on every iteration.
.align	5
10:
// Body of the round loop: one column round followed by one diagonal round,
// applied to every 32-bit lane at once.  Each round is four quarter-rounds
// issued step by step in parallel, on register groups (a,b,c,d):
//
//   a += b; d ^= a; d = rotl(d,16);
//   c += d; b ^= c; b = rotl(b,12);
//   a += b; d ^= a; d = rotl(d,8);
//   c += d; b ^= c; b = rotl(b,7);
//
// The ".if mixin == 1" regions carry the same step for one extra block held
// in scalar registers w7..w22; they are assembled only when the assembler
// symbol mixin is 1.
//
// ---- Column round -------------------------------------------------------
//   a = z0,z4,z8,z12   b = z1,z5,z9,z13   c = z2,z6,z10,z14   d = z3,z7,z11,z15
//
// a += b
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
// d ^= a
.inst	0x04a03063	//eor z3.d,z3.d,z0.d
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
.if	mixin == 1
	eor	w22,w22,w10
.endif
// d = rotl(d,16): revh swaps the two halfwords of every 32-bit element
.inst	0x05a58063	//revh z3.s,p0/m,z3.s
.if	mixin == 1
	ror	w19,w19,#16
.endif
.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
.if	mixin == 1
	ror	w20,w20,#16
.endif
.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
.if	mixin == 1
	ror	w21,w21,#16
.endif
.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
.if	mixin == 1
	ror	w22,w22,#16
.endif
// c += d
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
// b ^= c
.inst	0x04a23021	//eor z1.d,z1.d,z2.d
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
.if	mixin == 1
	eor	w14,w14,w18
.endif
// b = rotl(b,12): (b << 12) | (b >> 20), using z17..z20 as temporaries
.inst	0x046c9c31	//lsl z17.s,z1.s,12
.inst	0x046c9cb2	//lsl z18.s,z5.s,12
.inst	0x046c9d33	//lsl z19.s,z9.s,12
.inst	0x046c9db4	//lsl z20.s,z13.s,12
.inst	0x046c9421	//lsr z1.s,z1.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.inst	0x046c94a5	//lsr z5.s,z5.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.inst	0x046c9529	//lsr z9.s,z9.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.inst	0x046c95ad	//lsr z13.s,z13.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.inst	0x04713021	//orr z1.d,z1.d,z17.d
.inst	0x047230a5	//orr z5.d,z5.d,z18.d
.inst	0x04733129	//orr z9.d,z9.d,z19.d
.inst	0x047431ad	//orr z13.d,z13.d,z20.d
// a += b
.inst	0x04a10000	//add z0.s,z0.s,z1.s
.if	mixin == 1
	add	w7,w7,w11
.endif
.inst	0x04a50084	//add z4.s,z4.s,z5.s
.if	mixin == 1
	add	w8,w8,w12
.endif
.inst	0x04a90108	//add z8.s,z8.s,z9.s
.if	mixin == 1
	add	w9,w9,w13
.endif
.inst	0x04ad018c	//add z12.s,z12.s,z13.s
.if	mixin == 1
	add	w10,w10,w14
.endif
// d ^= a
.inst	0x04a03063	//eor z3.d,z3.d,z0.d
.if	mixin == 1
	eor	w19,w19,w7
.endif
.inst	0x04a430e7	//eor z7.d,z7.d,z4.d
.if	mixin == 1
	eor	w20,w20,w8
.endif
.inst	0x04a8316b	//eor z11.d,z11.d,z8.d
.if	mixin == 1
	eor	w21,w21,w9
.endif
.inst	0x04ac31ef	//eor z15.d,z15.d,z12.d
.if	mixin == 1
	eor	w22,w22,w10
.endif
// d = rotl(d,8): byte permutation through tbl with the index vector z31.
// NOTE(review): z31 is expected to hold the permutation built from .Lrot8
// earlier in the function - confirm it is initialised on every path that
// reaches this code.
.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
.if	mixin == 1
	ror	w19,w19,#24
.endif
.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
.if	mixin == 1
	ror	w20,w20,#24
.endif
.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
.if	mixin == 1
	ror	w21,w21,#24
.endif
.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
.if	mixin == 1
	ror	w22,w22,#24
.endif
// c += d
.inst	0x04a30042	//add z2.s,z2.s,z3.s
.if	mixin == 1
	add	w15,w15,w19
.endif
.inst	0x04a700c6	//add z6.s,z6.s,z7.s
.if	mixin == 1
	add	w16,w16,w20
.endif
.inst	0x04ab014a	//add z10.s,z10.s,z11.s
.if	mixin == 1
	add	w17,w17,w21
.endif
.inst	0x04af01ce	//add z14.s,z14.s,z15.s
.if	mixin == 1
	add	w18,w18,w22
.endif
// b ^= c
.inst	0x04a23021	//eor z1.d,z1.d,z2.d
.if	mixin == 1
	eor	w11,w11,w15
.endif
.inst	0x04a630a5	//eor z5.d,z5.d,z6.d
.if	mixin == 1
	eor	w12,w12,w16
.endif
.inst	0x04aa3129	//eor z9.d,z9.d,z10.d
.if	mixin == 1
	eor	w13,w13,w17
.endif
.inst	0x04ae31ad	//eor z13.d,z13.d,z14.d
.if	mixin == 1
	eor	w14,w14,w18
.endif
// b = rotl(b,7): (b << 7) | (b >> 25), using z17..z20 as temporaries
.inst	0x04679c31	//lsl z17.s,z1.s,7
.inst	0x04679cb2	//lsl z18.s,z5.s,7
.inst	0x04679d33	//lsl z19.s,z9.s,7
.inst	0x04679db4	//lsl z20.s,z13.s,7
.inst	0x04679421	//lsr z1.s,z1.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
.inst	0x046794a5	//lsr z5.s,z5.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.inst	0x04679529	//lsr z9.s,z9.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.inst	0x046795ad	//lsr z13.s,z13.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.inst	0x04713021	//orr z1.d,z1.d,z17.d
.inst	0x047230a5	//orr z5.d,z5.d,z18.d
.inst	0x04733129	//orr z9.d,z9.d,z19.d
.inst	0x047431ad	//orr z13.d,z13.d,z20.d
// ---- Diagonal round -----------------------------------------------------
// Same sequence with the b, c and d groups rotated by one, two and three
// positions:
//   a = z0,z4,z8,z12   b = z5,z9,z13,z1   c = z10,z14,z2,z6   d = z15,z3,z7,z11
//
// a += b
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
// d ^= a
.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x04a43063	//eor z3.d,z3.d,z4.d
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
.if	mixin == 1
	eor	w21,w21,w10
.endif
// d = rotl(d,16)
.inst	0x05a581ef	//revh z15.s,p0/m,z15.s
.if	mixin == 1
	ror	w22,w22,#16
.endif
.inst	0x05a58063	//revh z3.s,p0/m,z3.s
.if	mixin == 1
	ror	w19,w19,#16
.endif
.inst	0x05a580e7	//revh z7.s,p0/m,z7.s
.if	mixin == 1
	ror	w20,w20,#16
.endif
.inst	0x05a5816b	//revh z11.s,p0/m,z11.s
.if	mixin == 1
	ror	w21,w21,#16
.endif
// c += d
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
// b ^= c
.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x04a63021	//eor z1.d,z1.d,z6.d
.if	mixin == 1
	eor	w11,w11,w16
.endif
// b = rotl(b,12)
.inst	0x046c9cb1	//lsl z17.s,z5.s,12
.inst	0x046c9d32	//lsl z18.s,z9.s,12
.inst	0x046c9db3	//lsl z19.s,z13.s,12
.inst	0x046c9c34	//lsl z20.s,z1.s,12
.inst	0x046c94a5	//lsr z5.s,z5.s,20
.if	mixin == 1
	ror	w12,w12,20
.endif
.inst	0x046c9529	//lsr z9.s,z9.s,20
.if	mixin == 1
	ror	w13,w13,20
.endif
.inst	0x046c95ad	//lsr z13.s,z13.s,20
.if	mixin == 1
	ror	w14,w14,20
.endif
.inst	0x046c9421	//lsr z1.s,z1.s,20
.if	mixin == 1
	ror	w11,w11,20
.endif
.inst	0x047130a5	//orr z5.d,z5.d,z17.d
.inst	0x04723129	//orr z9.d,z9.d,z18.d
.inst	0x047331ad	//orr z13.d,z13.d,z19.d
.inst	0x04743021	//orr z1.d,z1.d,z20.d
// a += b
.inst	0x04a50000	//add z0.s,z0.s,z5.s
.if	mixin == 1
	add	w7,w7,w12
.endif
.inst	0x04a90084	//add z4.s,z4.s,z9.s
.if	mixin == 1
	add	w8,w8,w13
.endif
.inst	0x04ad0108	//add z8.s,z8.s,z13.s
.if	mixin == 1
	add	w9,w9,w14
.endif
.inst	0x04a1018c	//add z12.s,z12.s,z1.s
.if	mixin == 1
	add	w10,w10,w11
.endif
// d ^= a
.inst	0x04a031ef	//eor z15.d,z15.d,z0.d
.if	mixin == 1
	eor	w22,w22,w7
.endif
.inst	0x04a43063	//eor z3.d,z3.d,z4.d
.if	mixin == 1
	eor	w19,w19,w8
.endif
.inst	0x04a830e7	//eor z7.d,z7.d,z8.d
.if	mixin == 1
	eor	w20,w20,w9
.endif
.inst	0x04ac316b	//eor z11.d,z11.d,z12.d
.if	mixin == 1
	eor	w21,w21,w10
.endif
// d = rotl(d,8)
.inst	0x053f31ef	//tbl z15.b,{z15.b},z31.b
.if	mixin == 1
	ror	w22,w22,#24
.endif
.inst	0x053f3063	//tbl z3.b,{z3.b},z31.b
.if	mixin == 1
	ror	w19,w19,#24
.endif
.inst	0x053f30e7	//tbl z7.b,{z7.b},z31.b
.if	mixin == 1
	ror	w20,w20,#24
.endif
.inst	0x053f316b	//tbl z11.b,{z11.b},z31.b
.if	mixin == 1
	ror	w21,w21,#24
.endif
// c += d
.inst	0x04af014a	//add z10.s,z10.s,z15.s
.if	mixin == 1
	add	w17,w17,w22
.endif
.inst	0x04a301ce	//add z14.s,z14.s,z3.s
.if	mixin == 1
	add	w18,w18,w19
.endif
.inst	0x04a70042	//add z2.s,z2.s,z7.s
.if	mixin == 1
	add	w15,w15,w20
.endif
.inst	0x04ab00c6	//add z6.s,z6.s,z11.s
.if	mixin == 1
	add	w16,w16,w21
.endif
// b ^= c
.inst	0x04aa30a5	//eor z5.d,z5.d,z10.d
.if	mixin == 1
	eor	w12,w12,w17
.endif
.inst	0x04ae3129	//eor z9.d,z9.d,z14.d
.if	mixin == 1
	eor	w13,w13,w18
.endif
.inst	0x04a231ad	//eor z13.d,z13.d,z2.d
.if	mixin == 1
	eor	w14,w14,w15
.endif
.inst	0x04a63021	//eor z1.d,z1.d,z6.d
.if	mixin == 1
	eor	w11,w11,w16
.endif
// b = rotl(b,7)
.inst	0x04679cb1	//lsl z17.s,z5.s,7
.inst	0x04679d32	//lsl z18.s,z9.s,7
.inst	0x04679db3	//lsl z19.s,z13.s,7
.inst	0x04679c34	//lsl z20.s,z1.s,7
.inst	0x046794a5	//lsr z5.s,z5.s,25
.if	mixin == 1
	ror	w12,w12,25
.endif
.inst	0x04679529	//lsr z9.s,z9.s,25
.if	mixin == 1
	ror	w13,w13,25
.endif
.inst	0x046795ad	//lsr z13.s,z13.s,25
.if	mixin == 1
	ror	w14,w14,25
.endif
.inst	0x04679421	//lsr z1.s,z1.s,25
.if	mixin == 1
	ror	w11,w11,25
.endif
.inst	0x047130a5	//orr z5.d,z5.d,z17.d
.inst	0x04723129	//orr z9.d,z9.d,z18.d
.inst	0x047331ad	//orr z13.d,z13.d,z19.d
.inst	0x04743021	//orr z1.d,z1.d,z20.d
	// One pass (column + diagonal round) done; x6 holds the passes left.
	sub	x6,x6,1
	cbnz	x6,10b
	// Feed-forward: add the original input state back onto the state
	// produced by the round loop, register by register.
	//
	// z25..z30, z21..z24 and z16 are expected to still hold the copies of
	// the input words made before the loop.  The remaining input words are
	// broadcast again here from x28, x29 and x30 into z17..z20 and z25,
	// with x6 reused as a scratch register for the upper halves.
	//
	// The ".if mixin == 1" regions do the same for the scalar block: add the
	// input word, pack two 32-bit results into one 64-bit register, and load
	// the next 64 input bytes from [x1] into the registers freed by packing.
	lsr	x6,x28,#32
.inst	0x05a03b91	//dup z17.s,w28
.inst	0x05a038d2	//dup z18.s,w6
	lsr	x6,x29,#32
.inst	0x05a038d3	//dup z19.s,w6
	lsr	x6,x30,#32
.if	mixin == 1
	add	w7,w7,w23
.endif
	// z25 is consumed here as the copy of the first input word; it is
	// overwritten further down, so this add has to come first.
.inst	0x04b90000	//add z0.s,z0.s,z25.s
.if	mixin == 1
	add	x8,x8,x23,lsr #32
.endif
.inst	0x04ba0084	//add z4.s,z4.s,z26.s
.if	mixin == 1
	add	x7,x7,x8,lsl #32  // pack
.endif
.if	mixin == 1
	add	w9,w9,w24
.endif
.inst	0x04bb0108	//add z8.s,z8.s,z27.s
.if	mixin == 1
	add	x10,x10,x24,lsr #32
.endif
.inst	0x04bc018c	//add z12.s,z12.s,z28.s
.if	mixin == 1
	add	x9,x9,x10,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x8,x10,[x1],#16
.endif
.if	mixin == 1
	add	w11,w11,w25
.endif
.inst	0x04bd0021	//add z1.s,z1.s,z29.s
.if	mixin == 1
	add	x12,x12,x25,lsr #32
.endif
.inst	0x04be00a5	//add z5.s,z5.s,z30.s
.if	mixin == 1
	add	x11,x11,x12,lsl #32  // pack
.endif
.if	mixin == 1
	add	w13,w13,w26
.endif
.inst	0x04b50129	//add z9.s,z9.s,z21.s
.if	mixin == 1
	add	x14,x14,x26,lsr #32
.endif
.inst	0x04b601ad	//add z13.s,z13.s,z22.s
.if	mixin == 1
	add	x13,x13,x14,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x12,x14,[x1],#16
.endif
.if	mixin == 1
	add	w15,w15,w27
.endif
.inst	0x04b70042	//add z2.s,z2.s,z23.s
.if	mixin == 1
	add	x16,x16,x27,lsr #32
.endif
.inst	0x04b800c6	//add z6.s,z6.s,z24.s
.if	mixin == 1
	add	x15,x15,x16,lsl #32  // pack
.endif
.if	mixin == 1
	add	w17,w17,w28
.endif
.inst	0x04b1014a	//add z10.s,z10.s,z17.s
.if	mixin == 1
	add	x18,x18,x28,lsr #32
.endif
.inst	0x04b201ce	//add z14.s,z14.s,z18.s
.if	mixin == 1
	add	x17,x17,x18,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x16,x18,[x1],#16
.endif
	// z20 = low word of x30; z25 = upper word of x30 (still in w6), reusing
	// z25 now that its earlier contents have been added to z0.
.inst	0x05a03bd4	//dup z20.s,w30
.inst	0x05a038d9	//dup z25.s,w6	// bak[15] not available for SVE
.if	mixin == 1
	add	w19,w19,w29
.endif
	// z16 holds the per-lane counter values, so each lane gets its own
	// counter added back.
.inst	0x04b00063	//add z3.s,z3.s,z16.s
.if	mixin == 1
	add	x20,x20,x29,lsr #32
.endif
.inst	0x04b300e7	//add z7.s,z7.s,z19.s
.if	mixin == 1
	add	x19,x19,x20,lsl #32  // pack
.endif
.if	mixin == 1
	add	w21,w21,w30
.endif
.inst	0x04b4016b	//add z11.s,z11.s,z20.s
.if	mixin == 1
	add	x22,x22,x30,lsr #32
.endif
.inst	0x04b901ef	//add z15.s,z15.s,z25.s
.if	mixin == 1
	add	x21,x21,x22,lsl #32  // pack
.endif
.if	mixin == 1
	ldp	x20,x22,[x1],#16
.endif
	// Big-endian builds only: byte-reverse every 32-bit word of the result
	// (revb on the vectors, rev on the packed scalar registers).
	// NOTE(review): the scalar rev instructions below are not wrapped in
	// ".if mixin == 1", so they are assembled even when mixin is 0 and the
	// scalar registers carry no block data - confirm this is harmless.
#ifdef	__AARCH64EB__
	rev	x7,x7
.inst	0x05a48000	//revb z0.s,p0/m,z0.s
.inst	0x05a48084	//revb z4.s,p0/m,z4.s
	rev	x9,x9
.inst	0x05a48108	//revb z8.s,p0/m,z8.s
.inst	0x05a4818c	//revb z12.s,p0/m,z12.s
	rev	x11,x11
.inst	0x05a48021	//revb z1.s,p0/m,z1.s
.inst	0x05a480a5	//revb z5.s,p0/m,z5.s
	rev	x13,x13
.inst	0x05a48129	//revb z9.s,p0/m,z9.s
.inst	0x05a481ad	//revb z13.s,p0/m,z13.s
	rev	x15,x15
.inst	0x05a48042	//revb z2.s,p0/m,z2.s
.inst	0x05a480c6	//revb z6.s,p0/m,z6.s
	rev	x17,x17
.inst	0x05a4814a	//revb z10.s,p0/m,z10.s
.inst	0x05a481ce	//revb z14.s,p0/m,z14.s
	rev	x19,x19
.inst	0x05a48063	//revb z3.s,p0/m,z3.s
.inst	0x05a480e7	//revb z7.s,p0/m,z7.s
	rev	x21,x21
.inst	0x05a4816b	//revb z11.s,p0/m,z11.s
.inst	0x05a481ef	//revb z15.s,p0/m,z15.s
#endif
	// The scalar block, when present, consumed one counter value.
.if	mixin == 1
	add	x29,x29,#1
.endif
	// x5 holds the number of 32-bit lanes per vector (set by cntw at
	// function entry).  With exactly four lanes the vectors are 128 bits
	// wide and this path is used; any other width branches to 200.
	// NOTE(review): the function entry returns early when SVE2 is absent and
	// x5 <= 4.  If this copy of the body is only reached on that non-SVE2
	// route, the fall-through path below is never executed - confirm.
	cmp	x5,4
	b.ne	200f
	// The ".if mixin == 1" regions XOR the packed scalar block (x7..x21)
	// with the input words loaded earlier and later store the result to
	// [x0]; they are interleaved with the vector work.
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.if	mixin == 1
	eor	x11,x11,x12
.endif
// 4x4 transposes (32-bit zip followed by 64-bit zip, z17..z24 as
// temporaries).  Before: each register holds one state word for four
// blocks.  After: each register holds four consecutive state words of one
// block, with z0..z3 forming the first block, z4..z7 the second, z8..z11
// the third and z12..z15 the fourth.
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x13,x13,x14
.endif
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
	// Load 128 input bytes from [x1] with Advanced SIMD ld1 (the v registers
	// are the low 128 bits of the z registers, which is the whole vector on
	// this path) and XOR them into the first two blocks.
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23021	//eor z1.d,z1.d,z18.d
.inst	0x04b33042	//eor z2.d,z2.d,z19.d
.inst	0x04b43063	//eor z3.d,z3.d,z20.d
.inst	0x04b53084	//eor z4.d,z4.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b730c6	//eor z6.d,z6.d,z23.d
.inst	0x04b830e7	//eor z7.d,z7.d,z24.d
	// Next 128 input bytes, XORed into the third and fourth blocks.
	ld1	{v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
	ld1	{v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13108	//eor z8.d,z8.d,z17.d
.inst	0x04b23129	//eor z9.d,z9.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b4316b	//eor z11.d,z11.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b5318c	//eor z12.d,z12.d,z21.d
.inst	0x04b631ad	//eor z13.d,z13.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b731ce	//eor z14.d,z14.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
	// Store the four finished 64-byte blocks to [x0], advancing x0.
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	st1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	b	210f
200:
// Path for vectors that do not have exactly four 32-bit lanes (reached via
// "b.ne 200f").  The state is regrouped with two passes of 4x4 transposes
// (32-bit zip then 64-bit zip, z17..z24 as temporaries) so that the register
// order z0,z4,z8,z12, z1,z5,z9,z13, z2,z6,z10,z14, z3,z7,z11,z15 matches
// consecutive vector-length chunks of the data stream; that is the order
// used by the XORs and stores below.
//
// Pass 1: transpose within (z0..z3), (z4..z7), (z8..z11), (z12..z15).
.inst	0x05a16011	//zip1 z17.s,z0.s,z1.s
.inst	0x05a16412	//zip2 z18.s,z0.s,z1.s
.inst	0x05a36053	//zip1 z19.s,z2.s,z3.s
.inst	0x05a36454	//zip2 z20.s,z2.s,z3.s

.inst	0x05a56095	//zip1 z21.s,z4.s,z5.s
.inst	0x05a56496	//zip2 z22.s,z4.s,z5.s
.inst	0x05a760d7	//zip1 z23.s,z6.s,z7.s
.inst	0x05a764d8	//zip2 z24.s,z6.s,z7.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36621	//zip2 z1.d,z17.d,z19.d
.inst	0x05f46242	//zip1 z2.d,z18.d,z20.d
.inst	0x05f46643	//zip2 z3.d,z18.d,z20.d

.inst	0x05f762a4	//zip1 z4.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c6	//zip1 z6.d,z22.d,z24.d
.inst	0x05f866c7	//zip2 z7.d,z22.d,z24.d
// The ".if mixin == 1" regions XOR the packed scalar block (x7..x21) with
// the input words loaded earlier; they are interleaved with the vector work.
.if	mixin == 1
	eor	x7,x7,x8
.endif
.if	mixin == 1
	eor	x9,x9,x10
.endif
.inst	0x05a96111	//zip1 z17.s,z8.s,z9.s
.inst	0x05a96512	//zip2 z18.s,z8.s,z9.s
.inst	0x05ab6153	//zip1 z19.s,z10.s,z11.s
.inst	0x05ab6554	//zip2 z20.s,z10.s,z11.s

.inst	0x05ad6195	//zip1 z21.s,z12.s,z13.s
.inst	0x05ad6596	//zip2 z22.s,z12.s,z13.s
.inst	0x05af61d7	//zip1 z23.s,z14.s,z15.s
.inst	0x05af65d8	//zip2 z24.s,z14.s,z15.s

.inst	0x05f36228	//zip1 z8.d,z17.d,z19.d
.inst	0x05f36629	//zip2 z9.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664b	//zip2 z11.d,z18.d,z20.d

.inst	0x05f762ac	//zip1 z12.d,z21.d,z23.d
.inst	0x05f766ad	//zip2 z13.d,z21.d,z23.d
.inst	0x05f862ce	//zip1 z14.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x11,x11,x12
.endif
.if	mixin == 1
	eor	x13,x13,x14
.endif
// Pass 2: transpose within (z0,z4,z8,z12), (z1,z5,z9,z13),
// (z2,z6,z10,z14) and (z3,z7,z11,z15).
.inst	0x05a46011	//zip1 z17.s,z0.s,z4.s
.inst	0x05a46412	//zip2 z18.s,z0.s,z4.s
.inst	0x05ac6113	//zip1 z19.s,z8.s,z12.s
.inst	0x05ac6514	//zip2 z20.s,z8.s,z12.s

.inst	0x05a56035	//zip1 z21.s,z1.s,z5.s
.inst	0x05a56436	//zip2 z22.s,z1.s,z5.s
.inst	0x05ad6137	//zip1 z23.s,z9.s,z13.s
.inst	0x05ad6538	//zip2 z24.s,z9.s,z13.s

.inst	0x05f36220	//zip1 z0.d,z17.d,z19.d
.inst	0x05f36624	//zip2 z4.d,z17.d,z19.d
.inst	0x05f46248	//zip1 z8.d,z18.d,z20.d
.inst	0x05f4664c	//zip2 z12.d,z18.d,z20.d

.inst	0x05f762a1	//zip1 z1.d,z21.d,z23.d
.inst	0x05f766a5	//zip2 z5.d,z21.d,z23.d
.inst	0x05f862c9	//zip1 z9.d,z22.d,z24.d
.inst	0x05f866cd	//zip2 z13.d,z22.d,z24.d
.if	mixin == 1
	eor	x15,x15,x16
.endif
.if	mixin == 1
	eor	x17,x17,x18
.endif
.inst	0x05a66051	//zip1 z17.s,z2.s,z6.s
.inst	0x05a66452	//zip2 z18.s,z2.s,z6.s
.inst	0x05ae6153	//zip1 z19.s,z10.s,z14.s
.inst	0x05ae6554	//zip2 z20.s,z10.s,z14.s

.inst	0x05a76075	//zip1 z21.s,z3.s,z7.s
.inst	0x05a76476	//zip2 z22.s,z3.s,z7.s
.inst	0x05af6177	//zip1 z23.s,z11.s,z15.s
.inst	0x05af6578	//zip2 z24.s,z11.s,z15.s

.inst	0x05f36222	//zip1 z2.d,z17.d,z19.d
.inst	0x05f36626	//zip2 z6.d,z17.d,z19.d
.inst	0x05f4624a	//zip1 z10.d,z18.d,z20.d
.inst	0x05f4664e	//zip2 z14.d,z18.d,z20.d

.inst	0x05f762a3	//zip1 z3.d,z21.d,z23.d
.inst	0x05f766a7	//zip2 z7.d,z21.d,z23.d
.inst	0x05f862cb	//zip1 z11.d,z22.d,z24.d
.inst	0x05f866cf	//zip2 z15.d,z22.d,z24.d
.if	mixin == 1
	eor	x19,x19,x20
.endif
.if	mixin == 1
	eor	x21,x21,x22
.endif
// Load the first eight vectors of input from [x1] (offsets are in multiples
// of the vector length), advance x1 by eight vector lengths, and XOR them
// into the first half of the output order.
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
.inst	0x04b13000	//eor z0.d,z0.d,z17.d
.inst	0x04b23084	//eor z4.d,z4.d,z18.d
.inst	0x04b33108	//eor z8.d,z8.d,z19.d
.inst	0x04b4318c	//eor z12.d,z12.d,z20.d
.inst	0x04b53021	//eor z1.d,z1.d,z21.d
.inst	0x04b630a5	//eor z5.d,z5.d,z22.d
.inst	0x04b73129	//eor z9.d,z9.d,z23.d
.inst	0x04b831ad	//eor z13.d,z13.d,z24.d
// Second eight vectors of input, XORed into the second half.
.inst	0xa540a031	//ld1w {z17.s},p0/z,[x1,#0,MUL VL]
.inst	0xa541a032	//ld1w {z18.s},p0/z,[x1,#1,MUL VL]
.inst	0xa542a033	//ld1w {z19.s},p0/z,[x1,#2,MUL VL]
.inst	0xa543a034	//ld1w {z20.s},p0/z,[x1,#3,MUL VL]
.inst	0xa544a035	//ld1w {z21.s},p0/z,[x1,#4,MUL VL]
.inst	0xa545a036	//ld1w {z22.s},p0/z,[x1,#5,MUL VL]
.inst	0xa546a037	//ld1w {z23.s},p0/z,[x1,#6,MUL VL]
.inst	0xa547a038	//ld1w {z24.s},p0/z,[x1,#7,MUL VL]
.inst	0x04215101	//addvl x1,x1,8
// In the mixin == 1 variant the scalar block is written to [x0] first, so
// it precedes the vector output in memory.
.if	mixin == 1
	stp	x7,x9,[x0],#16
.endif
.inst	0x04b13042	//eor z2.d,z2.d,z17.d
.inst	0x04b230c6	//eor z6.d,z6.d,z18.d
.if	mixin == 1
	stp	x11,x13,[x0],#16
.endif
.inst	0x04b3314a	//eor z10.d,z10.d,z19.d
.inst	0x04b431ce	//eor z14.d,z14.d,z20.d
.if	mixin == 1
	stp	x15,x17,[x0],#16
.endif
.inst	0x04b53063	//eor z3.d,z3.d,z21.d
.inst	0x04b630e7	//eor z7.d,z7.d,z22.d
.if	mixin == 1
	stp	x19,x21,[x0],#16
.endif
.inst	0x04b7316b	//eor z11.d,z11.d,z23.d
.inst	0x04b831ef	//eor z15.d,z15.d,z24.d
// Store all sixteen result vectors to [x0] in the same order, eight at a
// time, advancing x0 by eight vector lengths after each group.
.inst	0xe540e000	//st1w {z0.s},p0,[x0,#0,MUL VL]
.inst	0xe541e004	//st1w {z4.s},p0,[x0,#1,MUL VL]
.inst	0xe542e008	//st1w {z8.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00c	//st1w {z12.s},p0,[x0,#3,MUL VL]
.inst	0xe544e001	//st1w {z1.s},p0,[x0,#4,MUL VL]
.inst	0xe545e005	//st1w {z5.s},p0,[x0,#5,MUL VL]
.inst	0xe546e009	//st1w {z9.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00d	//st1w {z13.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
.inst	0xe540e002	//st1w {z2.s},p0,[x0,#0,MUL VL]
.inst	0xe541e006	//st1w {z6.s},p0,[x0,#1,MUL VL]
.inst	0xe542e00a	//st1w {z10.s},p0,[x0,#2,MUL VL]
.inst	0xe543e00e	//st1w {z14.s},p0,[x0,#3,MUL VL]
.inst	0xe544e003	//st1w {z3.s},p0,[x0,#4,MUL VL]
.inst	0xe545e007	//st1w {z7.s},p0,[x0,#5,MUL VL]
.inst	0xe546e00b	//st1w {z11.s},p0,[x0,#6,MUL VL]
.inst	0xe547e00f	//st1w {z15.s},p0,[x0,#7,MUL VL]
.inst	0x04205100	//addvl x0,x0,8
210:
// Both output paths join here.  Advance the block counter in x29 by the
// number of 32-bit lanes per vector, i.e. one per block produced by the
// vector code.
.inst	0x04b0e3fd	//incw x29, ALL, MUL #1
110:
2:
	// Write the low 32 bits of the updated counter back to [x4], the
	// location x29 was loaded from in the prologue.  This has to happen
	// before x29 is restored below.
	str	w29,[x4]
	// Restore the registers saved by the prologue from the 192-byte frame;
	// the final post-indexed ldp also releases the frame.
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x16,x17,[sp,64]
	ldp	x18,x19,[sp,80]
	ldp	x20,x21,[sp,96]
	ldp	x22,x23,[sp,112]
	ldp	x24,x25,[sp,128]
	ldp	x26,x27,[sp,144]
	ldp	x28,x29,[sp,160]
	ldr	x30,[sp,176]
	ldp	d8,d9,[sp],192
	AARCH64_VALIDATE_LINK_REGISTER
// .Lreturn is also the target of the early exits at function entry, which
// branch here before any frame has been set up.
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
