// Source listing metadata: ArmAsm, 2034 lines, 53 KiB
// ---------------------------------------------------------------------------
// push_v_regs
//
// Spills q8-q15 and x8-x30 to the stack and leaves sp lowered by 320 bytes
// (4 Q-register pairs * 32 bytes + 12 X-register pairs * 16 bytes).
// The frame is reserved once and then filled with fixed-offset STPs; the
// resulting memory image and final sp are the ones pop_v_regs unwinds.
//
// Frame layout, offsets relative to sp after the macro:
//   +288 q8,q9     +256 q10,q11   +224 q12,q13   +192 q14,q15
//   +176 x8,x9     +160 x10,x11   +144 x12,x13   +128 x14,x15
//   +112 x16,x17   + 96 x18,x19   + 80 x20,x21   + 64 x22,x23
//   + 48 x24,x25   + 32 x26,x27   + 16 x28,x29   +  0 x30,x29
//
// x29 is written to two slots (+24 and +8); both hold the same value.
// Clobbers: sp only. Condition flags are not touched.
//
// The Q registers are stored pairwise with STP; the four-register form
//   st1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp, #-64]!
//   st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp, #-64]!
// stays disabled (ST1 offers no pre-indexed writeback addressing).
// ---------------------------------------------------------------------------
.macro push_v_regs
    sub             sp, sp, #320            // reserve the whole save area

    // SIMD registers, full 128-bit contents
    stp             q8, q9, [sp, #288]
    stp             q10, q11, [sp, #256]
    stp             q12, q13, [sp, #224]
    stp             q14, q15, [sp, #192]

    // general-purpose registers
    stp             x8, x9, [sp, #176]
    stp             x10, x11, [sp, #160]
    stp             x12, x13, [sp, #144]
    stp             x14, x15, [sp, #128]
    stp             x16, x17, [sp, #112]
    stp             x18, x19, [sp, #96]
    stp             x20, x21, [sp, #80]
    stp             x22, x23, [sp, #64]
    stp             x24, x25, [sp, #48]
    stp             x26, x27, [sp, #32]
    stp             x28, x29, [sp, #16]
    stp             x30, x29, [sp]          // link register; x29 repeated
.endm
|
|
|
|
// ---------------------------------------------------------------------------
// pop_v_regs
//
// Reloads x8-x30 and q8-q15 from the 320-byte save area that starts at sp
// (12 X-register pairs * 16 bytes + 4 Q-register pairs * 32 bytes), then
// raises sp by 320. Mirrors the frame written by push_v_regs.
//
// Every slot is read with a fixed offset while sp still covers the frame;
// sp is adjusted once at the end.
//
// x29 is loaded twice: first from the x30 pair at +8, then from the x28 pair
// at +24. The second load determines the final value.
// Clobbers: x8-x30, q8-q15, sp. Condition flags are not touched.
//
// The four-register alternative remains disabled:
//   ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
//   ld1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp], #64
// ---------------------------------------------------------------------------
.macro pop_v_regs
    // general-purpose registers
    ldp             x30, x29, [sp]
    ldp             x28, x29, [sp, #16]     // final x29 comes from here
    ldp             x26, x27, [sp, #32]
    ldp             x24, x25, [sp, #48]
    ldp             x22, x23, [sp, #64]
    ldp             x20, x21, [sp, #80]
    ldp             x18, x19, [sp, #96]
    ldp             x16, x17, [sp, #112]
    ldp             x14, x15, [sp, #128]
    ldp             x12, x13, [sp, #144]
    ldp             x10, x11, [sp, #160]
    ldp             x8, x9, [sp, #176]

    // SIMD registers, full 128-bit contents
    ldp             q14, q15, [sp, #192]
    ldp             q12, q13, [sp, #224]
    ldp             q10, q11, [sp, #256]
    ldp             q8, q9, [sp, #288]

    add             sp, sp, #320            // release the whole save area
.endm
|
|
|
|
|
|
.text
|
|
.p2align 2
|
|
.global ixheaacd_post_twid_overlap_add_armv8
|
|
|
|
ixheaacd_post_twid_overlap_add_armv8:
|
|
|
|
// STMFD sp!, {x4-x12}
|
|
push_v_regs
|
|
//stp x19, x20,[sp,#-16]!
|
|
//VPUSH {d8 - d15}
|
|
|
|
//LDR w4, [sp, #100]
|
|
//sxtw x4,w4
|
|
//LDR w5, [sp, #104]
|
|
//sxtw x5,w5
|
|
//LDR w6, [sp, #108]
|
|
//sxtw x6,w6
|
|
MOV x16, x5
|
|
MOV x17, x7
|
|
LSL x9, x3, #2
|
|
ASR x9, x9, #1
|
|
ADD x6, x6, x9
|
|
SUB x6, x6, #4
|
|
|
|
MOV w8, #7500
|
|
sxtw x8, w8
|
|
ADD x2, x2, x8
|
|
|
|
|
|
|
|
movi v18.4h, #50
|
|
sub x20, x5, #15
|
|
neg x9, x20
|
|
movi v20.4s, #0x80, LSL #8
|
|
dup v16.4s, w5
|
|
SUB x5, x5, #16
|
|
//STR w5, [sp, #116]
|
|
MOV w25, w5
|
|
sxtw x25, w25
|
|
MOV x8, #1
|
|
LSL x8, x8, x9
|
|
//STR w8, [sp, #120]
|
|
MOV w26, w8
|
|
|
|
//sxtw x8,w8
|
|
|
|
|
|
ARM_PROLOGUE:
|
|
|
|
|
|
LDR w8, [x1], #4
|
|
sxtw x8, w8
|
|
LDR w9, [x1], #4
|
|
sxtw x9, w9
|
|
|
|
LDR w10, [x2], #4
|
|
sxtw x10, w10
|
|
|
|
AND w19, w10, 0xFFFF
|
|
sxth x19, w19
|
|
ASR w10, w10, #16
|
|
// SMULWT x11, x8, x10
|
|
//
|
|
// SMULWB x12, x9, x10
|
|
// SMULWB x5, x8, x10
|
|
// SMLAWT x7, x9, x10, x5
|
|
|
|
SMULL x11, w8, w10
|
|
ASR x11, x11, #16
|
|
SMULL x12, w9, w19
|
|
ASR x12, x12, #16
|
|
SMULL x5, w8, w19
|
|
ASR x5, x5, #16
|
|
SMULL x7, w9, w10
|
|
ASR x7, x7, #16
|
|
ADD x7, x7, x5
|
|
|
|
SUB x8, x12, x11
|
|
MVN x5, x7
|
|
ADD x5, x5, #1
|
|
|
|
|
|
MOV x9, #50
|
|
MOV x12, #-50
|
|
AND w19, w9, 0xFFFF
|
|
sxth x19, w19
|
|
SMULL x10, w5, w19
|
|
ASR x10, x10, #16
|
|
AND w19, w12, 0xFFFF
|
|
sxth x19, w19
|
|
SMULL x11, w8, w19
|
|
ASR x11, x11, #16
|
|
|
|
ADD x8, x8, x10
|
|
ADD x5, x5, x11
|
|
|
|
//LDR w11, [sp, #104]
|
|
MOV w11, w16
|
|
sxth x11, w11
|
|
LDR w10, [x6], #-32
|
|
sxtw x10, w10
|
|
|
|
AND w19, w10, 0xFFFF
|
|
sxth x19, w19
|
|
ASR w20, w10, #16
|
|
|
|
//SMULWB x7, x8, x10
|
|
SMULL x7, w8, w19
|
|
ASR x7, x7, #16
|
|
MVN x8, x8
|
|
ADD x8, x8, #1
|
|
//SMULWT x12, x8, x10
|
|
SMULL x12, w8, w20
|
|
ASR x12, x12, #16
|
|
|
|
CMP x11, #0
|
|
BLT NEXT
|
|
|
|
SUB x9, x11, #16
|
|
negs x9, x9
|
|
|
|
|
|
|
|
|
|
// LDR w8, [sp, #120]
|
|
//sxtw x8,w8
|
|
MOV v1.s[0], w26
|
|
MOV v2.s[0], w5
|
|
|
|
//sQADD w5, w5, w8
|
|
//ASR w5, w5, w9
|
|
|
|
SQADD v2.2s, v2.2s, v1.2s
|
|
MOV w5, v2.s[0]
|
|
ASR w5, w5, w9
|
|
|
|
SUB x9, x11, #31
|
|
negs x9, x9
|
|
ASR x20, x7, x9
|
|
//MOV x8, x20
|
|
ADDS x8, x20, #0
|
|
BGE NEXT2
|
|
CMN x8, #1
|
|
NEXT2:
|
|
MOV x20, #0x80000000
|
|
csel x7, x20, x7, LT
|
|
MOV x20, #0x7fffffff
|
|
csel x7, x20, x7, GT
|
|
LSL x20, x7, x11
|
|
csel x7, x20, x7, EQ
|
|
|
|
SUB x9, x11, #31
|
|
negs x9, x9
|
|
ASR x20, x12, x9
|
|
//MOV x8, x20
|
|
ADDS x8, x20, #0
|
|
BGE NEXT3
|
|
CMN x8, #1
|
|
NEXT3:
|
|
MOV x20, #0x80000000
|
|
csel x12, x20, x12, LT
|
|
MOV x20, #0x7fffffff
|
|
csel x12, x20, x12, GT
|
|
LSL x20, x12, x11
|
|
csel x12, x20, x12, EQ
|
|
|
|
B NEXT1
|
|
NEXT:
|
|
MVN w11, w11
|
|
ADD w11, w11, #1
|
|
ASR w5, w5, w11
|
|
MOV w8, #0x8000
|
|
|
|
MOV v1.s[0], w8
|
|
MOV v2.s[0], w5
|
|
|
|
//QADD x5, x5, x8
|
|
|
|
SQADD v2.2s, v2.2s, v1.2s
|
|
MOV w5, v2.s[0]
|
|
|
|
ASR w5, w5, #16
|
|
ASR w7, w7, w11
|
|
ASR w12, w12, w11
|
|
|
|
NEXT1:
|
|
LDR w9, [x4]
|
|
sxtw x9, w9
|
|
MOV w8, #0x8000
|
|
//sxtw x8,w8
|
|
|
|
STR w5, [x4], #4
|
|
sxtw x5, w5
|
|
|
|
|
|
ROR w20, w10, #16
|
|
//UXTH x5, x10, ROR #16
|
|
UXTH w5, w20
|
|
UXTH w10, w10
|
|
|
|
|
|
dup v0.2s, w9
|
|
dup v2.2s, w10
|
|
dup v3.2s, w5
|
|
//VZIP.32 D2, D3
|
|
ZIP1 v28.2s, v2.2s, v3.2s
|
|
ZIP2 v3.2s, v2.2s, v3.2s
|
|
MOV v2.8b, v28.8b
|
|
sMULL v0.2d, v2.2s, v0.2s
|
|
Sqxtn v8.2s, v0.2d
|
|
|
|
|
|
dup v0.2s, w12
|
|
dup v1.2s, w7
|
|
|
|
//VZIP.32 D0, D1
|
|
|
|
ZIP1 v28.2s, v0.2s, v1.2s
|
|
ZIP2 v1.2s, v0.2s, v1.2s
|
|
MOV v0.8b, v28.8b
|
|
|
|
SQSUB v8.2s, v0.2s , v8.2s
|
|
|
|
|
|
sQshL v8.2s, v8.2s, #2
|
|
dup v0.2s, w8
|
|
SQADD v8.2s, v8.2s , v0.2s
|
|
sshR v8.2s, v8.2s, #16
|
|
|
|
|
|
|
|
MOV x7, x17
|
|
//sxtw x7,w7
|
|
LSL x10, x7, #1
|
|
|
|
ASR x5, x3, #1
|
|
//SMULBB x5, x10, x5
|
|
AND w5, w5, 0xFFFF
|
|
sxth x5, w5
|
|
AND w19, w10, 0xFFFF
|
|
sxth x19, w19
|
|
SMULL x5, w19, w5
|
|
|
|
ADD x5, x5, x0
|
|
SUB x0, x5, x10
|
|
MVN x9, x10
|
|
ADD x9, x9, #1
|
|
|
|
ST1 {v8.h}[2], [x0], x9
|
|
ST1 {v8.h}[0], [x5], x10
|
|
|
|
|
|
MOV x8, x1
|
|
LSL x12, x3, #2
|
|
|
|
ADD x1, x1, x12
|
|
|
|
SUB x1, x1, #40
|
|
|
|
MOV x12, #-32
|
|
|
|
|
|
|
|
PROLOGUE_NEON:
|
|
|
|
ASR x3, x3, #2
|
|
SUB x3, x3, #4
|
|
ASR x3, x3, #2
|
|
SUB x3, x3, #2
|
|
|
|
LD2 { v0.4s, v1.4s}, [x1]
|
|
MOV v2.16b, v1.16b
|
|
ADD x1, x1, x12
|
|
|
|
//VUZP.16 D0, D1
|
|
UZP1 v28.8h, v0.8h, v0.8h
|
|
UZP2 v29.8h, v0.8h, v0.8h
|
|
MOV v0.d[0], v28.d[0]
|
|
MOV v0.d[1], v29.d[0]
|
|
|
|
//VUZP.16 D2, D3
|
|
|
|
UZP1 v28.8h, v2.8h, v2.8h
|
|
UZP2 v29.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v28.d[0]
|
|
MOV v2.d[1], v29.d[0]
|
|
|
|
|
|
//rev64 v0.8h, v0.8h
|
|
rev64 v0.8h, v0.8h
|
|
MOV v1.d[0], v0.d[1]
|
|
rev64 v2.8h, v2.8h
|
|
MOV v3.d[0], v2.d[1]
|
|
LD2 {v8.4h, v9.4h}, [x2]
|
|
ADD x2, x2, #16
|
|
|
|
LD2 { v4.4s, v5.4s}, [x8]
|
|
MOV v6.16b, v5.16b
|
|
ADD x8, x8, #32
|
|
uMULL v30.4s, v0.4h, v9.4h
|
|
|
|
// VUZP.16 D4, D5
|
|
|
|
UZP1 v28.8h, v4.8h, v4.8h
|
|
UZP2 v29.8h, v4.8h, v4.8h
|
|
MOV v4.d[0], v28.d[0]
|
|
MOV v5.d[0], v29.d[0]
|
|
|
|
uMULL v28.4s, v2.4h, v8.4h
|
|
|
|
// VUZP.16 D6, D7
|
|
UZP1 v26.8h, v6.8h, v6.8h
|
|
UZP2 v27.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v26.d[0]
|
|
MOV v7.d[0], v27.d[0]
|
|
|
|
uMULL v26.4s, v0.4h, v8.4h
|
|
|
|
|
|
uMULL v24.4s, v2.4h, v9.4h
|
|
|
|
LD2 { v10.4s, v11.4s}, [x6]
|
|
MOV v12.16b, v11.16b
|
|
ADD x6, x6, x12
|
|
ushR v30.4s, v30.4s, #16
|
|
|
|
//VUZP.16 D10, D11
|
|
|
|
UZP1 v22.8h, v10.8h, v10.8h
|
|
UZP2 v23.8h, v10.8h, v10.8h
|
|
MOV v10.d[0], v22.d[0]
|
|
MOV v10.d[1], v23.d[0]
|
|
|
|
ushR v28.4s, v28.4s, #16
|
|
|
|
//VUZP.16 D12, D13
|
|
|
|
UZP1 v22.8h, v12.8h, v12.8h
|
|
UZP2 v23.8h, v12.8h, v12.8h
|
|
MOV v12.d[0], v22.d[0]
|
|
MOV v12.d[1], v23.d[0]
|
|
|
|
sMLAL v30.4s, v1.4h, v9.4h
|
|
|
|
rev64 v10.8h, v10.8h
|
|
MOV v11.d[0], v10.d[1]
|
|
sMLAL v28.4s, v3.4h, v8.4h
|
|
|
|
rev64 v12.8h, v12.8h
|
|
MOV v13.d[0], v12.d[1]
|
|
ushR v26.4s, v26.4s, #16
|
|
|
|
|
|
ushR v24.4s, v24.4s, #16
|
|
|
|
sMLAL v26.4s, v1.4h, v8.4h
|
|
sMLAL v24.4s, v3.4h, v9.4h
|
|
|
|
|
|
|
|
ADD v30.4s, v30.4s , v28.4s
|
|
NEG v30.4s, v30.4s
|
|
|
|
uMULL v22.4s, v4.4h, v8.4h
|
|
|
|
SUB v28.4s, v24.4s , v26.4s
|
|
|
|
|
|
mov v26.16b, v30.16b
|
|
mov v24.16b, v28.16b
|
|
|
|
// VUZP.16 D24, D25
|
|
|
|
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
|
|
// VUZP.16 D26, D27
|
|
|
|
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
uMULL v2.4s, v24.4h, v18.4h
|
|
|
|
uMULL v0.4s, v26.4h, v18.4h
|
|
|
|
ushR v22.4s, v22.4s, #16
|
|
sMLAL v22.4s, v5.4h, v8.4h
|
|
|
|
ushR v2.4s, v2.4s, #16
|
|
ushR v0.4s, v0.4s, #16
|
|
sMLAL v2.4s, v25.4h, v18.4h
|
|
sMLAL v0.4s, v27.4h, v18.4h
|
|
|
|
uMULL v24.4s, v4.4h, v9.4h
|
|
uMULL v26.4s, v6.4h, v8.4h
|
|
|
|
NEG v2.4s, v2.4s
|
|
ADD v28.4s, v28.4s , v0.4s
|
|
ADD v30.4s, v30.4s , v2.4s
|
|
|
|
uMULL v0.4s, v6.4h, v9.4h
|
|
sshR v24.4s, v24.4s, #16
|
|
sMLAL v24.4s, v5.4h, v9.4h
|
|
sshR v26.4s, v26.4s, #16
|
|
sshR v0.4s, v0.4s, #16
|
|
sMLAL v26.4s, v7.4h, v8.4h
|
|
sMLAL v0.4s, v7.4h, v9.4h
|
|
|
|
|
|
|
|
|
|
ADD v22.4s, v22.4s , v0.4s
|
|
NEG v22.4s, v22.4s
|
|
SUB v24.4s, v26.4s , v24.4s
|
|
|
|
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
MOV w11, w26
|
|
dup v14.4s, w11
|
|
SQADD v28.4s, v28.4s , v14.4s
|
|
//LDR w11, [sp, #116]
|
|
MOV w11, w25
|
|
//sxtw x11,w11
|
|
dup v0.4s, w11
|
|
sQshL v28.4s, v28.4s, v0.4s
|
|
|
|
mov v0.16b, v22.16b
|
|
mov v14.16b, v24.16b
|
|
|
|
|
|
// VUZP.16 D24, D25
|
|
|
|
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
|
|
// VUZP.16 D22, D23
|
|
|
|
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
uMULL v8.4s, v24.4h, v18.4h
|
|
uMULL v26.4s, v22.4h, v18.4h
|
|
|
|
NEG v2.4s, v30.4s
|
|
// VUZP.16 D30, D31
|
|
|
|
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
// VUZP.16 D2, D3
|
|
|
|
UZP1 v19.8h, v2.8h, v2.8h
|
|
UZP2 v21.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v19.d[0]
|
|
MOV v3.d[0], v21.d[0]
|
|
|
|
uMULL v4.4s, v30.4h, v12.4h
|
|
|
|
uMULL v6.4s, v2.4h, v13.4h
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
ushR v26.4s, v26.4s, #16
|
|
|
|
sMLAL v8.4s, v25.4h, v18.4h
|
|
sMLAL v26.4s, v23.4h, v18.4h
|
|
|
|
ushR v4.4s, v4.4s, #16
|
|
ushR v6.4s, v6.4s, #16
|
|
|
|
MOV v19.d[0], v30.d[1]
|
|
|
|
sMLAL v4.4s, v19.4h, v12.4h
|
|
sMLAL v6.4s, v3.4h, v13.4h
|
|
|
|
NEG v8.4s, v8.4s
|
|
ADD v14.4s, v14.4s , v26.4s
|
|
ADD v0.4s, v0.4s , v8.4s
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
MOV w11, w26
|
|
dup v8.4s, w11
|
|
SQADD v0.4s, v0.4s , v8.4s
|
|
//LDR w11, [sp, #116]
|
|
//sxtw x11,w11
|
|
MOV w11, w25
|
|
dup v26.4s, w11
|
|
sQshL v0.4s, v0.4s, v26.4s
|
|
|
|
mov v26.16b, v28.16b
|
|
|
|
LD2 { v28.4s, v29.4s}, [x4]
|
|
MOV v30.16b, v29.16b
|
|
MOV v29.d[0], v28.d[1]
|
|
// VZIP.32 Q13, Q0
|
|
|
|
ZIP1 v19.4s, v26.4s, v0.4s
|
|
ZIP2 v0.4s, v26.4s, v0.4s
|
|
MOV v26.16b, v19.16b
|
|
|
|
ST1 { v26.4s}, [x4], #16
|
|
ST1 { v0.4s}, [x4], #16
|
|
|
|
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D13, D1
|
|
|
|
SADDL v0.4s, v13.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
sMULL v26.2d, v28.2s, v0.2s
|
|
Sqxtn v8.2s, v26.2d
|
|
sMULL v26.2d, v29.2s, v1.2s
|
|
Sqxtn v9.2s, v26.2d
|
|
MOV v8.d[1], v9.d[0]
|
|
movi v1.2s, #0
|
|
// VADDL.S16 Q0, D12, D1
|
|
SADDL v0.4s, v12.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
sMULL v24.2d, v28.2s, v0.2s
|
|
Sqxtn v26.2s, v24.2d
|
|
sMULL v24.2d, v29.2s, v1.2s
|
|
Sqxtn v27.2s, v24.2d
|
|
MOV v26.d[1], v27.d[0]
|
|
|
|
sQshL v4.4s, v4.4s, v16.4s
|
|
sQshL v6.4s, v6.4s, v16.4s
|
|
|
|
SQSUB v4.4s, v4.4s , v8.4s
|
|
SQSUB v6.4s, v6.4s , v26.4s
|
|
|
|
NEG v26.4s, v14.4s
|
|
// VUZP.16 D14, D15
|
|
|
|
|
|
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
// VUZP.16 D26, D27
|
|
|
|
|
|
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
|
|
movi v1.2s, #0
|
|
// VADDL.S16 Q0, D10, D1
|
|
SADDL v0.4s, v10.4h, v1.4h
|
|
MOV v1.d[0], v0.d[0]
|
|
sMULL v22.2d, v30.2s, v0.2s
|
|
Sqxtn v24.2s, v22.2d
|
|
sMULL2 v22.2d, v30.4s, v0.4s
|
|
Sqxtn v25.2s, v22.2d
|
|
MOV v24.d[1], v25.d[0]
|
|
movi v1.2s, #0
|
|
// VADDL.S16 Q0, D11, D1
|
|
SADDL v0.4s, v11.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v8.2d, v30.2s, v0.2s
|
|
Sqxtn v22.2s, v8.2d
|
|
sMULL2 v8.2d, v30.4s, v0.4s
|
|
Sqxtn v23.2s, v8.2d
|
|
MOV v22.d[1], v23.d[0]
|
|
uMULL v8.4s, v26.4h, v11.4h
|
|
uMULL v30.4s, v14.4h, v10.4h
|
|
|
|
LD2 { v0.4s, v1.4s}, [x1]
|
|
MOV v2.16b, v1.16b
|
|
ADD x1, x1, x12
|
|
|
|
// VUZP.16 D0, D1
|
|
|
|
UZP1 v19.8h, v0.8h, v0.8h
|
|
UZP2 v21.8h, v0.8h, v0.8h
|
|
MOV v0.d[0], v19.d[0]
|
|
MOV v0.d[1], v21.d[0]
|
|
|
|
// VUZP.16 D2, D3
|
|
|
|
UZP1 v19.8h, v2.8h, v2.8h
|
|
UZP2 v21.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v19.d[0]
|
|
MOV v2.d[1], v21.d[0]
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
|
|
rev64 v0.8h, v0.8h
|
|
MOV v1.d[0], v0.d[1]
|
|
ushR v30.4s, v30.4s, #16
|
|
|
|
rev64 v2.8h, v2.8h
|
|
MOV v3.d[0], v2.d[1]
|
|
sMLAL v8.4s, v27.4h, v11.4h
|
|
|
|
sMLAL v30.4s, v15.4h, v10.4h
|
|
|
|
LD2 { v10.4s, v11.4s}, [x6]
|
|
ADD x6, x6, x12
|
|
MOV v12.16b, v11.16b
|
|
sQshL v4.4s, v4.4s, #2
|
|
|
|
// VUZP.16 D10, D11
|
|
|
|
UZP1 v19.8h, v10.8h, v10.8h
|
|
UZP2 v21.8h, v10.8h, v10.8h
|
|
MOV v10.d[0], v19.d[0]
|
|
MOV v10.d[1], v21.d[0]
|
|
|
|
sQshL v6.4s, v6.4s, #2
|
|
|
|
// VUZP.16 D12, D13
|
|
|
|
UZP1 v19.8h, v12.8h, v12.8h
|
|
UZP2 v21.8h, v12.8h, v12.8h
|
|
MOV v12.d[0], v19.d[0]
|
|
MOV v12.d[1], v21.d[0]
|
|
|
|
SQADD v14.4s, v4.4s , v20.4s
|
|
|
|
rev64 v10.8h, v10.8h
|
|
MOV v11.d[0], v10.d[1]
|
|
SQADD v6.4s, v6.4s , v20.4s
|
|
|
|
rev64 v12.8h, v12.8h
|
|
MOV v13.d[0], v12.d[1]
|
|
sshR v14.4s, v14.4s, #16
|
|
|
|
// VUZP.16 D14, D15
|
|
|
|
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
sshR v6.4s, v6.4s, #16
|
|
|
|
// VUZP.16 D6, D7
|
|
|
|
UZP1 v19.8h, v6.8h, v6.8h
|
|
UZP2 v21.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v19.d[0]
|
|
MOV v7.d[0], v21.d[0]
|
|
|
|
mov v15.8b, v6.8b
|
|
sQshL v8.4s, v8.4s, v16.4s
|
|
|
|
LD2 { v4.4s, v5.4s}, [x8]
|
|
ADD x8, x8, #32
|
|
MOV v6.16b, v5.16b
|
|
sQshL v30.4s, v30.4s, v16.4s
|
|
|
|
// VUZP.16 D4, D5
|
|
|
|
UZP1 v19.8h, v4.8h, v4.8h
|
|
UZP2 v21.8h, v4.8h, v4.8h
|
|
MOV v4.d[0], v19.d[0]
|
|
MOV v5.d[0], v21.d[0]
|
|
|
|
SQSUB v8.4s, v8.4s , v24.4s
|
|
|
|
// VUZP.16 D6, D7
|
|
|
|
UZP1 v19.8h, v6.8h, v6.8h
|
|
UZP2 v21.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v19.d[0]
|
|
MOV v7.d[0], v21.d[0]
|
|
|
|
SQSUB v22.4s, v30.4s , v22.4s
|
|
|
|
sQshL v30.4s, v8.4s, #2
|
|
|
|
LD2 {v8.4h, v9.4h}, [x2]
|
|
ADD x2, x2, #16
|
|
sQshL v22.4s, v22.4s, #2
|
|
|
|
SQADD v30.4s, v30.4s , v20.4s
|
|
SQADD v22.4s, v22.4s , v20.4s
|
|
|
|
sshR v30.4s, v30.4s, #16
|
|
|
|
// VUZP.16 D30, D31
|
|
|
|
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
sshR v22.4s, v22.4s, #16
|
|
|
|
|
|
// VUZP.16 D22, D23
|
|
|
|
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
|
|
mov v23.8b, v30.8b
|
|
|
|
CORE_LOOP:
|
|
ST1 {v14.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v30.4s, v0.4h, v9.4h
|
|
|
|
ST1 {v22.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v28.4s, v2.4h, v8.4h
|
|
|
|
ST1 {v14.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v26.4s, v0.4h, v8.4h
|
|
|
|
ST1 {v22.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v24.4s, v2.4h, v9.4h
|
|
|
|
ST1 {v14.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ushR v30.4s, v30.4s, #16
|
|
|
|
ST1 {v22.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ushR v28.4s, v28.4s, #16
|
|
|
|
ST1 {v14.h}[3], [x0]
|
|
ADD x0, x0, x9
|
|
sMLAL v30.4s, v1.4h, v9.4h
|
|
|
|
ST1 {v22.h}[3], [x0]
|
|
ADD x0, x0, x9
|
|
sMLAL v28.4s, v3.4h, v8.4h
|
|
|
|
ST1 {v15.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ushR v26.4s, v26.4s, #16
|
|
|
|
ST1 {v23.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ushR v24.4s, v24.4s, #16
|
|
|
|
ST1 {v15.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
sMLAL v26.4s, v1.4h, v8.4h
|
|
|
|
ST1 {v23.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
sMLAL v24.4s, v3.4h, v9.4h
|
|
|
|
ST1 {v15.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
ADD v30.4s, v30.4s , v28.4s
|
|
|
|
ST1 {v23.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
NEG v30.4s, v30.4s
|
|
|
|
ST1 {v15.h}[3], [x5]
|
|
ADD x5, x5, x10
|
|
|
|
ST1 {v23.h}[3], [x5]
|
|
ADD x5, x5, x10
|
|
SUB v28.4s, v24.4s , v26.4s
|
|
|
|
|
|
mov v26.16b, v30.16b
|
|
uMULL v22.4s, v4.4h, v8.4h
|
|
|
|
mov v24.16b, v28.16b
|
|
|
|
// VUZP.16 D24, D25
|
|
|
|
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
|
|
// VUZP.16 D26, D27
|
|
|
|
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
uMULL v2.4s, v24.4h, v18.4h
|
|
uMULL v0.4s, v26.4h, v18.4h
|
|
|
|
ushR v22.4s, v22.4s, #16
|
|
sMLAL v22.4s, v5.4h, v8.4h
|
|
|
|
ushR v2.4s, v2.4s, #16
|
|
ushR v0.4s, v0.4s, #16
|
|
sMLAL v2.4s, v25.4h, v18.4h
|
|
sMLAL v0.4s, v27.4h, v18.4h
|
|
|
|
uMULL v24.4s, v4.4h, v9.4h
|
|
uMULL v26.4s, v6.4h, v8.4h
|
|
|
|
NEG v2.4s, v2.4s
|
|
ADD v28.4s, v28.4s , v0.4s
|
|
ADD v30.4s, v30.4s , v2.4s
|
|
|
|
uMULL v0.4s, v6.4h, v9.4h
|
|
sshR v24.4s, v24.4s, #16
|
|
sMLAL v24.4s, v5.4h, v9.4h
|
|
sshR v26.4s, v26.4s, #16
|
|
sshR v0.4s, v0.4s, #16
|
|
sMLAL v26.4s, v7.4h, v8.4h
|
|
sMLAL v0.4s, v7.4h, v9.4h
|
|
|
|
|
|
|
|
ADD v22.4s, v22.4s , v0.4s
|
|
|
|
NEG v22.4s, v22.4s
|
|
SUB v24.4s, v26.4s , v24.4s
|
|
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
MOV w11, w26
|
|
dup v14.4s, w11
|
|
SQADD v28.4s, v28.4s , v14.4s
|
|
//LDR w11, [sp, #116]
|
|
//sxtw x11,w11
|
|
MOV w11, w25
|
|
dup v0.4s, w11
|
|
sQshL v28.4s, v28.4s, v0.4s
|
|
|
|
|
|
mov v0.16b, v22.16b
|
|
mov v14.16b, v24.16b
|
|
|
|
// VUZP.16 D24, D25
|
|
|
|
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
|
|
// VUZP.16 D22, D23
|
|
|
|
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
uMULL v8.4s, v24.4h, v18.4h
|
|
uMULL v26.4s, v22.4h, v18.4h
|
|
|
|
NEG v2.4s, v30.4s
|
|
|
|
// VUZP.16 D30, D31
|
|
|
|
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
|
|
// VUZP.16 D2, D3
|
|
|
|
UZP1 v19.8h, v2.8h, v2.8h
|
|
UZP2 v21.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v19.d[0]
|
|
MOV v3.d[0], v21.d[0]
|
|
|
|
uMULL v4.4s, v30.4h, v12.4h
|
|
uMULL v6.4s, v2.4h, v13.4h
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
ushR v26.4s, v26.4s, #16
|
|
|
|
sMLAL v8.4s, v25.4h, v18.4h
|
|
sMLAL v26.4s, v23.4h, v18.4h
|
|
|
|
ushR v4.4s, v4.4s, #16
|
|
ushR v6.4s, v6.4s, #16
|
|
|
|
MOV v19.d[0], v30.d[1]
|
|
|
|
sMLAL v4.4s, v19.4h, v12.4h
|
|
sMLAL v6.4s, v3.4h, v13.4h
|
|
|
|
NEG v8.4s, v8.4s
|
|
ADD v14.4s, v14.4s , v26.4s
|
|
ADD v0.4s, v0.4s , v8.4s
|
|
|
|
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
MOV w11, w26
|
|
dup v8.4s, w11
|
|
SQADD v0.4s, v0.4s , v8.4s
|
|
//LDR w11, [sp, #116]
|
|
//sxtw x11,w11
|
|
MOV w11, w25
|
|
dup v26.4s, w11
|
|
sQshL v0.4s, v0.4s, v26.4s
|
|
mov v26.16b, v28.16b
|
|
|
|
LD2 { v28.4s, v29.4s}, [x4]
|
|
MOV v30.16b, v29.16b
|
|
MOV v29.d[0], v28.d[1]
|
|
// VZIP.32 Q13, Q0
|
|
|
|
ZIP1 v19.4s, v26.4s, v0.4s
|
|
ZIP2 v0.4s, v26.4s, v0.4s
|
|
MOV v26.16b, v19.16b
|
|
|
|
ST1 { v26.4s}, [x4]
|
|
ADD x4, x4, #16
|
|
ST1 { v0.4s}, [x4]
|
|
ADD x4, x4, #16
|
|
|
|
movi v1.2s, #0
|
|
// VADDL.S16 Q0, D13, D1
|
|
SADDL v0.4s, v13.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v26.2d, v28.2s, v0.2s
|
|
Sqxtn v8.2s, v26.2d
|
|
sMULL v26.2d, v29.2s, v1.2s
|
|
Sqxtn v9.2s, v26.2d
|
|
MOV v8.d[1], v9.d[0]
|
|
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D12, D1
|
|
SADDL v0.4s, v12.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v24.2d, v28.2s, v0.2s
|
|
Sqxtn v26.2s, v24.2d
|
|
sMULL v24.2d, v29.2s, v1.2s
|
|
Sqxtn v27.2s, v24.2d
|
|
MOV v26.d[1], v27.d[0]
|
|
sQshL v4.4s, v4.4s, v16.4s
|
|
sQshL v6.4s, v6.4s, v16.4s
|
|
|
|
|
|
|
|
SQSUB v4.4s, v4.4s , v8.4s
|
|
SQSUB v6.4s, v6.4s , v26.4s
|
|
|
|
NEG v26.4s, v14.4s
|
|
// VUZP.16 D26, D27
|
|
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D10, D1
|
|
SADDL v0.4s, v10.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v22.2d, v30.2s, v0.2s
|
|
Sqxtn v24.2s, v22.2d
|
|
sMULL2 v22.2d, v30.4s, v0.4s
|
|
Sqxtn v25.2s, v22.2d
|
|
MOV v24.d[1], v25.d[0]
|
|
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D11, D1
|
|
SADDL v0.4s, v11.4h, v1.4h
|
|
|
|
sMULL v8.2d, v30.2s, v0.2s
|
|
Sqxtn v22.2s, v8.2d
|
|
sMULL2 v8.2d, v30.4s, v0.4s
|
|
Sqxtn v23.2s, v8.2d
|
|
MOV v22.d[1], v23.d[0]
|
|
|
|
// VUZP.16 D14, D15
|
|
|
|
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
uMULL v8.4s, v26.4h, v11.4h
|
|
uMULL v30.4s, v14.4h, v10.4h
|
|
|
|
|
|
LD2 { v0.4s, v1.4s}, [x1]
|
|
MOV v2.16b, v1.16b
|
|
ADD X1, X1, x12
|
|
|
|
// VUZP.16 D0, D1
|
|
UZP1 v19.8h, v0.8h, v0.8h
|
|
UZP2 v21.8h, v0.8h, v0.8h
|
|
MOV v0.d[0], v19.d[0]
|
|
MOV v0.d[1], v21.d[0]
|
|
|
|
// VUZP.16 D2, D3
|
|
|
|
UZP1 v19.8h, v2.8h, v2.8h
|
|
UZP2 v21.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v19.d[0]
|
|
MOV v2.d[1], v21.d[0]
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
|
|
rev64 v0.8h, v0.8h
|
|
MOV v1.d[0], v0.d[1]
|
|
ushR v30.4s, v30.4s, #16
|
|
|
|
rev64 v2.8h, v2.8h
|
|
MOV v3.d[0], v2.d[1]
|
|
sMLAL v8.4s, v27.4h, v11.4h
|
|
|
|
sMLAL v30.4s, v15.4h, v10.4h
|
|
|
|
LD2 { v10.4s, v11.4s}, [x6]
|
|
add X6, x6, x12
|
|
MOV v12.16b, v11.16b
|
|
sQshL v4.4s, v4.4s, #2
|
|
|
|
//VUZP.16 D10, D11
|
|
|
|
UZP1 v19.8h, v10.8h, v10.8h
|
|
UZP2 v21.8h, v10.8h, v10.8h
|
|
MOV v10.d[0], v19.d[0]
|
|
MOV v10.d[1], v21.d[0]
|
|
|
|
sQshL v6.4s, v6.4s, #2
|
|
|
|
// VUZP.16 D12, D13
|
|
|
|
UZP1 v19.8h, v12.8h, v12.8h
|
|
UZP2 v21.8h, v12.8h, v12.8h
|
|
MOV v12.d[0], v19.d[0]
|
|
MOV v12.d[1], v21.d[0]
|
|
|
|
|
|
SQADD v14.4s, v4.4s , v20.4s
|
|
|
|
rev64 v10.8h, v10.8h
|
|
MOV v11.d[0], v10.d[1]
|
|
SQADD v6.4s, v6.4s , v20.4s
|
|
|
|
rev64 v12.8h, v12.8h
|
|
MOV v13.d[0], v12.d[1]
|
|
sshR v14.4s, v14.4s, #16
|
|
|
|
// VUZP.16 D14, D15
|
|
|
|
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
|
|
sshR v6.4s, v6.4s, #16
|
|
|
|
// VUZP.16 D6, D7
|
|
|
|
UZP1 v19.8h, v6.8h, v6.8h
|
|
UZP2 v21.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v19.d[0]
|
|
MOV v7.d[0], v21.d[0]
|
|
|
|
|
|
mov v15.8b, v6.8b
|
|
sQshL v8.4s, v8.4s, v16.4s
|
|
|
|
LD2 { v4.4s, v5.4s}, [x8]
|
|
ADD x8, x8, #32
|
|
MOV v6.16b, v5.16b
|
|
|
|
sQshL v30.4s, v30.4s, v16.4s
|
|
|
|
// VUZP.16 D4, D5
|
|
|
|
UZP1 v19.8h, v4.8h, v4.8h
|
|
UZP2 v21.8h, v4.8h, v4.8h
|
|
MOV v4.d[0], v19.d[0]
|
|
MOV v5.d[0], v21.d[0]
|
|
|
|
|
|
SQSUB v8.4s, v8.4s , v24.4s
|
|
|
|
// VUZP.16 D6, D7
|
|
|
|
UZP1 v19.8h, v6.8h, v6.8h
|
|
UZP2 v21.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v19.d[0]
|
|
MOV v7.d[0], v21.d[0]
|
|
|
|
|
|
SQSUB v22.4s, v30.4s , v22.4s
|
|
|
|
sQshL v30.4s, v8.4s, #2
|
|
|
|
LD2 {v8.4h, v9.4h}, [x2]
|
|
ADD x2, x2, #16
|
|
sQshL v22.4s, v22.4s, #2
|
|
|
|
SQADD v30.4s, v30.4s , v20.4s
|
|
SQADD v22.4s, v22.4s , v20.4s
|
|
|
|
sshR v30.4s, v30.4s, #16
|
|
|
|
// VUZP.16 D30, D31
|
|
|
|
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
|
|
sshR v22.4s, v22.4s, #16
|
|
|
|
|
|
// VUZP.16 D22, D23
|
|
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
|
|
mov v23.8b, v30.8b
|
|
|
|
SUBS x3, x3, #1
|
|
BNE CORE_LOOP
|
|
|
|
|
|
|
|
|
|
|
|
EPILOGUE:
|
|
|
|
ST1 {v14.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v30.4s, v0.4h, v9.4h
|
|
|
|
ST1 {v22.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v28.4s, v2.4h, v8.4h
|
|
|
|
ST1 {v14.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v26.4s, v0.4h, v8.4h
|
|
|
|
ST1 {v22.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
uMULL v24.4s, v2.4h, v9.4h
|
|
|
|
ST1 {v14.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ushR v30.4s, v30.4s, #16
|
|
|
|
ST1 {v22.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ushR v28.4s, v28.4s, #16
|
|
|
|
ST1 {v14.h}[3], [x0]
|
|
ADD x0, x0, x9
|
|
sMLAL v30.4s, v1.4h, v9.4h
|
|
|
|
ST1 {v22.h}[3], [x0]
|
|
ADD x0, x0, x9
|
|
sMLAL v28.4s, v3.4h, v8.4h
|
|
|
|
ST1 {v15.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ushR v26.4s, v26.4s, #16
|
|
|
|
ST1 {v23.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ushR v24.4s, v24.4s, #16
|
|
|
|
ST1 {v15.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
sMLAL v26.4s, v1.4h, v8.4h
|
|
|
|
ST1 {v23.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
sMLAL v24.4s, v3.4h, v9.4h
|
|
|
|
ST1 {v15.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
ADD v30.4s, v30.4s , v28.4s
|
|
|
|
ST1 {v23.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
NEG v30.4s, v30.4s
|
|
|
|
ST1 {v15.h}[3], [x5]
|
|
ADD x5, x5, x10
|
|
|
|
|
|
ST1 {v23.h}[3], [x5]
|
|
ADD x5, x5, x10
|
|
SUB v28.4s, v24.4s , v26.4s
|
|
|
|
|
|
uMULL v22.4s, v4.4h, v8.4h
|
|
mov v26.16b, v30.16b
|
|
mov v24.16b, v28.16b
|
|
|
|
mov v26.16b, v30.16b
|
|
mov v24.16b, v28.16b
|
|
|
|
//VUZP.16 D26, D27
|
|
|
|
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
// VUZP.16 D24, D25
|
|
|
|
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
uMULL v2.4s, v24.4h, v18.4h
|
|
uMULL v0.4s, v26.4h, v18.4h
|
|
|
|
ushR v22.4s, v22.4s, #16
|
|
sMLAL v22.4s, v5.4h, v8.4h
|
|
|
|
ushR v2.4s, v2.4s, #16
|
|
ushR v0.4s, v0.4s, #16
|
|
sMLAL v2.4s, v25.4h, v18.4h
|
|
sMLAL v0.4s, v27.4h, v18.4h
|
|
|
|
uMULL v24.4s, v4.4h, v9.4h
|
|
uMULL v26.4s, v6.4h, v8.4h
|
|
|
|
NEG v2.4s, v2.4s
|
|
ADD v28.4s, v28.4s , v0.4s
|
|
ADD v30.4s, v30.4s , v2.4s
|
|
|
|
uMULL v0.4s, v6.4h, v9.4h
|
|
sshR v24.4s, v24.4s, #16
|
|
sMLAL v24.4s, v5.4h, v9.4h
|
|
sshR v26.4s, v26.4s, #16
|
|
sshR v0.4s, v0.4s, #16
|
|
sMLAL v26.4s, v7.4h, v8.4h
|
|
sMLAL v0.4s, v7.4h, v9.4h
|
|
|
|
|
|
|
|
|
|
|
|
ADD v22.4s, v22.4s , v0.4s
|
|
NEG v22.4s, v22.4s
|
|
SUB v24.4s, v26.4s , v24.4s
|
|
|
|
|
|
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
MOV w11, w26
|
|
dup v14.4s, w11
|
|
SQADD v28.4s, v28.4s , v14.4s
|
|
//LDR w11, [sp, #116]
|
|
//sxtw x11,w11
|
|
MOV w11, w25
|
|
dup v0.4s, w11
|
|
sQshL v28.4s, v28.4s, v0.4s
|
|
|
|
|
|
mov v0.16b, v22.16b
|
|
mov v14.16b, v24.16b
|
|
|
|
|
|
// VUZP.16 D22, D23
|
|
|
|
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
// VUZP.16 D24, D25
|
|
|
|
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
uMULL v8.4s, v24.4h, v18.4h
|
|
uMULL v26.4s, v22.4h, v18.4h
|
|
|
|
NEG v2.4s, v30.4s
|
|
|
|
// VUZP.16 D30, D31
|
|
|
|
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
// VUZP.16 D2, D3
|
|
|
|
UZP1 v19.8h, v2.8h, v2.8h
|
|
UZP2 v21.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v19.d[0]
|
|
MOV v3.d[0], v21.d[0]
|
|
|
|
uMULL v4.4s, v30.4h, v12.4h
|
|
uMULL v6.4s, v2.4h, v13.4h
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
ushR v26.4s, v26.4s, #16
|
|
|
|
sMLAL v8.4s, v25.4h, v18.4h
|
|
sMLAL v26.4s, v23.4h, v18.4h
|
|
|
|
ushR v4.4s, v4.4s, #16
|
|
ushR v6.4s, v6.4s, #16
|
|
|
|
MOV v19.d[0], v30.d[1]
|
|
|
|
sMLAL v4.4s, v19.4h, v12.4h
|
|
sMLAL v6.4s, v3.4h, v13.4h
|
|
|
|
NEG v8.4s, v8.4s
|
|
ADD v14.4s, v14.4s , v26.4s
|
|
ADD v0.4s, v0.4s , v8.4s
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
MOV w11, w26
|
|
dup v8.4s, w11
|
|
SQADD v0.4s, v0.4s , v8.4s
|
|
//LDR w11, [sp, #116]
|
|
//sxtw x11,w11
|
|
MOV w11, w25
|
|
dup v26.4s, w11
|
|
sQshL v0.4s, v0.4s, v26.4s
|
|
|
|
|
|
mov v26.16b, v28.16b
|
|
|
|
LD2 { v28.4s, v29.4s}, [x4]
|
|
MOV v30.16b, v29.16b
|
|
MOV v29.d[0], v28.d[1]
|
|
// VZIP.32 Q13, Q0
|
|
|
|
ZIP1 v19.4s, v26.4s, v0.4s
|
|
ZIP2 v0.4s, v26.4s, v0.4s
|
|
MOV v26.16b, v19.16b
|
|
|
|
ST1 { v26.4s}, [x4], #16
|
|
ST1 { v0.4s}, [x4], #16
|
|
|
|
movi v1.2s, #0
|
|
// VADDL.S16 Q0, D13, D1
|
|
SADDL v0.4s, v13.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v26.2d, v28.2s, v0.2s
|
|
Sqxtn v8.2s, v26.2d
|
|
sMULL v26.2d, v29.2s, v1.2s
|
|
Sqxtn v9.2s, v26.2d
|
|
MOV v8.d[1], v9.d[0]
|
|
movi v1.2s, #0
|
|
// VADDL.S16 Q0, D12, D1
|
|
SADDL v0.4s, v12.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v24.2d, v28.2s, v0.2s
|
|
Sqxtn v26.2s, v24.2d
|
|
sMULL v24.2d, v29.2s, v1.2s
|
|
Sqxtn v27.2s, v24.2d
|
|
MOV v26.d[1], v27.d[0]
|
|
|
|
sQshL v4.4s, v4.4s, v16.4s
|
|
sQshL v6.4s, v6.4s, v16.4s
|
|
|
|
SQSUB v4.4s, v4.4s , v8.4s
|
|
SQSUB v6.4s, v6.4s , v26.4s
|
|
|
|
NEG v26.4s, v14.4s
|
|
// VUZP.16 D14, D15
|
|
|
|
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
|
|
// VUZP.16 D26, D27
|
|
|
|
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
|
|
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D10, D1
|
|
SADDL v0.4s, v10.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v22.2d, v30.2s, v0.2s
|
|
Sqxtn v24.2s, v22.2d
|
|
sMULL2 v22.2d, v30.4s, v0.4s
|
|
Sqxtn v25.2s, v22.2d
|
|
MOV v24.d[1], v25.d[0]
|
|
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D11, D1
|
|
SADDL v0.4s, v11.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v8.2d, v30.2s, v0.2s
|
|
Sqxtn v22.2s, v8.2d
|
|
sMULL2 v8.2d, v30.4s, v0.4s
|
|
Sqxtn v23.2s, v8.2d
|
|
MOV v22.d[1], v23.d[0]
|
|
|
|
uMULL v8.4s, v26.4h, v11.4h
|
|
uMULL v30.4s, v14.4h, v10.4h
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
|
|
ushR v30.4s, v30.4s, #16
|
|
|
|
sMLAL v8.4s, v27.4h, v11.4h
|
|
|
|
sMLAL v30.4s, v15.4h, v10.4h
|
|
|
|
sQshL v4.4s, v4.4s, #2
|
|
|
|
sQshL v6.4s, v6.4s, #2
|
|
|
|
SQADD v14.4s, v4.4s , v20.4s
|
|
|
|
SQADD v6.4s, v6.4s , v20.4s
|
|
|
|
sshR v14.4s, v14.4s, #16
|
|
|
|
// VUZP.16 D14, D15
|
|
|
|
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
sshR v6.4s, v6.4s, #16
|
|
|
|
// VUZP.16 D6, D7
|
|
|
|
UZP1 v19.8h, v6.8h, v6.8h
|
|
UZP2 v21.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v19.d[0]
|
|
MOV v7.d[0], v21.d[0]
|
|
|
|
mov v15.8b, v6.8b
|
|
sQshL v8.4s, v8.4s, v16.4s
|
|
|
|
sQshL v30.4s, v30.4s, v16.4s
|
|
|
|
SQSUB v8.4s, v8.4s , v24.4s
|
|
|
|
SQSUB v22.4s, v30.4s , v22.4s
|
|
|
|
sQshL v30.4s, v8.4s, #2
|
|
|
|
sQshL v22.4s, v22.4s, #2
|
|
|
|
SQADD v30.4s, v30.4s , v20.4s
|
|
SQADD v22.4s, v22.4s , v20.4s
|
|
|
|
sshR v30.4s, v30.4s, #16
|
|
|
|
//VUZP.16 D30, D31
|
|
|
|
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
sshR v22.4s, v22.4s, #16
|
|
|
|
// VUZP.16 D22, D23
|
|
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
mov v23.8b, v30.8b
|
|
|
|
|
|
|
|
|
|
ST1 {v14.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v22.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v14.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v22.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v14.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v22.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v14.h}[3], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v22.h}[3], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v15.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v23.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v15.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v23.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v15.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v23.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v15.h}[3], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v23.h}[3], [x5]
|
|
ADD x5, x5, x10
|
|
|
|
// ---------------------------------------------------------------------
// Final (epilogue) pass of ixheaacd_post_twid_overlap_add_armv8.
// Straight-line code: it loads one last group of inputs through
// x1, x2, x8, x6 and x4, runs the multiply/shift/saturate sequence below,
// stores 7 words through x6 (starting at the incoming x4), 7 halfwords
// through x0 (stride x9) and 7 halfwords through x5 (stride x10), then
// restores the saved registers and returns.
// Live-in registers set up before this point (not visible here):
// x0, x1, x2, x4, x5, x6, x8, x9, x10, w25, w26, v16, v18, v20.
// Recurring idiom: "UZP1/UZP2 Vn.8h with itself" separates the low 16 bits
// (UZP1) and the high 16 bits (UZP2) of each 32-bit lane; it replaces the
// ARM32 VUZP.16 named in the neighbouring comments.
// Recurring idiom: uMULL(lo16, k) >> 16, then sMLAL(hi16, k) accumulates
// the signed high-half product on top of the shifted low-half product.
// ---------------------------------------------------------------------
ARM_EPILOGUE:
|
|
|
|
// Both labels resolve to the same address; the branches that target them
// are outside this excerpt.
ARM_LOOP:
|
|
|
|
// Load 8 words from [x1] de-interleaved: even words -> v0, odd words -> v1.
// x1 is not advanced.
LD2 { v0.4s, v1.4s}, [x1]
|
|
// v2 = copy of the odd words.
MOV v2.16b, v1.16b
|
|
|
|
//VUZP.16 D0, D1
|
|
// v0.d[0] = low halfwords of the even words, v0.d[1] = their high halfwords.
UZP1 v19.8h, v0.8h, v0.8h
|
|
UZP2 v21.8h, v0.8h, v0.8h
|
|
MOV v0.d[0], v19.d[0]
|
|
MOV v0.d[1], v21.d[0]
|
|
|
|
//VUZP.16 D2, D3
|
|
// Same split for the odd words held in v2.
UZP1 v19.8h, v2.8h, v2.8h
|
|
UZP2 v21.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v19.d[0]
|
|
MOV v2.d[1], v21.d[0]
|
|
|
|
|
|
// Reverse the halfword order inside each 64-bit half, then expose the
// high halfwords as separate D registers: v0/v1 = lo/hi of the even words,
// v2/v3 = lo/hi of the odd words.
rev64 v0.8h, v0.8h
|
|
MOV v1.d[0], v0.d[1]
|
|
rev64 v2.8h, v2.8h
|
|
MOV v3.d[0], v2.d[1]
|
|
|
|
// Load 8 halfwords from [x2] de-interleaved (even -> v8, odd -> v9) and
// advance x2 by 16 bytes.
LD2 {v8.4h, v9.4h}, [x2]
|
|
ADD x2, x2, #16
|
|
|
|
// Load 4 words from [x8] de-interleaved (words 0,2 -> v4; words 1,3 -> v5),
// then two more single words, so six words are read in total.
LD2 {v4.2s, v5.2s}, [x8]
|
|
ADD x8, x8, #16
|
|
MOV v6.16b, v5.16b
|
|
// Clear v5/v7 so the lanes not written by the single-lane loads are zero.
movi v5.2s, #0x00000000
|
|
movi v7.2s, #0x00000000
|
|
|
|
LD1 {v5.s}[0], [x8], #4
|
|
LD1 {v7.s}[0], [x8]
|
|
|
|
|
|
// NOTE(review): x12 is overwritten with #-4 further down before it is read
// anywhere in this block, so this value appears to be unused.
MOV x12, #16
|
|
// v4 = {w0, w2, w4, 0}, v6 = {w1, w3, w5, 0} (wN = N-th word read via x8).
MOV v4.d[1], v5.d[0]
|
|
MOV v6.d[1], v7.d[0]
|
|
// VUZP.16 D4, D5
|
|
|
|
// v4.4h = low halfwords, v5.4h = high halfwords of the v4 words.
UZP1 v19.8h, v4.8h, v4.8h
|
|
UZP2 v21.8h, v4.8h, v4.8h
|
|
MOV v4.d[0], v19.d[0]
|
|
MOV v5.d[0], v21.d[0]
|
|
|
|
// VUZP.16 D6, D7
|
|
|
|
// v6.4h = low halfwords, v7.4h = high halfwords of the v6 words.
UZP1 v19.8h, v6.8h, v6.8h
|
|
UZP2 v21.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v19.d[0]
|
|
MOV v7.d[0], v21.d[0]
|
|
|
|
// Move x6 forward 16 bytes, then read through it while stepping backwards
// by 4 bytes (x12 = -4) after each access.
ADD x6, x6, #16
|
|
|
|
MOV x12, #-4
|
|
// 4 words de-interleaved: words 0,2 -> v11, words 1,3 -> v12.
LD2 {v11.2s, v12.2s}, [x6]
|
|
ADD x6, x6, x12
|
|
MOV v13.16b, v12.16b
|
|
|
|
|
|
// v10 lane 0 stays zero; only lane 1 is loaded below.
movi v10.2s, #0x00000000
|
|
|
|
// Three single-word loads at successively lower addresses.
LD1 {v12.s}[1], [x6]
|
|
ADD x6, x6, x12
|
|
LD1 {v10.s}[1], [x6]
|
|
ADD x6, x6, x12
|
|
LD1 {v12.s}[0], [x6]
|
|
ADD x6, x6, x12
|
|
|
|
// Append the LD2 results as the upper halves of v10 and v12.
MOV v10.d[1], v11.d[0]
|
|
MOV v12.d[1], v13.d[0]
|
|
|
|
//VUZP.16 D10, D11
|
|
|
|
// v10.d[0] = low halfwords, v10.d[1] = high halfwords.
UZP1 v19.8h, v10.8h, v10.8h
|
|
UZP2 v21.8h, v10.8h, v10.8h
|
|
MOV v10.d[0], v19.d[0]
|
|
MOV v10.d[1], v21.d[0]
|
|
|
|
//VUZP.16 D12, D13
|
|
|
|
// v12.d[0] = low halfwords, v12.d[1] = high halfwords.
UZP1 v19.8h, v12.8h, v12.8h
|
|
UZP2 v21.8h, v12.8h, v12.8h
|
|
MOV v12.d[0], v19.d[0]
|
|
MOV v12.d[1], v21.d[0]
|
|
|
|
|
|
// Reverse halfword order in each 64-bit half; v11/v13 receive the
// (reversed) high halfwords of v10/v12.
rev64 v10.8h, v10.8h
|
|
MOV v11.d[0], v10.d[1]
|
|
rev64 v12.8h, v12.8h
|
|
MOV v13.d[0], v12.d[1]
|
|
|
|
// Four split-multiplies of the x1 data (v0/v1, v2/v3) by the x2 halfwords
// (v8, v9): unsigned low-half products first ...
uMULL v30.4s, v0.4h, v9.4h
|
|
uMULL v28.4s, v2.4h, v8.4h
|
|
uMULL v26.4s, v0.4h, v8.4h
|
|
uMULL v24.4s, v2.4h, v9.4h
|
|
|
|
// ... shifted right by 16, then the signed high-half products accumulated.
ushR v30.4s, v30.4s, #16
|
|
ushR v28.4s, v28.4s, #16
|
|
|
|
sMLAL v30.4s, v1.4h, v9.4h
|
|
sMLAL v28.4s, v3.4h, v8.4h
|
|
|
|
ushR v26.4s, v26.4s, #16
|
|
ushR v24.4s, v24.4s, #16
|
|
|
|
sMLAL v26.4s, v1.4h, v8.4h
|
|
sMLAL v24.4s, v3.4h, v9.4h
|
|
|
|
// v30 = -(v30 + v28)
ADD v30.4s, v30.4s , v28.4s
|
|
NEG v30.4s, v30.4s
|
|
|
|
// Start the split-multiply of the x8 data (v4/v5) by v8; finished below.
uMULL v22.4s, v4.4h, v8.4h
|
|
|
|
// v28 = v24 - v26
SUB v28.4s, v24.4s , v26.4s
|
|
|
|
|
|
// Work on copies so v30/v28 stay intact for the additions below.
mov v26.16b, v30.16b
|
|
mov v24.16b, v28.16b
|
|
|
|
// VUZP.16 D26, D27
|
|
|
|
// v26.4h / v27.4h = low / high halfwords of the v30 copy.
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
//VUZP.16 D24, D25
|
|
|
|
// v24.4h / v25.4h = low / high halfwords of the v28 copy.
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
// Split-multiply both copies by v18 (set to 50 per halfword lane at function
// entry; assumed unchanged since -- TODO confirm).
uMULL v2.4s, v24.4h, v18.4h
|
|
uMULL v0.4s, v26.4h, v18.4h
|
|
|
|
// Finish v22 = split-multiply(v4/v5, v8).
ushR v22.4s, v22.4s, #16
|
|
sMLAL v22.4s, v5.4h, v8.4h
|
|
|
|
ushR v2.4s, v2.4s, #16
|
|
ushR v0.4s, v0.4s, #16
|
|
sMLAL v2.4s, v25.4h, v18.4h
|
|
sMLAL v0.4s, v27.4h, v18.4h
|
|
|
|
// Remaining low-half products for the x8 data.
uMULL v24.4s, v4.4h, v9.4h
|
|
uMULL v26.4s, v6.4h, v8.4h
|
|
|
|
// v28 += scaled(v30 copy); v30 -= scaled(v28 copy).
NEG v2.4s, v2.4s
|
|
ADD v28.4s, v28.4s , v0.4s
|
|
ADD v30.4s, v30.4s , v2.4s
|
|
|
|
uMULL v0.4s, v6.4h, v9.4h
|
|
// NOTE(review): these three shifts are arithmetic (sshR) although they
// follow uMULL; every other split-multiply in this block uses ushR.
// Confirm this difference is intentional.
sshR v24.4s, v24.4s, #16
|
|
sMLAL v24.4s, v5.4h, v9.4h
|
|
sshR v26.4s, v26.4s, #16
|
|
sshR v0.4s, v0.4s, #16
|
|
sMLAL v26.4s, v7.4h, v8.4h
|
|
sMLAL v0.4s, v7.4h, v9.4h
|
|
|
|
// v22 = -(v22 + v0); v24 = v26 - v24
ADD v22.4s, v22.4s , v0.4s
|
|
NEG v22.4s, v22.4s
|
|
SUB v24.4s, v26.4s , v24.4s
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
// w26 is used in place of the commented-out stack load above (it is set
// outside this excerpt). Broadcast it and add with saturation to v28.
MOV w11, w26
|
|
dup v14.4s, w11
|
|
SQADD v28.4s, v28.4s , v14.4s
|
|
//LDR w11, [sp, #116]
|
|
//sxtw x11,w11
|
|
// w25 is used in place of the commented-out stack load above. Saturating
// shift of v28 by that per-lane amount (a negative amount shifts right).
MOV w11, w25
|
|
dup v0.4s, w11
|
|
sQshL v28.4s, v28.4s, v0.4s
|
|
|
|
// Keep full-width copies of v22/v24 in v0/v14 before splitting them.
mov v0.16b, v22.16b
|
|
mov v14.16b, v24.16b
|
|
|
|
// VUZP.16 D22, D23
|
|
|
|
// v22.4h / v23.4h = low / high halfwords.
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
// VUZP.16 D24, D25
|
|
|
|
// v24.4h / v25.4h = low / high halfwords.
UZP1 v19.8h, v24.8h, v24.8h
|
|
UZP2 v21.8h, v24.8h, v24.8h
|
|
MOV v24.d[0], v19.d[0]
|
|
MOV v25.d[0], v21.d[0]
|
|
|
|
// Split-multiply v24 and v22 by v18 (low-half products first).
uMULL v8.4s, v24.4h, v18.4h
|
|
uMULL v26.4s, v22.4h, v18.4h
|
|
|
|
// v2 = -v30, taken before v30 is split in place.
NEG v2.4s, v30.4s
|
|
// VUZP.16 D30, D31
|
|
|
|
// v30.d[0] = low halfwords, v30.d[1] = high halfwords.
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
// VUZP.16 D2, D3
|
|
|
|
// v2.4h / v3.4h = low / high halfwords of -v30.
UZP1 v19.8h, v2.8h, v2.8h
|
|
UZP2 v21.8h, v2.8h, v2.8h
|
|
MOV v2.d[0], v19.d[0]
|
|
MOV v3.d[0], v21.d[0]
|
|
|
|
// Split-multiply v30 by v12 and -v30 by v13 (halfwords gathered via x6).
uMULL v4.4s, v30.4h, v12.4h
|
|
uMULL v6.4s, v2.4h, v13.4h
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
ushR v26.4s, v26.4s, #16
|
|
|
|
sMLAL v8.4s, v25.4h, v18.4h
|
|
sMLAL v26.4s, v23.4h, v18.4h
|
|
|
|
ushR v4.4s, v4.4s, #16
|
|
ushR v6.4s, v6.4s, #16
|
|
|
|
// The high halfwords of v30 live in v30.d[1]; copy them into a D register
// so sMLAL can take them as a .4h operand.
MOV v19.d[0], v30.d[1]
|
|
|
|
sMLAL v4.4s, v19.4h, v12.4h
|
|
sMLAL v6.4s, v3.4h, v13.4h
|
|
|
|
// v14 += scaled(v22); v0 -= scaled(v24).
NEG v8.4s, v8.4s
|
|
ADD v14.4s, v14.4s , v26.4s
|
|
ADD v0.4s, v0.4s , v8.4s
|
|
|
|
//LDR w11, [sp, #120]
|
|
//sxtw x11,w11
|
|
// Same saturating add of w26 and saturating shift by w25 as applied to
// v28 above, now applied to v0.
MOV w11, w26
|
|
dup v8.4s, w11
|
|
SQADD v0.4s, v0.4s , v8.4s
|
|
//LDR w11, [sp, #116]
|
|
//sxtw x11,w11
|
|
MOV w11, w25
|
|
dup v26.4s, w11
|
|
sQshL v0.4s, v0.4s, v26.4s
|
|
|
|
// v26 = shifted v28; it is stored together with v0 below.
mov v26.16b, v28.16b
|
|
|
|
// Remember the start of the x4 buffer: the 7 words read next are
// overwritten in place through x6 afterwards.
MOV x6, x4
|
|
|
|
// Read 7 consecutive words from [x4] (4 + 1 + 1 + 1), post-incrementing x4.
LD1 {v28.2s, v29.2s}, [x4], #16
|
|
movi v19.2s, #0x00000000
|
|
LD1 {v30.s}[0], [x4], #4
|
|
LD1 {v30.s}[1], [x4], #4
|
|
LD1 {v19.s}[0], [x4], #4
|
|
|
|
// v28 = words 0..3, v30 = {word 4, word 5, word 6, 0}.
MOV v28.d[1], v29.d[0]
|
|
MOV v30.d[1], v19.d[0]
|
|
|
|
//VUZP.32 Q14, Q15
|
|
|
|
// De-interleave by word: v28 = even words (0,2,4,6), v30 = odd words
// (1,3,5) with a zero last lane; v29 = upper half of v28.
UZP1 v19.4s, v28.4s, v30.4s
|
|
UZP2 v30.4s, v28.4s, v30.4s
|
|
MOV v28.16b, v19.16b
|
|
MOV v29.d[0], v28.d[1]
|
|
|
|
// Store 7 words interleaved v26[0], v0[0], v26[1], v0[1], v26[2], v0[2],
// v26[3] over the words just read.
ST1 {v26.s}[0], [x6], #4
|
|
ST1 {v0.s}[0], [x6], #4
|
|
ST1 {v26.s}[1], [x6], #4
|
|
ST1 {v0.s}[1], [x6], #4
|
|
ST1 {v26.s}[2], [x6], #4
|
|
ST1 {v0.s}[2], [x6], #4
|
|
ST1 {v26.s}[3], [x6], #4
|
|
|
|
// Sign-extend v13.4h to 32 bits (SADDL with zero), then
// v8 = saturating-narrowed 64-bit product (even x4 words) * v13.
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D13, D1
|
|
SADDL v0.4s, v13.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v26.2d, v28.2s, v0.2s
|
|
Sqxtn v8.2s, v26.2d
|
|
sMULL v26.2d, v29.2s, v1.2s
|
|
Sqxtn v9.2s, v26.2d
|
|
MOV v8.d[1], v9.d[0]
|
|
// Same with v12: v26 = sat32((even x4 words) * sign-extended v12).
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D12, D1
|
|
SADDL v0.4s, v12.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v24.2d, v28.2s, v0.2s
|
|
Sqxtn v26.2s, v24.2d
|
|
sMULL v24.2d, v29.2s, v1.2s
|
|
Sqxtn v27.2s, v24.2d
|
|
MOV v26.d[1], v27.d[0]
|
|
|
|
// Saturating shift of v4/v6 by the per-lane amount in v16 (loaded from w5
// at function entry; assumed unchanged since -- TODO confirm).
sQshL v4.4s, v4.4s, v16.4s
|
|
sQshL v6.4s, v6.4s, v16.4s
|
|
|
|
// Saturating subtract of the x4-based products.
SQSUB v4.4s, v4.4s , v8.4s
|
|
SQSUB v6.4s, v6.4s , v26.4s
|
|
|
|
// v26 = -v14, taken before v14 is split in place.
NEG v26.4s, v14.4s
|
|
//VUZP.16 D14, D15
|
|
|
|
// v14.4h / v15.4h = low / high halfwords.
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
// VUZP.16 D26, D27
|
|
|
|
// v26.4h / v27.4h = low / high halfwords of -v14.
UZP1 v19.8h, v26.8h, v26.8h
|
|
UZP2 v21.8h, v26.8h, v26.8h
|
|
MOV v26.d[0], v19.d[0]
|
|
MOV v27.d[0], v21.d[0]
|
|
|
|
|
|
// v24 = sat32((odd x4 words) * sign-extended v10); sMULL2 handles the
// upper two lanes directly.
movi v1.2s, #0
|
|
//VADDL.S16 Q0, D10, D1
|
|
SADDL v0.4s, v10.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v22.2d, v30.2s, v0.2s
|
|
Sqxtn v24.2s, v22.2d
|
|
sMULL2 v22.2d, v30.4s, v0.4s
|
|
Sqxtn v25.2s, v22.2d
|
|
MOV v24.d[1], v25.d[0]
|
|
|
|
// v22 = sat32((odd x4 words) * sign-extended v11).
movi v1.2s, #0
|
|
// VADDL.S16 Q0, D11, D1
|
|
SADDL v0.4s, v11.4h, v1.4h
|
|
MOV v1.d[0], v0.d[1]
|
|
|
|
sMULL v8.2d, v30.2s, v0.2s
|
|
Sqxtn v22.2s, v8.2d
|
|
sMULL2 v8.2d, v30.4s, v0.4s
|
|
Sqxtn v23.2s, v8.2d
|
|
MOV v22.d[1], v23.d[0]
|
|
|
|
// Split-multiply -v14 by v11 (into v8) and v14 by v10 (into v30).
uMULL v8.4s, v26.4h, v11.4h
|
|
uMULL v30.4s, v14.4h, v10.4h
|
|
|
|
ushR v8.4s, v8.4s, #16
|
|
|
|
ushR v30.4s, v30.4s, #16
|
|
|
|
sMLAL v8.4s, v27.4h, v11.4h
|
|
|
|
sMLAL v30.4s, v15.4h, v10.4h
|
|
|
|
// First output pair: saturating <<2, saturating add of v20 (0x8000 per lane
// at function entry; assumed unchanged -- TODO confirm), arithmetic >>16.
sQshL v4.4s, v4.4s, #2
|
|
|
|
sQshL v6.4s, v6.4s, #2
|
|
|
|
SQADD v14.4s, v4.4s , v20.4s
|
|
|
|
SQADD v6.4s, v6.4s , v20.4s
|
|
|
|
sshR v14.4s, v14.4s, #16
|
|
|
|
// VUZP.16 D14, D15
|
|
|
|
// v14.4h = low halfwords of the four results. The v15 write here is
// replaced by "mov v15.8b, v6.8b" below.
UZP1 v19.8h, v14.8h, v14.8h
|
|
UZP2 v21.8h, v14.8h, v14.8h
|
|
MOV v14.d[0], v19.d[0]
|
|
MOV v15.d[0], v21.d[0]
|
|
|
|
sshR v6.4s, v6.4s, #16
|
|
|
|
//VUZP.16 D6, D7
|
|
|
|
// v6.4h = low halfwords of the four results; v7 is not read again in this
// block.
UZP1 v19.8h, v6.8h, v6.8h
|
|
UZP2 v21.8h, v6.8h, v6.8h
|
|
MOV v6.d[0], v19.d[0]
|
|
MOV v7.d[0], v21.d[0]
|
|
|
|
// v15 = 16-bit results derived from v6 (stored through x5 below).
mov v15.8b, v6.8b
|
|
// Second output pair: same shift / subtract / round / narrow sequence.
sQshL v8.4s, v8.4s, v16.4s
|
|
|
|
sQshL v30.4s, v30.4s, v16.4s
|
|
|
|
SQSUB v8.4s, v8.4s , v24.4s
|
|
|
|
SQSUB v22.4s, v30.4s , v22.4s
|
|
|
|
sQshL v30.4s, v8.4s, #2
|
|
|
|
sQshL v22.4s, v22.4s, #2
|
|
|
|
SQADD v30.4s, v30.4s , v20.4s
|
|
|
|
SQADD v22.4s, v22.4s , v20.4s
|
|
|
|
sshR v30.4s, v30.4s, #16
|
|
|
|
// VUZP.16 D30, D31
|
|
|
|
// v30.d[0] = low halfwords of the four results.
UZP1 v19.8h, v30.8h, v30.8h
|
|
UZP2 v21.8h, v30.8h, v30.8h
|
|
MOV v30.d[0], v19.d[0]
|
|
MOV v30.d[1], v21.d[0]
|
|
|
|
sshR v22.4s, v22.4s, #16
|
|
|
|
// VUZP.16 D22, D23
|
|
|
|
// v22.4h = low halfwords of the four results. The v23 write here is
// replaced by "mov v23.8b, v30.8b" just below.
UZP1 v19.8h, v22.8h, v22.8h
|
|
UZP2 v21.8h, v22.8h, v22.8h
|
|
MOV v22.d[0], v19.d[0]
|
|
MOV v23.d[0], v21.d[0]
|
|
|
|
// v23 = 16-bit results derived from v30 (stored through x5 below).
mov v23.8b, v30.8b
|
|
|
|
|
|
|
|
|
|
// Store 7 halfwords through x0, alternating v14 and v22 lanes
// (v14[0], v22[0], v14[1], v22[1], v14[2], v22[2], v14[3]); x0 advances by
// x9 bytes after every store.
ST1 {v14.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v22.h}[0], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v14.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v22.h}[1], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v14.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v22.h}[2], [x0]
|
|
ADD x0, x0, x9
|
|
ST1 {v14.h}[3], [x0]
|
|
ADD x0, x0, x9
|
|
|
|
// Store 7 halfwords through x5, alternating v15 and v23 lanes in the same
// pattern; x5 advances by x10 bytes after every store.
ST1 {v15.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v23.h}[0], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v15.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v23.h}[1], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v15.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v23.h}[2], [x5]
|
|
ADD x5, x5, x10
|
|
ST1 {v15.h}[3], [x5]
|
|
ADD x5, x5, x10
|
|
|
|
// VPOP {d8 - d15}
|
|
// LDMFD sp!, {x4-x12}
|
|
//ldp x19, x20,[sp],#16
|
|
// Restore x8-x30 and q8-q15 with the pop_v_regs macro defined at the top of
// the file (mirror of push_v_regs used at function entry), then return.
pop_v_regs
|
|
ret
|
|
//BX x14
|