///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
|
|
|
|
|
|
// -----------------------------------------------------------------------------
// push_v_regs
//   Saves registers on the stack with pre-decrement stores:
//     q8-q15            four 32-byte pairs  (128 bytes)
//     x8-x13, x22, x23,
//     x16, x17, x20, x21  six 16-byte pairs  ( 96 bytes)
//   sp moves down by 224 bytes in total and stays 16-byte aligned.
//   Must be matched by pop_v_regs, which reloads in exactly the reverse order.
//   The set is a superset of the AAPCS64 callee-saved registers that the
//   routine in this file touches (x21-x23 and v8-v11, v14).
// -----------------------------------------------------------------------------
.macro push_v_regs
        stp     q8, q9, [sp, #-32]!
        stp     q10, q11, [sp, #-32]!
        stp     q12, q13, [sp, #-32]!
        stp     q14, q15, [sp, #-32]!
        stp     X8, X9, [sp, #-16]!
        stp     X10, X11, [sp, #-16]!
        stp     X12, X13, [sp, #-16]!
        stp     X22, X23, [sp, #-16]!
        stp     X16, X17, [sp, #-16]!
        stp     X20, X21, [sp, #-16]!
.endm
|
|
// -----------------------------------------------------------------------------
// pop_v_regs
//   Restores the registers stored by push_v_regs using post-increment loads,
//   in the reverse of the push order: x20/x21 first, q8/q9 last.
//   sp moves up by 224 bytes (6 * 16 + 4 * 32), returning to its value from
//   before push_v_regs.
// -----------------------------------------------------------------------------
.macro pop_v_regs
        ldp     X20, X21, [sp], #16
        ldp     X16, X17, [sp], #16
        ldp     X22, X23, [sp], #16
        ldp     X12, X13, [sp], #16
        ldp     X10, X11, [sp], #16
        ldp     X8, X9, [sp], #16
        ldp     q14, q15, [sp], #32
        ldp     q12, q13, [sp], #32
        ldp     q10, q11, [sp], #32
        ldp     q8, q9, [sp], #32
.endm
|
|
|
|
// -----------------------------------------------------------------------------
// swp reg1, reg2
//   Exchanges the contents of two general-purpose registers.
//   Clobbers x16, which is used as the temporary. Both operands therefore have
//   to be 64-bit X registers and neither may be x16 itself.
//   NOTE(review): no invocation of this macro is visible in this file.
// -----------------------------------------------------------------------------
.macro swp reg1, reg2
        MOV     X16, \reg1              // tmp  = reg1
        MOV     \reg1, \reg2            // reg1 = reg2
        MOV     \reg2, x16              // reg2 = tmp
.endm
|
|
// -----------------------------------------------------------------------------
// ixheaacd_pretwiddle_compute_armv8
//
// Register usage on entry, as read by the code below (the C prototype is not
// part of this file, so the argument names are not known here):
//   x0  input pointer, read forwards  (32-bit words)
//   x1  input pointer, read backwards (32-bit words)
//   x2  output pointer, written forwards (32-bit words)
//   x3  base of a table structure; 16-bit coefficient pairs are read starting
//       at byte offset 7500 from this base
//   x4  element count; 0x100 selects a coefficient stride of 4 bytes, any other
//       value selects a stride of 32 bytes
//   w5  exponent; it is negated on entry and the negated value is the shift
//       applied to every result (positive = left shift, otherwise arithmetic
//       right shift)
//
// Internal registers:
//   x6  coefficient stride in bytes (4 or 32)
//   x7  second output pointer, written backwards, initialised from x2 and x4
//   x8  constant -32, the post-index step for x1 and x7 in the NEON code
//   v14 negated exponent replicated into four 32-bit lanes for SSHL
//
// Every product is a 32-bit sample times a 16-bit coefficient. In the NEON
// code it is assembled from the two halves of the sample:
//       (low16 * coef) >> 16   via UMULL + USHR
//     +  high16 * coef         via SMLAL
//
// The NEON code is software-pipelined: the results of one group of eight
// words per input are stored while the next group is being multiplied, so the
// instruction order must not be changed casually.
//
// NOTE(review): x4 is consumed as a full 64-bit value (LSL x7, x4 / CMP X4).
// This presumably relies on the caller supplying valid upper 32 bits; confirm
// against the C prototype.
// -----------------------------------------------------------------------------
        .text
        .global ixheaacd_pretwiddle_compute_armv8

ixheaacd_pretwiddle_compute_armv8:

        push_v_regs

        LSL     x7, x4, #4              // x7 = x4 * 16
        ADD     x7, x2, x7              // x7 = x2 + x4 * 16
        SUB     x7, x7, #4              // x7 = x2 + x4 * 16 - 4
        MOV     x22, #7500
        ADD     x3, x3, x22             // x3 -> coefficient table at base + 7500
        NEG     w5, w5                  // w5 = -exponent (was MVN + ADD #1)

// Scalar handling of the first sample pair.
ARM_PROLOGUE:
        LDRH    w21, [x3]               // first 16-bit coefficient
        LDRH    w22, [x3, #2]           // second 16-bit coefficient
        LSL     w22, w22, #16           // place each coefficient in bits 31:16
        LSL     w21, w21, #16

        LDR     w8, [x3], #4            // loaded value is not used (w8 is overwritten
                                        // below); this advances x3 by 4
        LDR     w9, [x0], #4            // a = *x0++

        SMULL   X12, w9, w21
        ASR     X12, x12, #32           // x12 = high 32 bits of a * c0
        LDR     w10, [x1], #-4          // b = *x1--
        SMULL   X11, w9, w22
        ASR     X11, x11, #32           // x11 = high 32 bits of a * c1
        SMULL   X23, w10, w22
        ASR     X23, x23, #32           // x23 = high 32 bits of b * c1
        ADD     w9, w12, w23
        SMULL   X6, w10, w21
        ASR     X6, x6, #32             // x6  = high 32 bits of b * c0

        NEG     w9, w9                  // w9  = -(a*c0 + b*c1)  (was MVN + ADD #1)
        SUB     w11, w11, w6            // w11 = a*c1 - b*c0
        CMP     w5, #0
        BGT     NEXT                    // positive shift count -> shift left
        NEG     w8, w5                  // w8 = -w5 (was MVN + ADD #1)
        ASR     w11, w11, w8            // zero or negative count -> arithmetic right shift
        ASR     w9, w9, w8
        B       NEXT1

NEXT:
        LSL     w11, w11, w5
        LSL     w9, w9, w5

NEXT1:
        STR     w9, [x2], #4            // store the pair at the front of the output
        STR     w11, [x2], #4

        CMP     X4, #0x100
        BNE     NXT
        MOV     X6, #4                  // count == 0x100: consecutive coefficient pairs
        B       NXT1
NXT:
        MOV     X6, #32                 // otherwise: one coefficient pair every 32 bytes
        ADD     X3, X3, #28             // 4 + 28 = 32 bytes past the first pair

NXT1:
        SUB     X4, X4, #1
        ASR     X4, X4, #2              // x4 = (x4 - 1) >> 2
        SUB     x7, x7, #28             // x7 = x2(entry) + 16 * x4(entry) - 32

// Load and compute the first NEON group; nothing is stored yet.
NEON_PROLOGUE:

        MOV     x8, #-32                // post-index step for the backward pointers

        dup     v14.4s, w5              // per-lane shift count for SSHL

        SUB     X1, X1, #28             // x1 -> start of the 32 bytes to be read next

        // Gather four coefficient pairs: v8 = first halves, v9 = second halves.
        LD2     {v8.h, v9.h}[0], [x3], x6
        LD2     {v8.h, v9.h}[1], [x3], x6
        LD2     {v8.h, v9.h}[2], [x3], x6
        LD2     {v8.h, v9.h}[3], [x3], x6

        rev64   v10.4h, v8.4h           // same coefficients in reversed lane order
        rev64   v11.4h, v9.4h

        // LD4 on halfwords splits eight 32-bit words (little-endian layout):
        //   first/second register = low/high halves of words 0, 2, 4, 6
        //   third/fourth register = low/high halves of words 1, 3, 5, 7
        LD4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32

        LD4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

        rev64   v0.4h, v0.4h            // reverse the lane order of the even-word halves
        rev64   v1.4h, v1.4h
        rev64   v4.4h, v4.4h
        rev64   v5.4h, v5.4h

        // Products with v8/v9: low-half part ...
        uMULL   v30.4s, v2.4h, v9.4h
        uMULL   v28.4s, v4.4h, v9.4h
        uMULL   v26.4s, v2.4h, v8.4h
        uMULL   v24.4s, v4.4h, v8.4h

        ushR    v30.4s, v30.4s, #16
        ushR    v28.4s, v28.4s, #16
        ushR    v26.4s, v26.4s, #16
        ushR    v24.4s, v24.4s, #16

        // ... plus high-half part.
        sMLAL   v30.4s, v3.4h, v9.4h
        sMLAL   v28.4s, v5.4h, v9.4h
        sMLAL   v26.4s, v3.4h, v8.4h
        sMLAL   v24.4s, v5.4h, v8.4h

        ADD     v28.4s, v26.4s , v28.4s
        NEG     v28.4s, v28.4s          // v28 = -(v26 + v28)
        SUB     v30.4s, v30.4s , v24.4s // v30 = v30 - v24

        // Products with the reversed coefficients v10/v11.
        uMULL   v22.4s, v0.4h, v11.4h
        uMULL   v20.4s, v6.4h, v11.4h
        uMULL   v18.4s, v0.4h, v10.4h
        uMULL   v16.4s, v6.4h, v10.4h

        ushR    v22.4s, v22.4s, #16
        ushR    v20.4s, v20.4s, #16
        ushR    v18.4s, v18.4s, #16
        ushR    v16.4s, v16.4s, #16

        // The coefficient loads for the next group are interleaved from here on.
        sMLAL   v22.4s, v1.4h, v11.4h
        LD2     {v8.h, v9.h}[0], [x3], x6

        sMLAL   v20.4s, v7.4h, v11.4h
        LD2     {v8.h, v9.h}[1], [x3], x6

        sMLAL   v18.4s, v1.4h, v10.4h
        LD2     {v8.h, v9.h}[2], [x3], x6

        sMLAL   v16.4s, v7.4h, v10.4h
        LD2     {v8.h, v9.h}[3], [x3], x6

        ADD     v20.4s, v20.4s , v18.4s

        NEG     v20.4s, v20.4s          // v20 = -(v20 + v18)
        rev64   v10.4h, v8.4h
        rev64   v11.4h, v9.4h
        SUB     v22.4s, v16.4s , v22.4s // v22 = v16 - v22
        LD4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32

        sshL    v20.4s, v20.4s, v14.4s
        LD4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

        rev64   v0.4h, v0.4h
        rev64   v1.4h, v1.4h
        sshL    v22.4s, v22.4s, v14.4s

        rev64   v4.4h, v4.4h
        rev64   v5.4h, v5.4h
        sshL    v18.4s, v30.4s, v14.4s  // v18 = shifted v30

        sshL    v16.4s, v28.4s, v14.4s  // v16 = shifted v28

        SUB     X4, X4, #2              // iterations left for CORE_LOOP

// Steady state: store the previous group, compute the current one, load the next.
CORE_LOOP:
        uMULL   v30.4s, v2.4h, v9.4h
        MOV     v17.16B, v18.16B
        ST2     { v16.4s, v17.4s}, [x2] // interleave v16/v18 into the forward output
        ADD     x2, x2, #32
        uMULL   v28.4s, v4.4h, v9.4h

        uMULL   v26.4s, v2.4h, v8.4h
        MOV     v21.16B, v22.16B
        ST2     { v20.4s, v21.4s}, [x7], x8 // interleave v20/v22 into the backward output
        uMULL   v24.4s, v4.4h, v8.4h

        ushR    v30.4s, v30.4s, #16
        ushR    v28.4s, v28.4s, #16
        ushR    v26.4s, v26.4s, #16
        ushR    v24.4s, v24.4s, #16

        sMLAL   v30.4s, v3.4h, v9.4h
        sMLAL   v28.4s, v5.4h, v9.4h
        sMLAL   v26.4s, v3.4h, v8.4h
        sMLAL   v24.4s, v5.4h, v8.4h

        ADD     v28.4s, v26.4s , v28.4s
        NEG     v28.4s, v28.4s
        SUB     v30.4s, v30.4s , v24.4s

        uMULL   v22.4s, v0.4h, v11.4h
        LD2     {v8.h, v9.h}[0], [x3], x6
        uMULL   v20.4s, v6.4h, v11.4h

        uMULL   v18.4s, v0.4h, v10.4h
        LD2     {v8.h, v9.h}[1], [x3], x6
        uMULL   v16.4s, v6.4h, v10.4h

        ushR    v22.4s, v22.4s, #16
        LD2     {v8.h, v9.h}[2], [x3], x6
        ushR    v20.4s, v20.4s, #16

        ushR    v18.4s, v18.4s, #16
        LD2     {v8.h, v9.h}[3], [x3], x6
        ushR    v16.4s, v16.4s, #16

        sMLAL   v22.4s, v1.4h, v11.4h

        sMLAL   v20.4s, v7.4h, v11.4h

        sMLAL   v18.4s, v1.4h, v10.4h

        sMLAL   v16.4s, v7.4h, v10.4h
        LD4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        ADD     v20.4s, v20.4s , v18.4s

        NEG     v20.4s, v20.4s
        rev64   v10.4h, v8.4h
        rev64   v11.4h, v9.4h

        SUB     v22.4s, v16.4s , v22.4s
        LD4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8
        sshL    v20.4s, v20.4s, v14.4s

        sshL    v22.4s, v22.4s, v14.4s

        rev64   v0.4h, v0.4h
        rev64   v1.4h, v1.4h
        sshL    v18.4s, v30.4s, v14.4s

        rev64   v4.4h, v4.4h
        rev64   v5.4h, v5.4h
        sshL    v16.4s, v28.4s, v14.4s

        SUBS    x4, x4, #1
        BNE     CORE_LOOP

// Drain the pipeline: store the pending group, then compute and store the
// group whose inputs were already loaded by the last loop iteration.
NEON_EPILOGUE:
        uMULL   v30.4s, v2.4h, v9.4h
        MOV     v17.16B, v18.16B
        ST2     { v16.4s, v17.4s}, [x2]
        ADD     x2, x2, #32
        uMULL   v28.4s, v4.4h, v9.4h

        uMULL   v26.4s, v2.4h, v8.4h
        MOV     v21.16B, v22.16B

        ST2     { v20.4s, v21.4s}, [x7], x8
        uMULL   v24.4s, v4.4h, v8.4h

        ushR    v30.4s, v30.4s, #16
        ushR    v28.4s, v28.4s, #16
        ushR    v26.4s, v26.4s, #16
        ushR    v24.4s, v24.4s, #16

        sMLAL   v30.4s, v3.4h, v9.4h
        sMLAL   v28.4s, v5.4h, v9.4h
        sMLAL   v26.4s, v3.4h, v8.4h
        sMLAL   v24.4s, v5.4h, v8.4h

        ADD     v28.4s, v26.4s , v28.4s
        NEG     v28.4s, v28.4s
        SUB     v30.4s, v30.4s , v24.4s

        uMULL   v22.4s, v0.4h, v11.4h
        uMULL   v20.4s, v6.4h, v11.4h
        uMULL   v18.4s, v0.4h, v10.4h
        uMULL   v16.4s, v6.4h, v10.4h

        ushR    v22.4s, v22.4s, #16
        ushR    v20.4s, v20.4s, #16
        ushR    v18.4s, v18.4s, #16
        ushR    v16.4s, v16.4s, #16

        sMLAL   v22.4s, v1.4h, v11.4h
        sMLAL   v20.4s, v7.4h, v11.4h
        sMLAL   v18.4s, v1.4h, v10.4h
        sMLAL   v16.4s, v7.4h, v10.4h

        ADD     v20.4s, v20.4s , v18.4s
        NEG     v20.4s, v20.4s
        SUB     v22.4s, v16.4s , v22.4s

        sshL    v20.4s, v20.4s, v14.4s
        sshL    v22.4s, v22.4s, v14.4s
        sshL    v18.4s, v30.4s, v14.4s
        sshL    v16.4s, v28.4s, v14.4s
        MOV     v17.16B, v18.16B
        ST2     { v16.4s, v17.4s}, [x2]
        ADD     x2, x2, #32
        MOV     v21.16B, v22.16B
        ST2     { v20.4s, v21.4s}, [x7], x8

// Tail: seven words are read from each input instead of eight. The lanes
// that are not loaded are left at zero by the two MOVI instructions below.
RESIDUE_NEON:
        movi    v3.2s, #0x00000000      // v3 lane 1 stays zero
        movi    v4.2s, #0x00000000      // v4 lane 0 stays zero

        LD2     {v21.2s, v22.2s}, [x0], #16 // words 0..3: v21 = words 0,2  v22 = words 1,3
        MOV     v0.8B, v21.8B
        MOV     v2.8B, v22.8B

        LD1     {v1.s}[0], [x0], #4     // word 4
        LD1     {v3.s}[0], [x0], #4     // word 5
        LD1     {v1.s}[1], [x0]         // word 6 (x0 is not advanced)
        MOV     v21.8B, v0.8B

        // Split into low (UZP1) and high (UZP2) halfwords, giving the same
        // register layout that LD4 produces in the main path.
        UZP1    v0.4h, v21.4h, v1.4h
        UZP2    v1.4h, v21.4h, v1.4h
        MOV     v21.8B, v2.8B
        UZP1    v2.4h, v21.4h, v3.4h
        UZP2    v3.4h, v21.4h, v3.4h

        ADD     x1, x1, #4              // skip the first word of the last 32-byte slot

        LD1     {v6.s}[0], [x1], #4
        LD1     {v4.s}[1], [x1], #4
        LD1     {v6.s}[1], [x1], #4

        LD2     {v21.2s, v22.2s}, [x1], #16
        MOV     v5.8B, v21.8B
        MOV     v7.8B, v22.8B

        MOV     v21.8B, v4.8B
        UZP1    v4.4h, v21.4h, v5.4h
        UZP2    v5.4h, v21.4h, v5.4h
        MOV     v21.8B, v6.8B
        UZP1    v6.4h, v21.4h, v7.4h
        UZP2    v7.4h, v21.4h, v7.4h
        rev64   v0.4h, v0.4h
        rev64   v1.4h, v1.4h
        rev64   v4.4h, v4.4h
        rev64   v5.4h, v5.4h

        LD2     {v8.h, v9.h}[0], [x3], x6
        LD2     {v8.h, v9.h}[1], [x3], x6
        LD2     {v8.h, v9.h}[2], [x3], x6
        LD2     {v8.h, v9.h}[3], [x3], x6

        rev64   v10.4h, v8.4h
        rev64   v11.4h, v9.4h

        uMULL   v30.4s, v2.4h, v9.4h
        uMULL   v28.4s, v4.4h, v9.4h
        uMULL   v26.4s, v2.4h, v8.4h
        uMULL   v24.4s, v4.4h, v8.4h

        ushR    v30.4s, v30.4s, #16
        ushR    v28.4s, v28.4s, #16
        ushR    v26.4s, v26.4s, #16
        ushR    v24.4s, v24.4s, #16

        sMLAL   v30.4s, v3.4h, v9.4h
        sMLAL   v28.4s, v5.4h, v9.4h
        sMLAL   v26.4s, v3.4h, v8.4h
        sMLAL   v24.4s, v5.4h, v8.4h

        ADD     v28.4s, v26.4s , v28.4s
        NEG     v28.4s, v28.4s
        SUB     v30.4s, v30.4s , v24.4s

        uMULL   v22.4s, v0.4h, v11.4h
        uMULL   v20.4s, v6.4h, v11.4h
        uMULL   v18.4s, v0.4h, v10.4h
        uMULL   v16.4s, v6.4h, v10.4h

        ushR    v22.4s, v22.4s, #16
        ushR    v20.4s, v20.4s, #16
        ushR    v18.4s, v18.4s, #16
        ushR    v16.4s, v16.4s, #16

        sMLAL   v22.4s, v1.4h, v11.4h
        sMLAL   v20.4s, v7.4h, v11.4h
        sMLAL   v18.4s, v1.4h, v10.4h
        sMLAL   v16.4s, v7.4h, v10.4h

        ADD     v20.4s, v20.4s , v18.4s
        NEG     v20.4s, v20.4s
        SUB     v22.4s, v16.4s , v22.4s

        sshL    v20.4s, v20.4s, v14.4s
        sshL    v22.4s, v22.4s, v14.4s
        sshL    v18.4s, v30.4s, v14.4s
        sshL    v16.4s, v28.4s, v14.4s
        MOV     v21.16B, v22.16B
        ST2     { v20.4s, v21.4s}, [x7] // full 32-byte store on the backward side
        mov     v17.16B, v18.16B
        ST2     {v16.2s, v17.2s}, [x2]  // lanes 0 and 1: 16 bytes
        ADD     x2, x2, #16

        ST2     {v16.s, v17.s}[2], [x2] // lane 2 only: 8 bytes, lane 3 is not stored
        ADD     x2, x2, #8

END1:
        pop_v_regs
        ret
|
|
|
|
|
|
|