// Listing metadata (from the source viewer): ArmAsm, 334 lines, 11 KiB
///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// push_v_regs
//
// Spills registers to the stack with pre-decrementing stores:
//   q8-q15            4 pairs x 32 bytes = 128 bytes
//   X8-X17, X29, X30  6 pairs x 16 bytes =  96 bytes
// Total stack use is 224 bytes. Every adjustment is a multiple of 16, so SP
// keeps whatever 16-byte alignment it had on entry.
//
// This is more conservative than AAPCS64 requires: only the low 64 bits of
// v8-v15 are callee-saved, and X8-X17 are caller-saved scratch registers.
// The matching restore sequence must pop in exactly the reverse order.
.macro push_v_regs
    stp     q8, q9, [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     X8, X9, [sp, #-16]!
    stp     X10, X11, [sp, #-16]!
    stp     X12, X13, [sp, #-16]!
    stp     X14, X15, [sp, #-16]!
    stp     X16, X17, [sp, #-16]!
    stp     X29, X30, [sp, #-16]!
.endm

// pop_v_regs
//
// Reloads X29/X30, X16..X8 and q14..q8 with post-incrementing loads, releasing
// 6 x 16 + 4 x 32 = 224 bytes of stack. The order is last-stored-first-loaded:
// general-purpose pairs first, then the q-register pairs, so it only restores
// correctly when SP is at the value left by a spill sequence that stored
// q8..q15 first and X8..X17, X29/X30 last.
.macro pop_v_regs
    ldp     X29, X30, [sp], #16
    ldp     X16, X17, [sp], #16
    ldp     X14, X15, [sp], #16
    ldp     X12, X13, [sp], #16
    ldp     X10, X11, [sp], #16
    ldp     X8, X9, [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8, q9, [sp], #32
.endm

.text
.global ixheaacd_over_lap_add1_armv8

//------------------------------------------------------------------------------
// ixheaacd_over_lap_add1_armv8
//
// Leaf routine (makes no calls). The C prototype is not visible in this file;
// the argument roles below are derived only from how each register is used.
//
//   X0  base of a 32-bit array ("coef" below); read 4 words per group, walking
//       DOWN from element (2*size - 4)
//   X1  base of a 32-bit array ("prev" below); read 4 words per group, walking UP
//   X2  base of the 16-bit output array
//   X3  base of a 16-bit array ("win" below); read 8 halfwords per group with
//       LD2 (even/odd de-interleave), walking DOWN from element (2*size - 8)
//   W4  shift count, broadcast and applied with SQSHL (register form)
//   W5  size (consumed as a signed 16-bit value)
//   W6  output stride in 16-bit elements (consumed as a signed 16-bit value)
//       NOTE(review): presumably a channel interleave factor - confirm with caller
//
// The names "coef", "prev" and "win" are labels chosen for these comments only.
//
// Output: two streams of 16-bit results, 4 values per group each:
//   - a descending stream starting at out[(size - 1) * stride], step -stride
//   - an ascending  stream starting at out[ size      * stride], step +stride
//
// Per 32-bit lane (c = coef word, we/wo = even/odd win halfword, p = prev word):
//   t    = sat_shl( ((lo16(c) * we) >> 16, unsigned) + hi16(c) * we , W4 )
//   u    = sat32( sext(wo) * p ) + (-0x2000)
//   down = low16( sat(sat(t - u) << 2) >> 16 )
// and the ascending stream uses the saturating negation of c with we/wo swapped.
// 0x2000 is half of 2^14, matching the net right shift of 14 (<<2 then >>16).
//
// The code is software-pipelined: the prologue handles group 0 and preloads
// group 1, every loop iteration finishes two groups, and the tail finishes the
// last two. It therefore always processes 8 + 8 * iterations elements per
// stream, so size is expected to be a multiple of 8 and at least 16; other
// values make it read and write beyond `size` elements.
//
// Registers modified and not restored: X1, X4-X7, V0-V7, V16-V18, V26-V31 and
// the flags. X8-X17, X29/X30 and q8-q15 are saved/restored by the macros.
//------------------------------------------------------------------------------
ixheaacd_over_lap_add1_armv8:
    push_v_regs

    // size and stride are used below both as 64-bit values (address math) and
    // as signed 16-bit SMULL lanes. AAPCS64 leaves the upper register bits of
    // narrow arguments unspecified, so normalise them once here. For any input
    // on which the 16-bit products were already meaningful this is a no-op.
    SXTH    X5, W5
    SXTH    X6, W6

    // X10 -> coef[2*size - 4], X8 -> win[2*size - 8], X12 = -16 byte step.
    LSL     X10, X5, #1
    SUB     X11, X10, #1
    LSL     X10, X11, #2
    ADD     X10, X0, X10
    SUB     X10, X10, #12
    LSL     X8, X11, #1
    ADD     X8, X8, X3
    SUB     X8, X8, #14
    MOV     X12, #-16
    DUP     V11.8H, W4                  // shift count replicated into every halfword
    LD1     {V3.4S}, [X10], X12         // coef group 0
    MOV     W7, #0x2000

    NEG     W7, W7
    SQNEG   V0.4S, V3.4S                // saturating -coef
    DUP     V10.4S, W7                  // V10 = -0x2000 in each 32-bit lane
    // Split -coef into low (V31) / high (V30) halfwords, element order reversed.
    UZP1    V31.8H, V0.8H, V0.8H
    UZP2    V30.8H, V0.8H, V0.8H
    REV64   V31.8H, V31.8H
    REV64   V30.8H, V30.8H
    SUB     X11, X5, #1
    // Split coef into low (V7) / high (V6) halfwords, element order reversed.
    UZP1    V7.8H, V3.8H, V3.8H
    UZP2    V6.8H, V3.8H, V3.8H
    REV64   V7.8H, V7.8H
    REV64   V6.8H, V6.8H
    // X11 = out + 2 * (size - 1) * stride. The scalar product is formed in
    // lane 0 of a 16x16 SMULL; the other lanes are never read.
    MOV     V16.S[0], W6
    MOV     V17.S[0], W11
    SMULL   V17.4S, V16.4H, V17.4H
    MOV     W11, V17.S[0]
    LSL     X11, X11, #1

    LD2     {V2.4H, V3.4H}, [X8], X12   // win group 0: V2 = even, V3 = odd halfwords
    ADD     X11, X11, X2
    REV64   V2.4H, V2.4H
    REV64   V3.4H, V3.4H
    // X4 = -2 * stride bytes (descending stores), X9 = +2 * stride bytes.
    LSL     X4, X6, #1
    NEG     X4, X4
    LSL     X9, X6, #1
    // X6 = out + 2 * size * stride (start of the ascending stream).
    MOV     V16.S[0], W5
    MOV     V17.S[0], W6
    SMULL   V17.4S, V16.4H, V17.4H
    MOV     W6, V17.S[0]
    LSL     W6, W6, #1
    ADD     X6, X6, X2

    // ---- group 0, descending stream ----
    UMULL   V15.4S, V7.4H, V2.4H        // lo16(coef) * win_even, unsigned
    LD1     {V4.4S}, [X1], #16          // prev group 0
    USHR    V15.4S, V15.4S, #16

    SMLAL   V15.4S, V6.4H, V2.4H        // += hi16(coef) * win_even, signed
    SQSHL   V15.4S, V15.4S, V11.4S      // saturating shift by the W4-derived count
    SSHLL   V27.4S, V3.4H, #0           // sign-extend win_odd to 32 bits
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D              // saturate the 64-bit products to 32 bits
    SQXTN2  V28.4S, V29.2D
    MOV     V14.16B, V28.16B

    SQADD   V14.4S, V14.4S, V10.4S      // - 0x2000
    SQSUB   V13.4S, V15.4S, V14.4S
    SQSHL   V13.4S, V13.4S, #2
    SSHR    V13.4S, V13.4S, #16
    UZP1    V26.8H, V13.8H, V13.8H      // keep the low halfword of each lane

    // ---- group 0, ascending stream (uses -coef and swaps win even/odd) ----
    UMULL   V12.4S, V31.4H, V3.4H
    USHR    V12.4S, V12.4S, #16
    SMLAL   V12.4S, V30.4H, V3.4H
    SQSHL   V12.4S, V12.4S, V11.4S
    LD1     {V3.4S}, [X10], X12         // coef group 1 (V3 as win_odd is dead now)

    SSHLL   V27.4S, V2.4H, #0
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D
    SQXTN2  V28.4S, V29.2D
    MOV     V8.16B, V28.16B

    SQADD   V8.4S, V8.4S, V10.4S

    // Prepare group 1 operands: V1/V0 = lo/hi of -coef, V7/V6 = lo/hi of coef.
    SQNEG   V0.4S, V3.4S
    UZP1    V1.8H, V0.8H, V0.8H
    UZP2    V0.8H, V0.8H, V0.8H
    REV64   V1.8H, V1.8H
    REV64   V0.8H, V0.8H
    SQSUB   V9.4S, V12.4S, V8.4S
    UZP1    V7.8H, V3.8H, V3.8H
    UZP2    V6.8H, V3.8H, V3.8H
    REV64   V7.8H, V7.8H
    REV64   V6.8H, V6.8H
    SQSHL   V9.4S, V9.4S, #2
    LD2     {V2.4H, V3.4H}, [X8], X12   // win group 1
    SSHR    V9.4S, V9.4S, #16
    REV64   V2.4H, V2.4H
    REV64   V3.4H, V3.4H
    UZP1    V18.8H, V9.8H, V9.8H

    LD1     {V4.4S}, [X1], #16          // prev group 1
    SUB     W5, W5, #8                  // loop counter = size - 8 (32-bit)


    // Each iteration: store the pending group, compute and store the next one,
    // then compute the one after that and leave it pending in V26 / V18.
    // Stores are interleaved with arithmetic; do not reorder.
LOOP_1:

    ST1     {V26.H}[0], [X11], X4
    UMULL   V15.4S, V7.4H, V2.4H
    ST1     {V26.H}[1], [X11], X4
    UMULL   V12.4S, V1.4H, V3.4H
    ST1     {V26.H}[2], [X11], X4
    USHR    V15.4S, V15.4S, #16
    ST1     {V26.H}[3], [X11], X4
    USHR    V12.4S, V12.4S, #16
    ST1     {V18.H}[0], [X6], X9
    SMLAL   V15.4S, V6.4H, V2.4H
    ST1     {V18.H}[1], [X6], X9
    SMLAL   V12.4S, V0.4H, V3.4H
    ST1     {V18.H}[2], [X6], X9
    SQSHL   V15.4S, V15.4S, V11.4S
    ST1     {V18.H}[3], [X6], X9
    SQSHL   V12.4S, V12.4S, V11.4S
    LD1     {V6.4S}, [X10], X12         // next coef goes to V6: V3 still holds win_odd

    SSHLL   V27.4S, V3.4H, #0
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D
    SQXTN2  V28.4S, V29.2D
    MOV     V14.16B, V28.16B

    SSHLL   V27.4S, V2.4H, #0
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D
    SQXTN2  V28.4S, V29.2D
    MOV     V8.16B, V28.16B

    LD2     {V2.4H, V3.4H}, [X8], X12   // next win

    SQNEG   V0.4S, V6.4S

    LD1     {V4.4S}, [X1], #16          // next prev

    SQADD   V14.4S, V14.4S, V10.4S
    UZP1    V1.8H, V0.8H, V0.8H
    UZP2    V0.8H, V0.8H, V0.8H
    REV64   V1.8H, V1.8H
    REV64   V0.8H, V0.8H
    SQADD   V8.4S, V8.4S, V10.4S
    UZP1    V7.8H, V6.8H, V6.8H
    UZP2    V6.8H, V6.8H, V6.8H
    REV64   V7.8H, V7.8H
    REV64   V6.8H, V6.8H
    SQSUB   V13.4S, V15.4S, V14.4S
    REV64   V2.4H, V2.4H
    REV64   V3.4H, V3.4H
    SQSUB   V9.4S, V12.4S, V8.4S
    SQSHL   V13.4S, V13.4S, #2
    SQSHL   V9.4S, V9.4S, #2
    UMULL   V15.4S, V7.4H, V2.4H
    SSHR    V13.4S, V13.4S, #16
    UZP1    V26.8H, V13.8H, V13.8H
    SSHR    V9.4S, V9.4S, #16
    ST1     {V26.H}[0], [X11], X4
    UMULL   V12.4S, V1.4H, V3.4H
    UZP1    V18.8H, V9.8H, V9.8H
    USHR    V15.4S, V15.4S, #16
    ST1     {V26.H}[1], [X11], X4
    SMLAL   V15.4S, V6.4H, V2.4H
    ST1     {V26.H}[2], [X11], X4
    USHR    V12.4S, V12.4S, #16
    ST1     {V26.H}[3], [X11], X4
    SMLAL   V12.4S, V0.4H, V3.4H
    ST1     {V18.H}[0], [X6], X9
    SQSHL   V15.4S, V15.4S, V11.4S
    ST1     {V18.H}[1], [X6], X9
    SQSHL   V12.4S, V12.4S, V11.4S
    ST1     {V18.H}[2], [X6], X9

    SSHLL   V27.4S, V3.4H, #0
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D
    SQXTN2  V28.4S, V29.2D
    MOV     V14.16B, V28.16B

    ST1     {V18.H}[3], [X6], X9


    SSHLL   V27.4S, V2.4H, #0
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D
    SQXTN2  V28.4S, V29.2D
    MOV     V8.16B, V28.16B

    LD1     {V3.4S}, [X10], X12         // coef for the group after next
    SQADD   V14.4S, V14.4S, V10.4S

    SQNEG   V0.4S, V3.4S
    UZP1    V1.8H, V0.8H, V0.8H
    UZP2    V0.8H, V0.8H, V0.8H
    REV64   V1.8H, V1.8H
    REV64   V0.8H, V0.8H
    SQSUB   V13.4S, V15.4S, V14.4S
    UZP1    V7.8H, V3.8H, V3.8H
    UZP2    V6.8H, V3.8H, V3.8H
    REV64   V7.8H, V7.8H
    REV64   V6.8H, V6.8H
    SQADD   V8.4S, V8.4S, V10.4S
    LD2     {V2.4H, V3.4H}, [X8], X12
    SQSUB   V9.4S, V12.4S, V8.4S
    REV64   V2.4H, V2.4H
    REV64   V3.4H, V3.4H
    SQSHL   V13.4S, V13.4S, #2
    LD1     {V4.4S}, [X1], #16

    SQSHL   V9.4S, V9.4S, #2
    SSHR    V13.4S, V13.4S, #16
    // Counter is kept 32-bit throughout (it was initialised with SUB W5 above).
    // A 64-bit SUBS here would see a wrapped, zero-extended value as a huge
    // positive count whenever size < 8 and never leave the loop.
    SUBS    W5, W5, #8
    SSHR    V9.4S, V9.4S, #16
    UZP1    V26.8H, V13.8H, V13.8H
    UZP1    V18.8H, V9.8H, V9.8H

    BGT     LOOP_1

    // ---- tail: store the pending group, then compute and store the group
    //      whose inputs were loaded by the final loop iteration ----
    ST1     {V26.H}[0], [X11], X4
    UMULL   V15.4S, V7.4H, V2.4H
    ST1     {V26.H}[1], [X11], X4
    UMULL   V12.4S, V1.4H, V3.4H
    ST1     {V26.H}[2], [X11], X4
    USHR    V15.4S, V15.4S, #16
    ST1     {V26.H}[3], [X11], X4
    USHR    V12.4S, V12.4S, #16

    ST1     {V18.H}[0], [X6], X9
    SMLAL   V15.4S, V6.4H, V2.4H
    ST1     {V18.H}[1], [X6], X9
    SMLAL   V12.4S, V0.4H, V3.4H
    ST1     {V18.H}[2], [X6], X9
    SQSHL   V15.4S, V15.4S, V11.4S
    ST1     {V18.H}[3], [X6], X9
    SQSHL   V12.4S, V12.4S, V11.4S


    SSHLL   V27.4S, V3.4H, #0
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D
    SQXTN2  V28.4S, V29.2D
    MOV     V14.16B, V28.16B

    SSHLL   V27.4S, V2.4H, #0
    SMULL   V28.2D, V27.2S, V4.2S
    SMULL2  V29.2D, V27.4S, V4.4S
    SQXTN   V28.2S, V28.2D
    SQXTN2  V28.4S, V29.2D
    MOV     V8.16B, V28.16B

    SQADD   V14.4S, V14.4S, V10.4S
    SQADD   V8.4S, V8.4S, V10.4S
    SQSUB   V13.4S, V15.4S, V14.4S
    SQSUB   V9.4S, V12.4S, V8.4S
    SQSHL   V13.4S, V13.4S, #2
    SQSHL   V9.4S, V9.4S, #2
    SSHR    V13.4S, V13.4S, #16
    SSHR    V9.4S, V9.4S, #16
    UZP1    V26.8H, V13.8H, V13.8H

    UZP1    V18.8H, V9.8H, V9.8H


    ST1     {V26.H}[0], [X11], X4
    ST1     {V26.H}[1], [X11], X4
    ST1     {V26.H}[2], [X11], X4
    ST1     {V26.H}[3], [X11], X4

    ST1     {V18.H}[0], [X6], X9
    ST1     {V18.H}[1], [X6], X9
    ST1     {V18.H}[2], [X6], X9
    ST1     {V18.H}[3], [X6], X9
    pop_v_regs
    RET