// aosp12/external/libxaac/decoder/armv8/ixheaacd_overlap_add1.s
//
// 334 lines
// 11 KiB
// ArmAsm

///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
// push_v_regs: open a 224-byte stack frame and spill q8-q15, x8-x17 and
// x29/x30 into it. sp is lowered once and stays 16-byte aligned.
// Frame layout, as offsets from the new sp (pop_v_regs reads the same slots):
//   [sp, #0]   x29, x30      [sp, #96]   q14, q15
//   [sp, #16]  x16, x17      [sp, #128]  q12, q13
//   [sp, #32]  x14, x15      [sp, #160]  q10, q11
//   [sp, #48]  x12, x13      [sp, #192]  q8,  q9
//   [sp, #64]  x10, x11
//   [sp, #80]  x8,  x9
.macro push_v_regs
    sub     sp, sp, #224
    stp     x29, x30, [sp]
    stp     x16, x17, [sp, #16]
    stp     x14, x15, [sp, #32]
    stp     x12, x13, [sp, #48]
    stp     x10, x11, [sp, #64]
    stp     x8, x9, [sp, #80]
    stp     q14, q15, [sp, #96]
    stp     q12, q13, [sp, #128]
    stp     q10, q11, [sp, #160]
    stp     q8, q9, [sp, #192]
.endm
// pop_v_regs: reload x29/x30, x8-x17 and q8-q15 from the 224-byte frame
// written by push_v_regs, then release the frame with a single sp adjustment.
// Slot offsets (from the current sp):
//   #0 x29/x30, #16 x16/x17, #32 x14/x15, #48 x12/x13, #64 x10/x11,
//   #80 x8/x9, #96 q14/q15, #128 q12/q13, #160 q10/q11, #192 q8/q9
.macro pop_v_regs
    ldp     q8, q9, [sp, #192]
    ldp     q10, q11, [sp, #160]
    ldp     q12, q13, [sp, #128]
    ldp     q14, q15, [sp, #96]
    ldp     x8, x9, [sp, #80]
    ldp     x10, x11, [sp, #64]
    ldp     x12, x13, [sp, #48]
    ldp     x14, x15, [sp, #32]
    ldp     x16, x17, [sp, #16]
    ldp     x29, x30, [sp]
    add     sp, sp, #224
.endm
//------------------------------------------------------------------------------
// ixheaacd_over_lap_add1_armv8
//
// Register arguments as used by the code below:
//   X0 = coef    base of 32-bit words, read backwards from coef[2*size-1]
//   X1 = prev    base of 32-bit words, read forwards from prev[0]
//   X2 = out     base of 16-bit results
//   X3 = window  base of 16-bit words, read backwards from window[2*size-1]
//   W4 = q_shift per-lane shift count for SQSHL (register form)
//   W5 = size    element count
//   W6 = ch_fac  stride between consecutive results in out, in elements
// NOTE(review): presumably the C prototype passes q_shift, size and ch_fac as
// 16-bit integers - confirm against the declaration in the decoder sources.
//
// For every index i, with c = coef[2*size-1-i], p = prev[i],
// w1 = window[2*size-1-2*i], w2 = window[2*size-2-2*i]:
//   mul(x, w) = hi16(x)*w + ((lo16(x)*w) >> 16)
//               (low product unsigned with a logical shift, high product signed)
//   rnd(p, w) = sat32(sat32(p*w) + (-0x2000))
//   fin(a, b) = (sat32(sat32(a - b) << 2)) >> 16, low 16 bits stored
//   out[ch_fac*(size-1-i)] = fin(sqshl(mul( c, w2), q_shift), rnd(p, w1))
//   out[ch_fac*(size+i)]   = fin(sqshl(mul(-c, w1), q_shift), rnd(p, w2))
//
// The code is software pipelined in groups of four indices: the prologue
// computes one group, each LOOP_1 pass stores/computes two groups, and the
// epilogue stores one group, computes the last and stores it. The loop body
// always runs at least once, so at least 16 indices are processed.
// NOTE(review): this implies size must be a multiple of 8 and >= 16 - verify
// against the callers.
//
// Clobbers: X4-X7 and V0-V7, V16-V18, V26-V31 (X8-X17, X29, X30 and Q8-Q15
// are saved and restored by push_v_regs / pop_v_regs).
//------------------------------------------------------------------------------
.text
.global ixheaacd_over_lap_add1_armv8
ixheaacd_over_lap_add1_armv8:
    push_v_regs

    // size and ch_fac feed 64-bit address arithmetic below. Bits above the
    // declared width of a narrow integer argument are unspecified under
    // AAPCS64, so extend them here before any X-register use. The SMULL .4H
    // products further down already treat both as signed 16-bit values.
    SXTH X5, W5
    SXTH X6, W6

    // ---- pointer setup -------------------------------------------------
    LSL X10, X5, #1                     // 2*size
    SUB X11, X10, #1                    // 2*size - 1
    LSL X10, X11, #2                    // byte offset of coef[2*size-1]
    ADD X10, X0, X10
    SUB X10, X10, #12                   // X10 = &coef[2*size-4] (4 words per load)
    LSL X8, X11, #1                     // byte offset of window[2*size-1]
    ADD X8, X8, X3
    SUB X8, X8, #14                     // X8 = &window[2*size-8] (8 halfwords per load)
    MOV X12, #-16                       // coef/window pointers step back 16 bytes
    DUP V11.8H, W4                      // q_shift in every halfword; SQSHL reads each lane's low byte
    LD1 {V3.4S}, [X10], X12             // coef group 0
    MOV W7, #0x2000
    NEG W7, W7                          // rounding constant -0x2000
    SQNEG V0.4S, V3.4S                  // -coef (saturating)
    DUP V10.4S, W7                      // V10 = rounding constant per lane
    UZP1 V31.8H, V0.8H, V0.8H           // low halves of -coef
    UZP2 V30.8H, V0.8H, V0.8H           // high halves of -coef
    REV64 V31.8h, V31.8h                // reverse so lane i holds index i
    REV64 V30.8h, V30.8h
    SUB X11, X5, #1                     // size - 1
    UZP1 V7.8H, V3.8H, V3.8H            // low halves of coef
    UZP2 V6.8H, V3.8H, V3.8H            // high halves of coef
    REV64 V7.8H, V7.8H
    REV64 V6.8H, V6.8H
    MOV V16.S[0], W6
    MOV V17.S[0], W11
    SMULL V17.4S, V16.4H, V17.4H        // lane 0 = ch_fac * (size-1), 16x16 signed
    MOV W11, V17.S[0]
    LSL X11, X11, #1                    // to bytes
    LD2 {V2.4H, V3.4H}, [X8], X12       // V2 = even window entries, V3 = odd
    ADD X11, X11, X2                    // X11 = &out[ch_fac*(size-1)], walks down
    REV64 V2.4H, V2.4H                  // lane i = w2 for index i
    REV64 V3.4H, V3.4H                  // lane i = w1 for index i
    LSL X4, X6, #1
    NEG X4, X4                          // X4 = -2*ch_fac bytes (descending stride)
    LSL X9, X6, #1                      // X9 = +2*ch_fac bytes (ascending stride)
    MOV V16.S[0], W5
    MOV V17.S[0], W6
    SMULL V17.4S, V16.4H, V17.4H        // lane 0 = size * ch_fac
    MOV W6, V17.S[0]
    LSL W6, W6, #1                      // to bytes
    ADD X6, X6, X2                      // X6 = &out[ch_fac*size], walks up

    // ---- prologue: compute group 0 ------------------------------------
    UMULL V15.4S, V7.4H, V2.4H          // lo16(c) * w2
    LD1 {V4.4S}, [X1], #16              // prev group 0
    USHR V15.4S, V15.4S, #16
    SMLAL V15.4S, V6.4H, V2.4H          // + hi16(c) * w2
    SQSHL V15.4S, V15.4S, V11.4S        // apply q_shift
    SSHLL V27.4S, V3.4H, #0             // widen w1 to 32 bits
    SMULL V28.2D, V27.2S, V4.2S         // prev * w1 (64-bit products)
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D                // saturate products to 32 bits
    SQXTN2 V28.4S, V29.2D
    MOV V14.16B, V28.16B
    SQADD V14.4S, V14.4S, V10.4S        // + rounding constant
    SQSUB V13.4S, V15.4S, V14.4S
    SQSHL V13.4S, V13.4S, #2
    SSHR V13.4S, V13.4S, #16
    UZP1 V26.8H, V13.8H, V13.8H         // V26 = packed results for the descending half
    UMULL V12.4S, V31.4H, V3.4H         // lo16(-c) * w1
    USHR V12.4S, V12.4S, #16
    SMLAL V12.4S, V30.4H, V3.4H         // + hi16(-c) * w1
    SQSHL V12.4S, V12.4S, V11.4S
    LD1 {V3.4S}, [X10], X12             // coef group 1 (w1 in V3 is no longer needed)
    SSHLL V27.4S, V2.4H, #0             // widen w2
    SMULL V28.2D, V27.2S, V4.2S         // prev * w2
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D
    SQXTN2 V28.4S, V29.2D
    MOV V8.16B, V28.16B
    SQADD V8.4S, V8.4S, V10.4S
    SQNEG V0.4S, V3.4S                  // prepare -coef halves for group 1
    UZP1 V1.8H, V0.8H, V0.8H
    UZP2 V0.8H, V0.8H, V0.8H
    REV64 V1.8h, V1.8h
    REV64 V0.8h, V0.8h
    SQSUB V9.4S, V12.4S, V8.4S
    UZP1 V7.8H, V3.8H, V3.8H            // coef halves for group 1
    UZP2 V6.8H, V3.8H, V3.8H
    REV64 V7.8h, V7.8h
    REV64 V6.8h, V6.8h
    SQSHL V9.4S, V9.4S, #2
    LD2 {V2.4H, V3.4H}, [X8], X12       // window for group 1
    SSHR V9.4S, V9.4S, #16
    REV64 V2.4H, V2.4H
    REV64 V3.4H, V3.4H
    UZP1 V18.8H, V9.8H, V9.8H           // V18 = packed results for the ascending half
    LD1 {V4.4S}, [X1], #16              // prev for group 1
    SUB W5, W5, #8                      // loop counter = size - 8 (W write zero-extends X5)

    // ---- main loop: two groups per pass -------------------------------
    // On entry V26/V18 hold the finished results of the previous group and
    // V0/V1/V6/V7, V2/V3, V4 hold the inputs of the next one.
LOOP_1:
    ST1 {V26.H}[0], [X11], X4           // store previous group, descending half
    UMULL V15.4S, V7.4H, V2.4H
    ST1 {V26.H}[1], [X11], X4
    UMULL V12.4S, V1.4H, V3.4H
    ST1 {V26.H}[2], [X11], X4
    USHR V15.4S, V15.4S, #16
    ST1 {V26.H}[3], [X11], X4
    USHR V12.4S, V12.4S, #16
    ST1 {V18.H}[0], [X6], X9            // store previous group, ascending half
    SMLAL V15.4S, V6.4H, V2.4H
    ST1 {V18.H}[1], [X6], X9
    SMLAL V12.4S, V0.4H, V3.4H
    ST1 {V18.H}[2], [X6], X9
    SQSHL V15.4S, V15.4S, V11.4S
    ST1 {V18.H}[3], [X6], X9
    SQSHL V12.4S, V12.4S, V11.4S
    LD1 {V6.4S}, [X10], X12             // coef for the following group
    SSHLL V27.4S, V3.4H, #0             // prev * w1
    SMULL V28.2D, V27.2S, V4.2S
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D
    SQXTN2 V28.4S, V29.2D
    MOV V14.16B, V28.16B
    SSHLL V27.4S, V2.4H, #0             // prev * w2
    SMULL V28.2D, V27.2S, V4.2S
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D
    SQXTN2 V28.4S, V29.2D
    MOV V8.16B, V28.16B
    LD2 {V2.4H, V3.4H}, [X8], X12       // window for the following group
    SQNEG V0.4S, V6.4S
    LD1 {V4.4S}, [X1], #16              // prev for the following group
    SQADD V14.4S, V14.4S, V10.4S
    UZP1 V1.8H, V0.8H, V0.8H
    UZP2 V0.8H, V0.8H, V0.8H
    REV64 V1.8h, V1.8h
    REV64 V0.8h, V0.8h
    SQADD V8.4S, V8.4S, V10.4S
    UZP1 V7.8H, V6.8H, V6.8H
    UZP2 V6.8H, V6.8H, V6.8H
    REV64 V7.8h, V7.8h
    REV64 V6.8h, V6.8h
    SQSUB V13.4S, V15.4S, V14.4S
    REV64 V2.4H, V2.4H
    REV64 V3.4H, V3.4H
    SQSUB V9.4S, V12.4S, V8.4S
    SQSHL V13.4S, V13.4S, #2
    SQSHL V9.4S, V9.4S, #2
    UMULL V15.4S, V7.4H, V2.4H          // second group of this pass starts here
    SSHR V13.4S, V13.4S, #16
    UZP1 V26.8H, V13.8H, V13.8H
    SSHR V9.4S, V9.4S, #16
    ST1 {V26.H}[0], [X11], X4           // store first group of this pass
    UMULL V12.4S, V1.4H, V3.4H
    UZP1 V18.8H, V9.8H, V9.8H
    USHR V15.4S, V15.4S, #16
    ST1 {V26.H}[1], [X11], X4
    SMLAL V15.4S, V6.4H, V2.4H
    ST1 {V26.H}[2], [X11], X4
    USHR V12.4S, V12.4S, #16
    ST1 {V26.H}[3], [X11], X4
    SMLAL V12.4S, V0.4H, V3.4H
    ST1 {V18.H}[0], [X6], X9
    SQSHL V15.4S, V15.4S, V11.4S
    ST1 {V18.H}[1], [X6], X9
    SQSHL V12.4S, V12.4S, V11.4S
    ST1 {V18.H}[2], [X6], X9
    SSHLL V27.4S, V3.4H, #0
    SMULL V28.2D, V27.2S, V4.2S
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D
    SQXTN2 V28.4S, V29.2D
    MOV V14.16B, V28.16B
    ST1 {V18.H}[3], [X6], X9
    SSHLL V27.4S, V2.4H, #0
    SMULL V28.2D, V27.2S, V4.2S
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D
    SQXTN2 V28.4S, V29.2D
    MOV V8.16B, V28.16B
    LD1 {V3.4S}, [X10], X12             // coef for the next pass / epilogue
    SQADD V14.4S, V14.4S, V10.4S
    SQNEG V0.4S, V3.4S
    UZP1 V1.8H, V0.8H, V0.8H
    UZP2 V0.8H, V0.8H, V0.8H
    REV64 V1.8H, V1.8H
    REV64 V0.8H, V0.8H
    SQSUB V13.4S, V15.4S, V14.4S
    UZP1 V7.8H, V3.8H, V3.8H
    UZP2 V6.8H, V3.8H, V3.8H
    REV64 V7.8H, V7.8H
    REV64 V6.8H, V6.8H
    SQADD V8.4S, V8.4S, V10.4S
    LD2 {V2.4H, V3.4H}, [X8], X12       // window for the next pass / epilogue
    SQSUB V9.4S, V12.4S, V8.4S
    REV64 V2.4H, V2.4H
    REV64 V3.4H, V3.4H
    SQSHL V13.4S, V13.4S, #2
    LD1 {V4.4S}, [X1], #16              // prev for the next pass / epilogue
    SQSHL V9.4S, V9.4S, #2
    SSHR V13.4S, V13.4S, #16
    SUBS X5, X5, #8                     // eight indices consumed per pass
    SSHR V9.4S, V9.4S, #16
    UZP1 V26.8H, V13.8H, V13.8H
    UZP1 V18.8H, V9.8H, V9.8H
    BGT LOOP_1

    // ---- epilogue: store pending group, compute and store the last ----
    ST1 {V26.H}[0], [X11], X4
    UMULL V15.4S, V7.4H, V2.4H
    ST1 {V26.H}[1], [X11], X4
    UMULL V12.4s, V1.4H, V3.4H
    ST1 {V26.H}[2], [X11], X4
    USHR V15.4S, V15.4S, #16
    ST1 {V26.H}[3], [X11], X4
    USHR V12.4S, V12.4S, #16
    ST1 {V18.H}[0], [X6], X9
    SMLAL V15.4S, V6.4H, V2.4H
    ST1 {V18.H}[1], [X6], X9
    SMLAL V12.4S, V0.4H, V3.4H
    ST1 {V18.H}[2], [X6], X9
    SQSHL V15.4S, V15.4S, V11.4S
    ST1 {V18.H}[3], [X6], X9
    SQSHL V12.4S, V12.4S, V11.4S
    SSHLL V27.4S, V3.4H, #0
    SMULL V28.2D, V27.2S, V4.2S
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D
    SQXTN2 V28.4S, V29.2D
    MOV V14.16B, V28.16B
    SSHLL V27.4S, V2.4H, #0
    SMULL V28.2D, V27.2S, V4.2S
    SMULL2 V29.2D, V27.4S, V4.4S
    SQXTN V28.2S, V28.2D
    SQXTN2 V28.4S, V29.2D
    MOV V8.16B, V28.16B
    SQADD V14.4S, V14.4S, V10.4S
    SQADD V8.4S, V8.4S, V10.4S
    SQSUB V13.4S, V15.4S, V14.4S
    SQSUB V9.4S, V12.4S, V8.4S
    SQSHL V13.4S, V13.4S, #2
    SQSHL V9.4S, V9.4S, #2
    SSHR V13.4S, V13.4S, #16
    SSHR V9.4S, V9.4S, #16
    UZP1 V26.8H, V13.8H, V13.8H
    UZP1 V18.8H, V9.8H, V9.8H
    ST1 {V26.H}[0], [X11], X4
    ST1 {V26.H}[1], [X11], X4
    ST1 {V26.H}[2], [X11], X4
    ST1 {V26.H}[3], [X11], X4
    ST1 {V18.H}[0], [X6], X9
    ST1 {V18.H}[1], [X6], X9
    ST1 {V18.H}[2], [X6], X9
    ST1 {V18.H}[3], [X6], X9
    pop_v_regs
    RET