aosp12/external/libxaac/decoder/armv8/ixheaacd_imdct_using_fft.s

820 lines
20 KiB
ArmAsm

///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
// push_v_regs
// Prologue helper: stores q8-q15 (full 128-bit views), x8-x17 and the
// x29/x30 pair on the stack with pre-decrement stores. Total stack use is
// 4*32 + 6*16 = 224 bytes, so sp stays 16-byte aligned. pop_v_regs is the
// matching restore sequence.
.macro push_v_regs
    // SIMD registers, 32 bytes per pair
    stp     q8,  q9,  [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    // general-purpose registers, 16 bytes per pair
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    // frame pointer and link register go last (lowest address)
    stp     x29, x30, [sp, #-16]!
.endm
// pop_v_regs
// Epilogue helper: reloads the registers stored by push_v_regs in exactly
// the reverse order, using post-increment loads, and thereby releases the
// 224 bytes of stack that push_v_regs reserved.
.macro pop_v_regs
    // frame pointer and link register were stored last, so they come first
    ldp     x29, x30, [sp], #16
    // general-purpose registers, 16 bytes per pair
    ldp     x16, x17, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
    // SIMD registers, 32 bytes per pair
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8,  q9,  [sp], #32
.endm
// swp reg1, reg2
// Exchanges the contents of two operands by going through x16.
// In this file it is invoked with 64-bit vector lanes (for example
// v17.D[0] and v8.D[0]).
// Clobbers: x16.
.macro swp reg1, reg2
    mov     x16, \reg1              // x16 = first operand
    mov     \reg1, \reg2            // first operand = second operand
    mov     \reg2, x16              // second operand = saved first operand
.endm
.text
.p2align 2
.global ixheaacd_imdct_using_fft_armv8
//------------------------------------------------------------------------------
// ixheaacd_imdct_using_fft_armv8
//
// Register inputs as used by the code in this file:
//   X0 : base address of a table block. Byte-sized index tables are read at
//        offsets 11600, 11856, 11920 and 11936; a table of 16-bit pairs is
//        read starting at offset 8528.
//   X1 : transform length. Only the low 32 bits (W1) are consumed.
//   X2 : input buffer, addressed as pairs of 32-bit words (8 bytes per pair).
//   X3 : output / work buffer, same element layout.
// NOTE(review): the C prototype is not visible in this file - confirm the
// argument types against the caller.
//
// Stack: push_v_regs / pop_v_regs spill and restore q8-q15, x8-x17, x29, x30.
// x29 (and later x30) are used as plain scratch registers inside the routine.
//
// This first part selects, from the length in W1:
//   - X4 : which byte index table the first stage reads,
//   - X8 : how many passes of OUTER_LOOP_R4 follow the first stage,
//   - whether the first stage is the radix-8 or the radix-4 variant.
//
//   W1      X8   index table    first stage
//   0x400   4    X0 + 11600     RADIX_4_FIRST_START
//   0x200   3    X0 + 11856     RADIX_8_FIRST_START
//   0x100   3    X0 + 11856     RADIX_4_FIRST_START
//   0x80    2    X0 + 11920     RADIX_8_FIRST_START
//   0x40    2    X0 + 11920     RADIX_4_FIRST_START
//   other   1    X0 + 11936     RADIX_8_FIRST_START (fall through)
//------------------------------------------------------------------------------
ixheaacd_imdct_using_fft_armv8:
push_v_regs
// The offsets do not fit an ADD immediate, so each one is materialised in
// x29 first. x29 was saved by push_v_regs and is free to use as scratch.
MOV X29, #11600
ADD X4, X0, X29
MOV X29, #11856
ADD X5, X0, X29
MOV X29, #11920
ADD X6, X0, X29
MOV X29, #11936
ADD X7, X0, X29
// The length is compared as a 32-bit value: every later use of the length
// reads W1 only, and under AAPCS64 the upper 32 bits of a register that
// carries a 32-bit argument are unspecified, so a 64-bit compare could miss
// a valid length.
COND_1: CMP W1, #0x400
BNE COND_2
MOv X8, #4
B RADIX_4_FIRST_START
COND_2: CMP W1, #0x200
BNE COND_3
MOv X8, #3
MOv X4, X5
B RADIX_8_FIRST_START
COND_3: CMP W1, #0x100
BNE COND_4
MOv X8, #3
MOv X4, X5
B RADIX_4_FIRST_START
COND_4: CMP W1, #0x80
BNE COND_5
MOv X8, #2
MOv X4, X6
B RADIX_8_FIRST_START
COND_5: CMP W1, #0x40
BNE COND_6
MOv X8, #2
MOv X4, X6
B RADIX_4_FIRST_START
// Any other length: one later pass, smallest index table, and execution
// falls through into the radix-8 first stage that follows.
COND_6:
MOv X8, #1
MOv X4, X7
//------------------------------------------------------------------------------
// First stage, radix-8 variant.
//
// On entry: X1 = length, X2 = input base, X3 = output pointer,
//           X4 = byte index table.
// Each pass of RADIX_8_FIRST_LOOP consumes four index bytes, gathers eight
// input pairs per index (one index per vector lane), combines them with
// adds/subs and a multiply by the constant 0x5a82, shifts the results left
// by 3 and stores 8 * 32 = 256 bytes to X3.
// On exit (branch to RADIX_4_FIRST_ENDS): X1 = length again, X3 = start of
// the output just written, X4 = 32, X5 = 8, X6 = length >> 5.
//------------------------------------------------------------------------------
RADIX_8_FIRST_START:
// W9 = loop trip count (length / 32); W1 = length * 2, used below as the
// byte stride between gathered elements.
LSR W9 , W1, #5
LSL W1, W1, #1
RADIX_8_FIRST_LOOP:
// One gather pointer per vector lane, all starting at the input base.
MOv X5 , X2
MOv X6 , X2
MOv X7 , X2
MOv X11 , X2
// Lane 0: pointer = input base + 8 * index byte. The four single-lane LD2
// loads read the pairs at byte offsets 0, 2*X1, X1 and 3*X1 (in that order)
// and the final SUB brings the pointer back to offset 0.
LDRB W12, [X4]
ADD X5, X5, X12, LSL #3
LD2 {v0.S, v1.S}[0], [X5], X1
ADD X5, X5, X1
LD2 {v4.S, v5.S}[0], [X5], X1
SUB X5, X5, X1, LSL #1
LD2 {v2.S, v3.S}[0], [X5], X1
ADD X5, X5, X1
LD2 {v6.S, v7.S}[0], [X5], X1
SUB X5, X5, X1, LSL #2
// Lane 1: same pattern with the second index byte.
LDRB W12, [X4, #1]
ADD X6, X6, X12, LSL #3
LD2 {v0.S, v1.S}[1], [X6] , X1
ADD X6, X6, X1
LD2 {v4.S, v5.S}[1], [X6] , X1
SUB X6, X6, X1, LSL #1
LD2 {v2.S, v3.S}[1], [X6] , X1
ADD X6, X6, X1
LD2 {v6.S, v7.S}[1], [X6], X1
SUB X6, X6, X1, LSL #2
// Lanes 2 and 3: only the offset-0 and offset-2*X1 pairs are loaded here;
// the remaining two loads per lane are interleaved with the arithmetic below.
LDRB W12, [X4, #2]
ADD X7, X7, X12, LSL #3
LD2 {v0.S, v1.S}[2], [X7] , X1
ADD X7, X7, X1
LD2 {v4.S, v5.S}[2], [X7] , X1
SUB X7, X7, X1, LSL #1
LDRB W12, [X4, #3]
ADD X11, X11, X12, LSL #3
LD2 {v0.S, v1.S}[3], [X11] , X1
ADD X11, X11, X1
LD2 {v4.S, v5.S}[3], [X11] , X1
SUB X11, X11, X1, LSL #1
// Sums/differences of the offset-0 and offset-2*X1 pairs, interleaved with
// the outstanding lane-2/lane-3 loads (offsets X1 and 3*X1).
ADD v8.4S, v0.4S, v4.4S
LD2 {v2.S, v3.S}[2], [X7] , X1
ADD X7, X7, X1
SUB v9.4S, v0.4S, v4.4S
LD2 {v6.S, v7.S}[2], [X7], X1
SUB X7, X7, X1, LSL #2
ADD v0.4S, v1.4S, v5.4S
LD2 {v2.S, v3.S}[3], [X11] , X1
ADD X11, X11, X1
SUB v4.4S, v1.4S, v5.4S
LD2 {v6.S, v7.S}[3], [X11], X1
SUB X11, X11, X1, LSL #2
// Advance the index table by the four bytes just consumed and move every
// gather pointer forward by X1/2 to reach the second group of four pairs.
ADD X4, X4, #4
ADD X5, X5, X1, LSR #1
ADD X6, X6, X1, LSR #1
ADD X7, X7, X1, LSR #1
ADD X11, X11, X1, LSR #1
// Second group: pairs at byte offsets X1/2, 3*X1/2, 5*X1/2 and 7*X1/2 go to
// v14/v15, v10/v11, v12/v13 and v1/v2, interleaved with the remaining
// add/sub steps on the first group.
ADD v1.4S, v2.4S, v6.4S
LD2 {v14.S, v15.S}[0], [X5] , X1
SUB v5.4S, v2.4S, v6.4S
LD2 {v10.S, v11.S}[0], [X5] , X1
ADD v2.4S, v3.4S, v7.4S
LD2 {v12.S, v13.S}[0], [X5] , X1
SUB v6.4S, v3.4S, v7.4S
LD2 {v14.S, v15.S}[1], [X6] , X1
ADD v3.4S, v9.4S, v6.4S
LD2 {v10.S, v11.S}[1], [X6] , X1
SUB v7.4S, v9.4S, v6.4S
LD2 {v12.S, v13.S}[1], [X6] , X1
SUB v6.4S, v4.4S, v5.4S
LD2 {v14.S, v15.S}[2], [X7] , X1
ADD v9.4S, v4.4S, v5.4S
LD2 {v10.S, v11.S}[2], [X7] , X1
ADD v4.4S, v8.4S, v1.4S
LD2 {v12.S, v13.S}[2], [X7] , X1
SUB v5.4S, v8.4S, v1.4S
LD2 {v14.S, v15.S}[3], [X11] , X1
ADD v8.4S, v0.4S, v2.4S
LD2 {v10.S, v11.S}[3], [X11] , X1
SUB v0.4S, v0.4S, v2.4S
LD2 {v12.S, v13.S}[3], [X11] , X1
// v1/v2 are no longer needed as intermediates at this point, so they are
// reused as the destination of the last gathered pair.
LD2 {v1.S, v2.S}[0], [X5], X1
ADD v17.4S, v14.4S, v12.4S
LD2 {v1.S, v2.S}[1], [X6] , X1
SUB v16.4S, v14.4S, v12.4S
LD2 {v1.S, v2.S}[2], [X7] , X1
ADD v14.4S, v15.4S, v13.4S
LD2 {v1.S, v2.S}[3], [X11] , X1
SUB v12.4S, v15.4S, v13.4S
// Add/sub network on the second group.
ADD v15.4S, v10.4S, v1.4S
SUB v13.4S, v10.4S, v1.4S
ADD v10.4S, v11.4S, v2.4S
SUB v1.4S, v11.4S, v2.4S
ADD v11.4S, v17.4S, v15.4S
SUB v2.4S, v17.4S, v15.4S
ADD v17.4S, v14.4S, v10.4S
SUB v15.4S, v14.4S, v10.4S
ADD v14.4S, v16.4S, v12.4S
SUB v10.4S, v16.4S, v12.4S
ADD v16.4S, v13.4S, v1.4S
SUB v12.4S, v13.4S, v1.4S
// v1, v13, v12 and v14 are the four values that get multiplied by the
// constant. UZP1/UZP2 with the same source twice split each 32-bit lane into
// its low 16 bits (UZP1 result) and high 16 bits (UZP2 result):
//   v1 -> v22/v23, v13 -> v26/v27, v12 -> v24/v25, v14 -> v28/v29.
ADD v1.4S , v14.4S, v12.4S
SUB v13.4S, v14.4S, v12.4S
SUB v12.4S, v16.4S, v10.4S
UZP1 v22.8H, v1.8H, v1.8H
UZP2 v23.8H, v1.8H, v1.8H
ADD v14.4S, v16.4S, v10.4S
UZP1 v26.8H, v13.8H, v13.8H
UZP2 v27.8H, v13.8H, v13.8H
ADD v16.4S, v4.4S, v11.4S
UZP1 v24.8H, v12.8H, v12.8H
UZP2 v25.8H, v12.8H, v12.8H
SUB v10.4S, v4.4S, v11.4S
UZP1 v28.8H, v14.8H, v14.8H
UZP2 v29.8H, v14.8H, v14.8H
ADD v4.4S, v8.4S, v17.4S
// 0x5a82 = 23170, i.e. about 2^15 / sqrt(2) (presumably cos(pi/4) in Q15).
MOv W14, #0x5a82
SUB v11.4S, v8.4S, v17.4S
ADD v8.4S, v5.4S, v15.4S
SUB v17.4S, v5.4S, v15.4S
SUB v5.4S, v0.4S, v2.4S
ADD v15.4S, v0.4S, v2.4S
DUP v31.4H, W14
// 32x16 multiply built from 16-bit pieces:
//   result = ((low16 * c) >> 15) + saturating(2 * high16 * c)
// The low half is treated as unsigned (UMULL), the high half as signed
// (SQDMLAL).
UMULL v19.4S, v26.4H, v31.4H
UMULL v18.4S, v28.4H, v31.4H
SSHR v19.4S, v19.4S, #15
SSHR v18.4S, v18.4S, #15
SQDMLAL v19.4S, v27.4H, v31.4H
SQDMLAL v18.4S, v29.4H, v31.4H
UMULL v13.4S, v24.4H, v31.4H
UMULL v14.4S, v22.4H, v31.4H
ADD v20.4S, v3.4S, v19.4S
SUB v21.4S, v3.4S, v19.4S
ADD v30.4S, v6.4S, v18.4S
SUB v6.4S, v6.4S, v18.4S
SSHR v13.4S, v13.4S, #15
SSHR v14.4S, v14.4S, #15
SQDMLAL v13.4S, v25.4H, v31.4H
SQDMLAL v14.4S, v23.4H, v31.4H
ADD v3.4S, v7.4S, v13.4S
SUB v19.4S, v7.4S, v13.4S
ADD v1.4S, v9.4S, v14.4S
SUB v18.4S, v9.4S, v14.4S
// Exchange the full 128-bit contents of v17<->v8 and v4<->v16, one 64-bit
// half at a time (the swp macro goes through x16).
swp v17.D[0], v8.D[0]
swp v17.D[1], v8.D[1]
swp v4.D[0], v16.D[0]
swp v4.D[1], v16.D[1]
// Lane rearrangement (TRN1/TRN2 followed by 64-bit half swaps) so that the
// ST2 stores below can write the results directly, with every result scaled
// by 8 (SHL #3) on the way.
TRN1 v12.4S, v4.4S, v20.4S
TRN2 v22.4S, v4.4S, v20.4S
SHL v12.4S, v12.4S, #3
TRN1 v9.4S, v17.4S, v3.4S
TRN2 v2.4S, v17.4S, v3.4S
SHL v22.4S, v22.4S, #3
SHL v9.4S, v9.4S, #3
TRN1 v24.4S, v10.4S, v21.4S
TRN2 v7.4S, v10.4S, v21.4S
SHL v2.4S, v2.4S, #3
SHL v24.4S, v24.4S, #3
TRN1 v13.4S, v16.4S, v6.4S
TRN2 v23.4S, v16.4S, v6.4S
SHL v7.4S, v7.4S, #3
SHL v13.4S, v13.4S, #3
TRN1 v10.4S, v5.4S, v18.4S
TRN2 v3.4S, v5.4S, v18.4S
SHL v23.4S, v23.4S, #3
SHL v10.4S, v10.4S, #3
TRN1 v26.4S, v8.4S, v19.4S
TRN2 v4.4S, v8.4S, v19.4S
SHL v3.4S, v3.4S, #3
SHL v26.4S, v26.4S, #3
TRN1 v25.4S, v11.4S, v30.4S
TRN2 v8.4S, v11.4S, v30.4S
SHL v4.4S, v4.4S, #3
SHL v25.4S, v25.4S, #3
TRN1 v27.4S, v15.4S, v1.4S
TRN2 v5.4S, v15.4S, v1.4S
SHL v8.4S, v8.4S, #3
SHL v27.4S, v27.4S, #3
swp v9.D[0], v12.D[1]
SHL v5.4S, v5.4S, #3
swp v2.D[0], v22.D[1]
swp v24.D[1], v26.D[0]
swp v7.D[1], v4.D[0]
swp v10.D[0], v13.D[1]
swp v3.D[0], v23.D[1]
swp v27.D[0], v25.D[1]
swp v5.D[0], v8.D[1]
// Eight interleaving stores of 32 bytes each; X3 advances by 32 after every
// store (256 bytes per loop pass).
MOv X15, #32
ST2 {v12.4S, v13.4S}, [X3], X15
ST2 {v24.4S, v25.4S}, [X3], X15
ST2 {v22.4S, v23.4S}, [X3], X15
ST2 {v7.4S, v8.4S}, [X3], X15
ST2 {v9.4S, v10.4S}, [X3], X15
ST2 {v26.4S, v27.4S}, [X3], X15
ST2 {v2.4S, v3.4S}, [X3], X15
ST2 {v4.4S, v5.4S}, [X3], X15
SUBS X9, X9, #1
BNE RADIX_8_FIRST_LOOP
// Undo the doubling of X1, move X3 back by length * 8 bytes (the amount
// written above when the length is a multiple of 32) and set up the
// parameters the radix-4 passes expect: X5 = 8, X4 = 32, X6 = length >> 5.
LSR X1, X1, #1
LSL X15, X1, #3
SUB X3, X3, X15
MOv X5, #8
MOv X4, #32
LSR X15, X1, #5
MOv X6, X15
B RADIX_4_FIRST_ENDS
//------------------------------------------------------------------------------
// First stage, radix-4 variant.
//
// On entry: X1 = length, X2 = input base, X3 = output pointer,
//           X4 = byte index table.
// Each pass of RADIX_4_LOOP consumes four index bytes, gathers four input
// pairs per index (one index per vector lane), combines them with adds/subs,
// shifts the results left by 2 and stores 4 * 32 = 128 bytes to X3.
// On exit (fall through to RADIX_4_FIRST_ENDS): X1 = length again,
// X3 = start of the output just written, X4 = 64, X5 = 4, X6 = length >> 4.
//------------------------------------------------------------------------------
RADIX_8_FIRST_ENDS:
RADIX_4_FIRST_START:
// W9 = loop trip count (length / 16); W1 = length * 2, used below as the
// byte stride between gathered elements.
LSR W9, W1, #4
LSL W1, W1, #1
RADIX_4_LOOP:
// One gather pointer per vector lane, all starting at the input base.
MOv X5 , X2
MOv X6 , X2
MOv X7 , X2
MOv X11 , X2
// Lane 0: pointer = input base + 8 * index byte; the four single-lane LD2
// loads read the pairs at byte offsets 0, 2*X1, X1 and 3*X1 (in that order).
LDRB W12, [X4, #0]
ADD X5, X5, X12, LSL #3
LD2 {v0.S, v1.S}[0], [X5] , X1
ADD X5, X5, X1
LD2 {v8.S, v9.S}[0], [X5] , X1
SUB X5, X5, X1, LSL #1
LD2 {v4.S, v5.S}[0], [X5] , X1
ADD X5, X5, X1
LD2 {v12.S, v13.S}[0], [X5] , X1
// Lane 1: same pattern with the second index byte.
LDRB W12, [X4, #1]
ADD X6, X6, X12, LSL #3
LD2 {v0.S, v1.S}[1], [X6] , X1
ADD X6, X6, X1
LD2 {v8.S, v9.S}[1], [X6] , X1
SUB X6, X6, X1, LSL #1
LD2 {v4.S, v5.S}[1], [X6] , X1
ADD X6, X6, X1
LD2 {v12.S, v13.S}[1], [X6] , X1
// Lanes 2 and 3: the offset-0 and offset-2*X1 pairs are loaded first; the
// other two loads per lane are interleaved with the arithmetic below.
LDRB W12, [X4, #2]
ADD X7, X7, X12, LSL #3
LD2 {v0.S, v1.S}[2], [X7] , X1
ADD X7, X7, X1
LD2 {v8.S, v9.S}[2], [X7] , X1
LDRB W12, [X4, #3]
ADD X11, X11, X12 , LSL #3
LD2 {v0.S, v1.S}[3], [X11] , X1
ADD X11, X11, X1
LD2 {v8.S, v9.S}[3], [X11] , X1
SUB X7, X7, X1, LSL #1
// Sums/differences of the offset-0 and offset-2*X1 pairs (v0/v1 and v8/v9),
// interleaved with the outstanding lane-2/lane-3 loads.
ADD v16.4S, v0.4S, v8.4S
LD2 {v4.S, v5.S}[2], [X7] , X1
ADD X7, X7, X1
ADD v18.4S, v1.4S, v9.4S
LD2 {v12.S, v13.S}[2], [X7] , X1
SUB X11, X11, X1, LSL #1
SUB v20.4S, v0.4S, v8.4S
LD2 {v4.S, v5.S}[3], [X11] , X1
ADD X11, X11, X1
SUB v22.4S, v1.4S, v9.4S
LD2 {v12.S, v13.S}[3], [X11] , X1
// Advance the index table by the four bytes just consumed.
ADD X4, X4, #4
// Sums/differences of the offset-X1 and offset-3*X1 pairs (v4/v5, v12/v13).
ADD v24.4S, v4.4S, v12.4S
ADD v26.4S, v5.4S, v13.4S
SUB v28.4S, v4.4S, v12.4S
SUB v30.4S, v5.4S, v13.4S
// Second add/sub layer producing the four outputs:
//   (v17, v11), (v8, v9), (v19, v15), (v12, v13).
ADD v17.4S, v16.4S, v24.4S
ADD v11.4S, v18.4S, v26.4S
SUB v19.4S, v16.4S, v24.4S
SUB v15.4S, v18.4S, v26.4S
ADD v8.4S, v20.4S, v30.4S
SUB v9.4S, v22.4S, v28.4S
ADD v13.4S, v22.4S, v28.4S
SUB v12.4S, v20.4S, v30.4S
// Lane rearrangement (TRN1/TRN2 followed by 64-bit half swaps through the
// swp macro, which clobbers x16) so that the ST2 stores below can write the
// results directly, with every result scaled by 4 (SHL #2) on the way.
TRN1 v0.4S, v17.4S, v8.4S
TRN2 v8.4S, v17.4S, v8.4S
SHL v0.4S, v0.4S, #2
TRN1 v4.4S, v19.4S, v12.4S
TRN2 v12.4S, v19.4S, v12.4S
SHL v8.4S, v8.4S, #2
SHL v4.4S, v4.4S, #2
TRN1 v1.4S, v11.4S, v9.4S
TRN2 v9.4S, v11.4S, v9.4S
SHL v12.4S, v12.4S, #2
SHL v1.4S, v1.4S, #2
TRN1 v5.4S, v15.4S, v13.4S
TRN2 v13.4S, v15.4S, v13.4S
SHL v9.4S, v9.4S, #2
SHL v5.4S, v5.4S, #2
swp v4.D[0], v0.D[1]
SHL v13.4S, v13.4S, #2
swp v12.D[0], v8.D[1]
swp v5.D[0], v1.D[1]
swp v13.D[0], v9.D[1]
// Four interleaving stores of 32 bytes each; X3 advances by 32 after every
// store (128 bytes per loop pass).
MOv X15, #32
ST2 {v0.4S, v1.4S}, [X3], X15
ST2 {v8.4S, v9.4S}, [X3], X15
ST2 {v4.4S, v5.4S}, [X3], X15
ST2 {v12.4S, v13.4S}, [X3], X15
SUBS W9, W9, #1
BNE RADIX_4_LOOP
// Undo the doubling of X1, move X3 back by length * 8 bytes (the amount
// written above when the length is a multiple of 16) and set up the
// parameters the radix-4 passes expect: X5 = 4, X4 = 64, X6 = length >> 4.
LSR X1, X1, #1
SUB X3, X3, X1, LSL #3
MOv X5, #4
MOv X4, #64
LSR X6, X1, #4
//------------------------------------------------------------------------------
// Remaining passes: X8 in-place radix-4 passes over the buffer at X3.
//
// On entry: X0 = table block base, X1 = length, X3 = buffer base,
//           X4 = 32 or 64, X5 = 8 or 4, X6 = length >> 5 or length >> 4
//           (set by whichever first stage ran), X8 = number of passes.
// Register roles inside the loops:
//   x30 : buffer base (the link register is used as scratch; its saved
//         value is reloaded by pop_v_regs before RET)
//   X0  : base of the 16-bit coefficient table (table block + 8528)
//   X14 : running data pointer      X12 : byte stride between the four legs
//   X7  : MIDDLE_LOOP_R4 counter    X10 : INNER_LOOP_R4 counter
//   X2, X11 : coefficient byte offsets   X9, X10 : coefficient pointers
//   x29 : 4 * X12                   W3, x17 : scratch
// After each pass: X4 >>= 2, X5 <<= 2, X6 >>= 2.
//------------------------------------------------------------------------------
RADIX_4_FIRST_ENDS:
MOv x30, X3
// X5 arrives as 8 or 4; the loops below work with X5 / 4.
LSR X5, X5, #2
// 8528 does not fit an ADD immediate in one instruction, hence the MOV.
MOV X14, #8528
ADD X0, X0, X14
OUTER_LOOP_R4:
// Per-pass setup: restart at the buffer base and at coefficient offset 0.
// X12 = X5 * 32 bytes is the distance between the four legs of one butterfly.
MOv X14, x30
MOv X7, X5
MOv X2, #0
MOv X9, X0
LSL X12, X5, #5
MIDDLE_LOOP_R4:
// Load three 16-bit coefficient pairs per vector lane (presumably twiddle
// factors as Q15 cos/sin pairs - not verifiable from this file):
//   v20/v21, v22/v23, v24/v25 <- pairs at X0 + o, X0 + 2*o, X0 + 3*o
// where o is the lane's byte offset. o starts at X2 for lane 0 and grows by
// 4*X4 from one lane to the next; X2 ends up holding the offset for lane 0
// of the next MIDDLE_LOOP_R4 pass and X9 = X0 + X2.
LD2 {v20.H, v21.H}[0], [X9], X2
LD2 {v22.H, v23.H}[0], [X9], X2
ADD X11, X2, X4, LSL #2
LD2 {v24.H, v25.H}[0], [X9]
ADD X10, X0, X11
LD2 {v20.H, v21.H}[1], [X10], X11
LD2 {v22.H, v23.H}[1], [X10], X11
ADD X2, X11, X4, LSL #2
LD2 {v24.H, v25.H}[1], [X10]
ADD X9, X0, X2
LD2 {v20.H, v21.H}[2], [X9], X2
LD2 {v22.H, v23.H}[2], [X9], X2
ADD X11, X2, X4, LSL #2
LD2 {v24.H, v25.H}[2], [X9]
ADD X10, X0, X11
LD2 {v20.H, v21.H}[3], [X10], X11
LD2 {v22.H, v23.H}[3], [X10], X11
ADD X2, X11, X4, LSL #2
LD2 {v24.H, v25.H}[3], [X10]
ADD X9, X0, X2
// X10 is reused as the inner loop counter from here on.
MOv X10, X6
INNER_LOOP_R4:
// Leg 0: 32 bytes de-interleaved into v30 (even words) and v31 (odd words),
// then halved. Legs 1-3 are loaded with LD4 on halfwords, which on a
// little-endian target yields, per leg:
//   reg 0 = low 16 bits of even words   reg 1 = high 16 bits of even words
//   reg 2 = low 16 bits of odd words    reg 3 = high 16 bits of odd words
// Each leg load advances X14 by X12.
LD2 {v30.4S, v31.4S}, [X14], X12
SSHR v30.4S, v30.4S, #1
LD4 {v16.4H, v17.4H, v18.4H, v19.4H}, [X14], X12
SSHR v31.4S, v31.4S, #1
// Low halves are shifted right by one as unsigned values before the signed
// multiplies, so the low-half products are computed on 15-bit magnitudes.
USHR v16.4H, v16.4H, #1
LD4 {v26.4H, v27.4H, v28.4H, v29.4H}, [X14], X12
USHR v18.4H, v18.4H, #1
// Multiply of leg 1 by (v20, v21), built from 16-bit pieces:
//   v11 = ((lo_e*v20 - lo_o*v21) >> 15) + hi_e*v20 - hi_o*v21
//   v12 = ((lo_e*v21 + lo_o*v20) >> 15) + hi_e*v21 + hi_o*v20
// (lo/hi = low/high halves, e/o = even/odd words). Legs 2 and 3 follow the
// same pattern with (v22, v23) -> v13/v14 and (v24, v25) -> v15/v5.
SMULL v11.4S, v16.4H, v20.4H
SMLSL v11.4S, v18.4H, v21.4H
LD4 {v0.4H, v1.4H, v2.4H, v3.4H}, [X14], X12
SMULL v12.4S, v16.4H, v21.4H
SMLAL v12.4S, v18.4H, v20.4H
USHR v26.4H, v26.4H, #1
USHR v28.4H, v28.4H, #1
// x29 = 4 * X12; X14 is moved back to leg 0 of the current butterfly.
LSL x29, X12, #2
SUB X14, X14, X12, LSL #2
USHR v0.4H, v0.4H, #1
USHR v2.4H, v2.4H, #1
SMULL v13.4S, v26.4H, v22.4H
SMLSL v13.4S, v28.4H, v23.4H
SSHR v11.4S, v11.4S, #15
SMULL v14.4S, v26.4H, v23.4H
SMLAL v14.4S, v28.4H, v22.4H
SMULL v15.4S, v0.4H, v24.4H
SMLSL v15.4S, v2.4H, v25.4H
SMLAL v11.4S, v17.4H, v20.4H
SMLSL v11.4S, v19.4H, v21.4H
SSHR v12.4S, v12.4S, #15
SSHR v13.4S, v13.4S, #15
SSHR v14.4S, v14.4S, #15
SSHR v15.4S, v15.4S, #15
SMLAL v12.4S, v17.4H, v21.4H
SMLAL v12.4S, v19.4H, v20.4H
SMULL v5.4S, v0.4H, v25.4H
SMLAL v5.4S, v2.4H, v24.4H
SMLAL v13.4S, v27.4H, v22.4H
SMLSL v13.4S, v29.4H, v23.4H
SMLAL v14.4S, v27.4H, v23.4H
SMLAL v14.4S, v29.4H, v22.4H
SMLAL v15.4S, v1.4H, v24.4H
SMLSL v15.4S, v3.4H, v25.4H
SSHR v5.4S, v5.4S, #15
SMLAL v5.4S, v1.4H, v25.4H
SMLAL v5.4S, v3.4H, v24.4H
// Only while X7 == X5 (the first MIDDLE_LOOP_R4 pass of each outer pass,
// where lane 0's coefficient offset is 0): lane 0 of the six products is
// replaced by the unmultiplied input word shifted right by one.
// Presumably this avoids multiplying by a coefficient of exactly 1.0, which
// a signed 16-bit value cannot hold - TODO confirm.
SUBS x17, X7, X5
BNE BYPASS_IF
// Even words of legs 1, 2, 3 -> lane 0 of v11, v13, v15.
ADD X14, X14, X12
LDR W3, [X14]
ADD X14, X14, X12
ASR W3, W3, #1
MOv v11.S[0], W3
LDR W3, [X14]
ADD X14, X14, X12
ASR W3, W3, #1
MOv v13.S[0], W3
LDR W3, [X14]
ASR W3, W3, #1
MOv v15.S[0], W3
// Back to leg 1, then +4 bytes to reach the odd word of each leg:
// odd words of legs 1, 2, 3 -> lane 0 of v12, v14, v5.
SUB X14, X14, X12, LSL #1
ADD X14, X14, #4
LDR W3, [X14]
ADD X14, X14, X12
ASR W3, W3, #1
MOv v12.S[0], W3
LDR W3, [X14]
ADD X14, X14, X12
ASR W3, W3, #1
MOv v14.S[0], W3
LDR W3, [X14]
ADD X14, X14, X12
ASR W3, W3, #1
MOv v5.S[0], W3
// Undo the +4 and the four X12 steps so X14 points at leg 0 again.
SUB X14, X14, #4
SUB X14, X14, x29
BYPASS_IF:
// Radix-4 add/sub network. With a = (v30, v31) and the multiplied legs
// b = (v11, v12), c = (v13, v14), d = (v15, v5):
//   out0 = (a + c) + (b + d)                     -> v10, v11
//   out1 = (a - c) + (b - d) with halves crossed -> v12, v13
//   out2 = (a + c) - (b + d)                     -> v6,  v7
//   out3 = (a - c) - (b - d) with halves crossed -> v8,  v9
// The four results are written back over the four legs (stride X12), which
// leaves X14 at leg 0 of the next butterfly.
ADD v6.4S, v30.4S, v13.4S
ADD v7.4S, v31.4S, v14.4S
SUB v30.4S, v30.4S, v13.4S
SUB v31.4S, v31.4S, v14.4S
ADD v8.4S, v11.4S, v15.4S
ADD v9.4S, v12.4S, v5.4S
SUB v15.4S, v11.4S, v15.4S
SUB v14.4S, v12.4S, v5.4S
ADD v10.4S, v6.4S, v8.4S
ADD v11.4S, v7.4S, v9.4S
ADD v12.4S, v30.4S, v14.4S
SUB v13.4S, v31.4S, v15.4S
SUB v6.4S, v6.4S, v8.4S
ST2 {v10.4S, v11.4S}, [X14], X12
SUB v7.4S, v7.4S, v9.4S
SUB v8.4S, v30.4S, v14.4S
ST2 {v12.4S, v13.4S}, [X14], X12
ADD v9.4S, v31.4S, v15.4S
ST2 {v6.4S, v7.4S}, [X14], X12
ST2 {v8.4S, v9.4S}, [X14], X12
SUBS X10, X10, #1
BNE INNER_LOOP_R4
// Step X14 back by length * 8 bytes and forward by 32 bytes, i.e. to the
// next group of four element pairs, for the next MIDDLE_LOOP_R4 pass.
SUB X14, X14, X1, LSL #3
ADD X14, X14, #32
SUBS X7, X7, #1
BNE MIDDLE_LOOP_R4
// Next pass: coefficient step / 4, leg spacing * 4, inner trip count / 4.
LSR X4, X4, #2
LSL X5, X5, #2
LSR X6, X6, #2
SUBS X8, X8, #1
BNE OUTER_LOOP_R4
END_LOOPS:
// Restore the registers saved by push_v_regs (including x29/x30) and return.
pop_v_regs
RET