820 lines
20 KiB
ArmAsm
820 lines
20 KiB
ArmAsm
///******************************************************************************
|
|
// *
|
|
// * Copyright (C) 2018 The Android Open Source Project
|
|
// *
|
|
// * Licensed under the Apache License, Version 2.0 (the "License");
|
|
// * you may not use this file except in compliance with the License.
|
|
// * You may obtain a copy of the License at:
|
|
// *
|
|
// * http://www.apache.org/licenses/LICENSE-2.0
|
|
// *
|
|
// * Unless required by applicable law or agreed to in writing, software
|
|
// * distributed under the License is distributed on an "AS IS" BASIS,
|
|
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// * See the License for the specific language governing permissions and
|
|
// * limitations under the License.
|
|
// *
|
|
// *****************************************************************************
|
|
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
//*/
|
|
|
|
|
|
// push_v_regs: spill the registers this file's routine overwrites.
// Pushes 4 x 32 bytes (q8-q15) followed by 6 x 16 bytes (x8-x17, x29, x30),
// i.e. 224 bytes in total, so sp stays 16-byte aligned after every store.
// pop_v_regs restores the same registers in the exact reverse order.
.macro push_v_regs
    stp     q8,  q9,  [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    stp     x29, x30, [sp, #-16]!       // x29/x30 are used as scratch by the routine
.endm
|
|
// pop_v_regs: reload everything push_v_regs stored, last-pushed first.
// Releases 6 x 16 bytes (x29/x30, x16-x17 ... x8-x9) and then 4 x 32 bytes
// (q14/q15 ... q8/q9), leaving sp where it was before push_v_regs ran.
.macro pop_v_regs
    ldp     x29, x30, [sp], #16
    ldp     x16, x17, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8,  q9,  [sp], #32
.endm
|
|
|
|
// swp reg1, reg2: exchange the contents of two operands by way of x16.
// In this file both operands are always 64-bit vector elements (vN.d[i]).
// Clobbers: x16 (holds the old value of reg1 on exit).
.macro swp reg1, reg2
    mov     x16, \reg1                  // x16   = old reg1
    mov     \reg1, \reg2                // reg1  = reg2
    mov     \reg2, x16                  // reg2  = old reg1
.endm
|
|
.text
|
|
.p2align 2
|
|
.global ixheaacd_imdct_using_fft_armv8
|
|
// ixheaacd_imdct_using_fft_armv8 -- entry point and transform-length dispatch.
//
// Registers as used by the code in this file:
//   x0  base address; only used here as x0 + constant offset
//   w1  transform length, matched against 0x400/0x200/0x100/0x80/0x40
//   x2  base of the data that the first stage loads from
//   x3  pointer the first stage stores to
//
// On leaving this block:
//   x4  address of a byte table (the first stage reads it with LDRB)
//   x8  number of passes the OUTER_LOOP_R4 loop will run
// and control is in either RADIX_4_FIRST_START or RADIX_8_FIRST_START.
//
// The length compares use w1, not x1. Every later use of the length
// reads w1 (LSR w9, w1 / LSL w1, w1), and AAPCS64 leaves the upper 32 bits of
// a register carrying a 32-bit argument unspecified, so a 64-bit compare
// could miss the intended case.
// NOTE(review): assumes the C prototype passes the length as a 32-bit
// integer -- confirm against the caller's declaration.
ixheaacd_imdct_using_fft_armv8:
    push_v_regs

    // Four candidate byte-table addresses at fixed offsets from x0.
    // x29 is only a scratch register here (saved by push_v_regs).
    mov     x29, #11600
    add     x4, x0, x29
    mov     x29, #11856
    add     x5, x0, x29
    mov     x29, #11920
    add     x6, x0, x29
    mov     x29, #11936
    add     x7, x0, x29

COND_1:
    cmp     w1, #0x400
    bne     COND_2
    mov     x8, #4                      // x4 keeps the +11600 table
    b       RADIX_4_FIRST_START

COND_2:
    cmp     w1, #0x200
    bne     COND_3
    mov     x8, #3
    mov     x4, x5                      // +11856 table
    b       RADIX_8_FIRST_START

COND_3:
    cmp     w1, #0x100
    bne     COND_4
    mov     x8, #3
    mov     x4, x5                      // +11856 table
    b       RADIX_4_FIRST_START

COND_4:
    cmp     w1, #0x80
    bne     COND_5
    mov     x8, #2
    mov     x4, x6                      // +11920 table
    b       RADIX_8_FIRST_START

COND_5:
    cmp     w1, #0x40
    bne     COND_6
    mov     x8, #2
    mov     x4, x6                      // +11920 table
    b       RADIX_4_FIRST_START

COND_6:
    // Any other length: one later pass, +11936 table, then fall through
    // into the radix-8 first stage that follows this block.
    mov     x8, #1
    mov     x4, x7
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// Radix-8 first stage.
//   w9 = w1 >> 5   loop count
//   w1 = w1 << 1   register stride used by the post-indexed loads below
// Each iteration reads four bytes from [x4]; every byte is scaled by 8 and
// added to x2 to give a start address, and lanes 0..3 of the vector
// registers are filled from those four addresses (x5, x6, x7, x11).
// Eight ST2 stores (32 bytes each) are written through x3 per iteration.
// The add/sub network and the loads are interleaved; order is significant.
// ---------------------------------------------------------------------------
RADIX_8_FIRST_START:
    lsr     w9, w1, #5
    lsl     w1, w1, #1

RADIX_8_FIRST_LOOP:
    mov     x5, x2
    mov     x6, x2
    mov     x7, x2
    mov     x11, x2

    // ---- lane 0: four (re, im) pairs into v0/v1, v4/v5, v2/v3, v6/v7
    ldrb    w12, [x4]
    add     x5, x5, x12, lsl #3
    ld2     {v0.s, v1.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v4.s, v5.s}[0], [x5], x1
    sub     x5, x5, x1, lsl #1
    ld2     {v2.s, v3.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v6.s, v7.s}[0], [x5], x1
    sub     x5, x5, x1, lsl #2          // back to the lane's start address

    // ---- lane 1
    ldrb    w12, [x4, #1]
    add     x6, x6, x12, lsl #3
    ld2     {v0.s, v1.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v4.s, v5.s}[1], [x6], x1
    sub     x6, x6, x1, lsl #1
    ld2     {v2.s, v3.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v6.s, v7.s}[1], [x6], x1
    sub     x6, x6, x1, lsl #2

    // ---- lane 2 (first two pairs; the rest are loaded further down)
    ldrb    w12, [x4, #2]
    add     x7, x7, x12, lsl #3
    ld2     {v0.s, v1.s}[2], [x7], x1
    add     x7, x7, x1
    ld2     {v4.s, v5.s}[2], [x7], x1
    sub     x7, x7, x1, lsl #1

    // ---- lane 3 (first two pairs)
    ldrb    w12, [x4, #3]
    add     x11, x11, x12, lsl #3
    ld2     {v0.s, v1.s}[3], [x11], x1
    add     x11, x11, x1
    ld2     {v4.s, v5.s}[3], [x11], x1
    sub     x11, x11, x1, lsl #1

    // ---- sums/differences of the first half, overlapped with the
    //      remaining lane 2 / lane 3 loads
    add     v8.4s, v0.4s, v4.4s
    ld2     {v2.s, v3.s}[2], [x7], x1
    add     x7, x7, x1

    sub     v9.4s, v0.4s, v4.4s
    ld2     {v6.s, v7.s}[2], [x7], x1
    sub     x7, x7, x1, lsl #2

    add     v0.4s, v1.4s, v5.4s
    ld2     {v2.s, v3.s}[3], [x11], x1
    add     x11, x11, x1

    sub     v4.4s, v1.4s, v5.4s
    ld2     {v6.s, v7.s}[3], [x11], x1
    sub     x11, x11, x1, lsl #2

    add     x4, x4, #4                  // four table bytes consumed

    // Move every lane pointer forward by half a stride for the second
    // group of four inputs.
    add     x5, x5, x1, lsr #1
    add     x6, x6, x1, lsr #1
    add     x7, x7, x1, lsr #1
    add     x11, x11, x1, lsr #1

    // ---- second group of loads (v14/v15, v10/v11, v12/v13, then v1/v2),
    //      overlapped with the rest of the first-half arithmetic
    add     v1.4s, v2.4s, v6.4s
    ld2     {v14.s, v15.s}[0], [x5], x1

    sub     v5.4s, v2.4s, v6.4s
    ld2     {v10.s, v11.s}[0], [x5], x1

    add     v2.4s, v3.4s, v7.4s
    ld2     {v12.s, v13.s}[0], [x5], x1

    sub     v6.4s, v3.4s, v7.4s
    ld2     {v14.s, v15.s}[1], [x6], x1

    add     v3.4s, v9.4s, v6.4s
    ld2     {v10.s, v11.s}[1], [x6], x1

    sub     v7.4s, v9.4s, v6.4s
    ld2     {v12.s, v13.s}[1], [x6], x1

    sub     v6.4s, v4.4s, v5.4s
    ld2     {v14.s, v15.s}[2], [x7], x1

    add     v9.4s, v4.4s, v5.4s
    ld2     {v10.s, v11.s}[2], [x7], x1

    add     v4.4s, v8.4s, v1.4s
    ld2     {v12.s, v13.s}[2], [x7], x1

    sub     v5.4s, v8.4s, v1.4s
    ld2     {v14.s, v15.s}[3], [x11], x1

    add     v8.4s, v0.4s, v2.4s
    ld2     {v10.s, v11.s}[3], [x11], x1

    sub     v0.4s, v0.4s, v2.4s
    ld2     {v12.s, v13.s}[3], [x11], x1

    ld2     {v1.s, v2.s}[0], [x5], x1

    add     v17.4s, v14.4s, v12.4s

    ld2     {v1.s, v2.s}[1], [x6], x1

    sub     v16.4s, v14.4s, v12.4s

    ld2     {v1.s, v2.s}[2], [x7], x1

    add     v14.4s, v15.4s, v13.4s

    ld2     {v1.s, v2.s}[3], [x11], x1

    sub     v12.4s, v15.4s, v13.4s

    // ---- add/sub network on the second group
    add     v15.4s, v10.4s, v1.4s
    sub     v13.4s, v10.4s, v1.4s
    add     v10.4s, v11.4s, v2.4s
    sub     v1.4s, v11.4s, v2.4s

    add     v11.4s, v17.4s, v15.4s
    sub     v2.4s, v17.4s, v15.4s
    add     v17.4s, v14.4s, v10.4s
    sub     v15.4s, v14.4s, v10.4s

    add     v14.4s, v16.4s, v12.4s
    sub     v10.4s, v16.4s, v12.4s
    add     v16.4s, v13.4s, v1.4s
    sub     v12.4s, v13.4s, v1.4s

    add     v1.4s, v14.4s, v12.4s
    sub     v13.4s, v14.4s, v12.4s
    sub     v12.4s, v16.4s, v10.4s

    // UZP1/UZP2 with identical sources split each 32-bit lane into its
    // low (even) and high (odd) halfwords for the 32x16 multiplies below.
    uzp1    v22.8h, v1.8h, v1.8h
    uzp2    v23.8h, v1.8h, v1.8h
    add     v14.4s, v16.4s, v10.4s

    uzp1    v26.8h, v13.8h, v13.8h
    uzp2    v27.8h, v13.8h, v13.8h
    add     v16.4s, v4.4s, v11.4s

    uzp1    v24.8h, v12.8h, v12.8h
    uzp2    v25.8h, v12.8h, v12.8h
    sub     v10.4s, v4.4s, v11.4s

    uzp1    v28.8h, v14.8h, v14.8h
    uzp2    v29.8h, v14.8h, v14.8h
    add     v4.4s, v8.4s, v17.4s

    // 0x5a82 = 23170, i.e. round(2^15 / sqrt(2)); presumably the Q15
    // rotation factor of the radix-8 butterfly -- confirm.
    mov     w14, #0x5a82

    sub     v11.4s, v8.4s, v17.4s

    add     v8.4s, v5.4s, v15.4s
    sub     v17.4s, v5.4s, v15.4s
    sub     v5.4s, v0.4s, v2.4s
    add     v15.4s, v0.4s, v2.4s

    dup     v31.4h, w14

    // 32x16 multiply done in halves: (lo * c) >> 15, then SQDMLAL adds
    // 2 * hi * c, giving the 32-bit value times c, scaled down by 2^15.
    umull   v19.4s, v26.4h, v31.4h
    umull   v18.4s, v28.4h, v31.4h
    sshr    v19.4s, v19.4s, #15
    sshr    v18.4s, v18.4s, #15

    sqdmlal v19.4s, v27.4h, v31.4h
    sqdmlal v18.4s, v29.4h, v31.4h

    umull   v13.4s, v24.4h, v31.4h
    umull   v14.4s, v22.4h, v31.4h

    add     v20.4s, v3.4s, v19.4s
    sub     v21.4s, v3.4s, v19.4s
    add     v30.4s, v6.4s, v18.4s
    sub     v6.4s, v6.4s, v18.4s

    sshr    v13.4s, v13.4s, #15
    sshr    v14.4s, v14.4s, #15

    sqdmlal v13.4s, v25.4h, v31.4h
    sqdmlal v14.4s, v23.4h, v31.4h

    add     v3.4s, v7.4s, v13.4s
    sub     v19.4s, v7.4s, v13.4s
    add     v1.4s, v9.4s, v14.4s
    sub     v18.4s, v9.4s, v14.4s

    // Exchange v17 <-> v8 and v4 <-> v16, one 64-bit half at a time.
    swp     v17.d[0], v8.d[0]
    swp     v17.d[1], v8.d[1]
    swp     v4.d[0], v16.d[0]
    swp     v4.d[1], v16.d[1]

    // ---- transpose (TRN1/TRN2 + 64-bit swaps) and scale results by 8
    trn1    v12.4s, v4.4s, v20.4s
    trn2    v22.4s, v4.4s, v20.4s

    shl     v12.4s, v12.4s, #3
    trn1    v9.4s, v17.4s, v3.4s
    trn2    v2.4s, v17.4s, v3.4s
    shl     v22.4s, v22.4s, #3

    shl     v9.4s, v9.4s, #3
    trn1    v24.4s, v10.4s, v21.4s
    trn2    v7.4s, v10.4s, v21.4s
    shl     v2.4s, v2.4s, #3

    shl     v24.4s, v24.4s, #3
    trn1    v13.4s, v16.4s, v6.4s
    trn2    v23.4s, v16.4s, v6.4s
    shl     v7.4s, v7.4s, #3

    shl     v13.4s, v13.4s, #3
    trn1    v10.4s, v5.4s, v18.4s
    trn2    v3.4s, v5.4s, v18.4s
    shl     v23.4s, v23.4s, #3

    shl     v10.4s, v10.4s, #3
    trn1    v26.4s, v8.4s, v19.4s
    trn2    v4.4s, v8.4s, v19.4s
    shl     v3.4s, v3.4s, #3

    shl     v26.4s, v26.4s, #3
    trn1    v25.4s, v11.4s, v30.4s
    trn2    v8.4s, v11.4s, v30.4s
    shl     v4.4s, v4.4s, #3

    shl     v25.4s, v25.4s, #3
    trn1    v27.4s, v15.4s, v1.4s
    trn2    v5.4s, v15.4s, v1.4s
    shl     v8.4s, v8.4s, #3

    shl     v27.4s, v27.4s, #3
    swp     v9.d[0], v12.d[1]
    shl     v5.4s, v5.4s, #3
    swp     v2.d[0], v22.d[1]

    swp     v24.d[1], v26.d[0]
    swp     v7.d[1], v4.d[0]
    swp     v10.d[0], v13.d[1]
    swp     v3.d[0], v23.d[1]
    swp     v27.d[0], v25.d[1]
    swp     v5.d[0], v8.d[1]

    // ---- write 8 x 32 bytes through x3
    mov     x15, #32
    st2     {v12.4s, v13.4s}, [x3], x15
    st2     {v24.4s, v25.4s}, [x3], x15
    st2     {v22.4s, v23.4s}, [x3], x15
    st2     {v7.4s, v8.4s}, [x3], x15
    st2     {v9.4s, v10.4s}, [x3], x15
    st2     {v26.4s, v27.4s}, [x3], x15
    st2     {v2.4s, v3.4s}, [x3], x15
    st2     {v4.4s, v5.4s}, [x3], x15

    subs    x9, x9, #1
    bne     RADIX_8_FIRST_LOOP

    // ---- hand-over to the later passes
    lsr     x1, x1, #1                  // undo the doubling done at stage entry
    lsl     x15, x1, #3
    sub     x3, x3, x15                 // step x3 back by x1 * 8 bytes

    mov     x5, #8
    mov     x4, #32
    lsr     x15, x1, #5
    mov     x6, x15                     // x6 = x1 >> 5
    b       RADIX_4_FIRST_ENDS
RADIX_8_FIRST_ENDS:
|
|
|
|
// ---------------------------------------------------------------------------
// Radix-4 first stage.
//   w9 = w1 >> 4   loop count
//   w1 = w1 << 1   register stride used by the post-indexed loads below
// Each iteration reads four bytes from [x4]; every byte is scaled by 8 and
// added to x2 to give a start address, and lanes 0..3 of v0/v1, v8/v9,
// v4/v5 and v12/v13 are filled from those addresses (x5, x6, x7, x11).
// Four ST2 stores (32 bytes each) are written through x3 per iteration.
// ---------------------------------------------------------------------------
RADIX_4_FIRST_START:
    lsr     w9, w1, #4
    lsl     w1, w1, #1

RADIX_4_LOOP:
    mov     x5, x2
    mov     x6, x2
    mov     x7, x2
    mov     x11, x2

    // ---- lane 0
    ldrb    w12, [x4, #0]
    add     x5, x5, x12, lsl #3

    ld2     {v0.s, v1.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v8.s, v9.s}[0], [x5], x1
    sub     x5, x5, x1, lsl #1
    ld2     {v4.s, v5.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v12.s, v13.s}[0], [x5], x1

    // ---- lane 1
    ldrb    w12, [x4, #1]
    add     x6, x6, x12, lsl #3
    ld2     {v0.s, v1.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v8.s, v9.s}[1], [x6], x1
    sub     x6, x6, x1, lsl #1
    ld2     {v4.s, v5.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v12.s, v13.s}[1], [x6], x1

    // ---- lane 2 (first two pairs)
    ldrb    w12, [x4, #2]
    add     x7, x7, x12, lsl #3

    ld2     {v0.s, v1.s}[2], [x7], x1
    add     x7, x7, x1
    ld2     {v8.s, v9.s}[2], [x7], x1

    // ---- lane 3 (first two pairs)
    ldrb    w12, [x4, #3]
    add     x11, x11, x12, lsl #3

    ld2     {v0.s, v1.s}[3], [x11], x1
    add     x11, x11, x1
    ld2     {v8.s, v9.s}[3], [x11], x1

    // ---- remaining lane 2 / lane 3 loads, overlapped with the first
    //      sums and differences
    sub     x7, x7, x1, lsl #1
    add     v16.4s, v0.4s, v8.4s
    ld2     {v4.s, v5.s}[2], [x7], x1
    add     x7, x7, x1
    add     v18.4s, v1.4s, v9.4s
    ld2     {v12.s, v13.s}[2], [x7], x1

    sub     x11, x11, x1, lsl #1
    sub     v20.4s, v0.4s, v8.4s
    ld2     {v4.s, v5.s}[3], [x11], x1
    add     x11, x11, x1
    sub     v22.4s, v1.4s, v9.4s
    ld2     {v12.s, v13.s}[3], [x11], x1

    add     x4, x4, #4                  // four table bytes consumed

    // ---- add/sub network
    add     v24.4s, v4.4s, v12.4s
    add     v26.4s, v5.4s, v13.4s
    sub     v28.4s, v4.4s, v12.4s
    sub     v30.4s, v5.4s, v13.4s

    add     v17.4s, v16.4s, v24.4s
    add     v11.4s, v18.4s, v26.4s
    sub     v19.4s, v16.4s, v24.4s
    sub     v15.4s, v18.4s, v26.4s

    add     v8.4s, v20.4s, v30.4s
    sub     v9.4s, v22.4s, v28.4s
    add     v13.4s, v22.4s, v28.4s
    sub     v12.4s, v20.4s, v30.4s

    // ---- transpose (TRN1/TRN2 + 64-bit swaps) and scale results by 4
    trn1    v0.4s, v17.4s, v8.4s
    trn2    v8.4s, v17.4s, v8.4s

    shl     v0.4s, v0.4s, #2
    trn1    v4.4s, v19.4s, v12.4s
    trn2    v12.4s, v19.4s, v12.4s
    shl     v8.4s, v8.4s, #2

    shl     v4.4s, v4.4s, #2
    trn1    v1.4s, v11.4s, v9.4s
    trn2    v9.4s, v11.4s, v9.4s
    shl     v12.4s, v12.4s, #2

    shl     v1.4s, v1.4s, #2
    trn1    v5.4s, v15.4s, v13.4s
    trn2    v13.4s, v15.4s, v13.4s
    shl     v9.4s, v9.4s, #2

    shl     v5.4s, v5.4s, #2
    swp     v4.d[0], v0.d[1]
    shl     v13.4s, v13.4s, #2

    swp     v12.d[0], v8.d[1]
    swp     v5.d[0], v1.d[1]
    swp     v13.d[0], v9.d[1]

    // ---- write 4 x 32 bytes through x3
    mov     x15, #32
    st2     {v0.4s, v1.4s}, [x3], x15
    st2     {v8.4s, v9.4s}, [x3], x15
    st2     {v4.4s, v5.4s}, [x3], x15
    st2     {v12.4s, v13.4s}, [x3], x15

    subs    w9, w9, #1
    bne     RADIX_4_LOOP

    // ---- hand-over to the later passes
    lsr     x1, x1, #1                  // undo the doubling done at stage entry
    sub     x3, x3, x1, lsl #3          // step x3 back by x1 * 8 bytes
    mov     x5, #4
    mov     x4, #64
    lsr     x6, x1, #4
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// Remaining radix-4 passes, run in place on the buffer written by the
// first stage, followed by the function epilogue.
//
// Live on entry:
//   x3  start of the working buffer (copied to x30; w3 is scratch later)
//   x0  base address; advanced by 8528 to the table of 16-bit pairs that
//       MIDDLE_LOOP_R4 loads into v20..v25
//   x1  stride base restored by the first stage
//   x4, x5, x6  step/count values set by the first stage; rescaled by 4
//       after every outer pass
//   x8  number of outer passes
// Loop counters: x8 (outer), x7 (middle, starts at x5), x10 (inner, starts
// at x6). x12 = x5 << 5 is the byte stride between the four rows handled
// by one inner iteration.
// ---------------------------------------------------------------------------
RADIX_4_FIRST_ENDS:
    mov     x30, x3                     // x30 is only scratch here; saved by push_v_regs
    lsr     x5, x5, #2

    mov     x14, #8528
    add     x0, x0, x14

OUTER_LOOP_R4:
    mov     x14, x30                    // restart at the beginning of the buffer

    mov     x7, x5
    mov     x2, #0
    mov     x9, x0
    lsl     x12, x5, #5

MIDDLE_LOOP_R4:
    // Gather three 16-bit pairs per lane into v20/v21, v22/v23, v24/v25.
    // The table offset grows by x4 << 2 from one lane to the next.
    ld2     {v20.h, v21.h}[0], [x9], x2
    ld2     {v22.h, v23.h}[0], [x9], x2
    add     x11, x2, x4, lsl #2
    ld2     {v24.h, v25.h}[0], [x9]
    add     x10, x0, x11

    ld2     {v20.h, v21.h}[1], [x10], x11
    ld2     {v22.h, v23.h}[1], [x10], x11
    add     x2, x11, x4, lsl #2
    ld2     {v24.h, v25.h}[1], [x10]
    add     x9, x0, x2

    ld2     {v20.h, v21.h}[2], [x9], x2
    ld2     {v22.h, v23.h}[2], [x9], x2
    add     x11, x2, x4, lsl #2
    ld2     {v24.h, v25.h}[2], [x9]
    add     x10, x0, x11

    ld2     {v20.h, v21.h}[3], [x10], x11
    ld2     {v22.h, v23.h}[3], [x10], x11
    add     x2, x11, x4, lsl #2
    ld2     {v24.h, v25.h}[3], [x10]
    add     x9, x0, x2

    mov     x10, x6                     // inner-loop count

INNER_LOOP_R4:
    // Row 0 is loaded as 32-bit pairs and halved. Rows 1..3 are loaded
    // with LD4 so each 32-bit word arrives split into low/high halfwords
    // (v16/v17 + v18/v19, v26/v27 + v28/v29, v0/v1 + v2/v3).
    ld2     {v30.4s, v31.4s}, [x14], x12
    sshr    v30.4s, v30.4s, #1
    ld4     {v16.4h, v17.4h, v18.4h, v19.4h}, [x14], x12
    sshr    v31.4s, v31.4s, #1

    ushr    v16.4h, v16.4h, #1
    ld4     {v26.4h, v27.4h, v28.4h, v29.4h}, [x14], x12
    ushr    v18.4h, v18.4h, #1

    // Low-half products; the high-half products are accumulated after
    // the >> 15 further down.
    smull   v11.4s, v16.4h, v20.4h
    smlsl   v11.4s, v18.4h, v21.4h

    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x14], x12
    smull   v12.4s, v16.4h, v21.4h
    smlal   v12.4s, v18.4h, v20.4h

    ushr    v26.4h, v26.4h, #1
    ushr    v28.4h, v28.4h, #1

    lsl     x29, x12, #2                // x29 = 4 rows in bytes (used on the lane-0 path)
    sub     x14, x14, x12, lsl #2       // back to row 0

    ushr    v0.4h, v0.4h, #1
    ushr    v2.4h, v2.4h, #1

    smull   v13.4s, v26.4h, v22.4h
    smlsl   v13.4s, v28.4h, v23.4h

    sshr    v11.4s, v11.4s, #15

    smull   v14.4s, v26.4h, v23.4h
    smlal   v14.4s, v28.4h, v22.4h

    smull   v15.4s, v0.4h, v24.4h
    smlsl   v15.4s, v2.4h, v25.4h

    smlal   v11.4s, v17.4h, v20.4h
    smlsl   v11.4s, v19.4h, v21.4h

    sshr    v12.4s, v12.4s, #15
    sshr    v13.4s, v13.4s, #15
    sshr    v14.4s, v14.4s, #15
    sshr    v15.4s, v15.4s, #15

    smlal   v12.4s, v17.4h, v21.4h
    smlal   v12.4s, v19.4h, v20.4h

    smull   v5.4s, v0.4h, v25.4h
    smlal   v5.4s, v2.4h, v24.4h

    smlal   v13.4s, v27.4h, v22.4h
    smlsl   v13.4s, v29.4h, v23.4h

    smlal   v14.4s, v27.4h, v23.4h
    smlal   v14.4s, v29.4h, v22.4h

    smlal   v15.4s, v1.4h, v24.4h
    smlsl   v15.4s, v3.4h, v25.4h

    sshr    v5.4s, v5.4s, #15

    smlal   v5.4s, v1.4h, v25.4h
    smlal   v5.4s, v3.4h, v24.4h

    // Only while x7 == x5 (first middle-loop iteration): lane 0 of the
    // six products is replaced by the corresponding memory word >> 1,
    // i.e. lane 0 bypasses the multiply for that iteration.
    subs    x17, x7, x5
    bne     BYPASS_IF

    add     x14, x14, x12               // row 1, first word

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1

    mov     v11.s[0], w3

    ldr     w3, [x14]                   // row 2, first word
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v13.s[0], w3

    ldr     w3, [x14]                   // row 3, first word
    asr     w3, w3, #1
    mov     v15.s[0], w3

    sub     x14, x14, x12, lsl #1       // back to row 1 ...
    add     x14, x14, #4                // ... second word

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v12.s[0], w3

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v14.s[0], w3

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v5.s[0], w3

    sub     x14, x14, #4

    sub     x14, x14, x29               // x14 is at row 0 again

BYPASS_IF:
    // ---- add/sub network over the four rows
    add     v6.4s, v30.4s, v13.4s
    add     v7.4s, v31.4s, v14.4s
    sub     v30.4s, v30.4s, v13.4s
    sub     v31.4s, v31.4s, v14.4s
    add     v8.4s, v11.4s, v15.4s
    add     v9.4s, v12.4s, v5.4s

    sub     v15.4s, v11.4s, v15.4s
    sub     v14.4s, v12.4s, v5.4s

    add     v10.4s, v6.4s, v8.4s
    add     v11.4s, v7.4s, v9.4s
    add     v12.4s, v30.4s, v14.4s
    sub     v13.4s, v31.4s, v15.4s

    // ---- write the four rows back in place
    sub     v6.4s, v6.4s, v8.4s
    st2     {v10.4s, v11.4s}, [x14], x12
    sub     v7.4s, v7.4s, v9.4s

    sub     v8.4s, v30.4s, v14.4s
    st2     {v12.4s, v13.4s}, [x14], x12
    add     v9.4s, v31.4s, v15.4s

    st2     {v6.4s, v7.4s}, [x14], x12
    st2     {v8.4s, v9.4s}, [x14], x12
    subs    x10, x10, #1
    bne     INNER_LOOP_R4

    sub     x14, x14, x1, lsl #3        // step back by x1 * 8 bytes ...
    add     x14, x14, #32               // ... then forward one 32-byte group

    subs    x7, x7, #1
    bne     MIDDLE_LOOP_R4

    // ---- rescale the step/count values for the next outer pass
    lsr     x4, x4, #2
    lsl     x5, x5, #2
    lsr     x6, x6, #2
    subs    x8, x8, #1
    bne     OUTER_LOOP_R4

END_LOOPS:
    pop_v_regs
    ret
|
|
|
|
|
|
|