274 lines
10 KiB
ArmAsm
274 lines
10 KiB
ArmAsm
|
@/*****************************************************************************
|
||
|
@*
|
||
|
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
|
||
|
@*
|
||
|
@* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
@* you may not use this file except in compliance with the License.
|
||
|
@* You may obtain a copy of the License at:
|
||
|
@*
|
||
|
@* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
@*
|
||
|
@* Unless required by applicable law or agreed to in writing, software
|
||
|
@* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
@* See the License for the specific language governing permissions and
|
||
|
@* limitations under the License.
|
||
|
@*
|
||
|
@*****************************************************************************/
|
||
|
@/**
|
||
|
@*******************************************************************************
|
||
|
@* @file
|
||
|
@* ihevc_inter_pred_chroma_copy_neon.s
|
||
|
@*
|
||
|
@* @brief
|
||
|
@* contains function definitions for inter prediction interpolation.
|
||
|
@* functions are coded using neon intrinsics and can be compiled using
|
||
|
|
||
|
@* rvct
|
||
|
@*
|
||
|
@* @author
|
||
|
@* yogeswaran rs
|
||
|
@*
|
||
|
@* @par list of functions:
|
||
|
@*
|
||
|
@*
|
||
|
@* @remarks
|
||
|
@* none
|
||
|
@*
|
||
|
@*******************************************************************************
|
||
|
@*/
|
||
|
@/**
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@* @brief
|
||
|
@* chroma interprediction filter for copy
|
||
|
@*
|
||
|
@* @par description:
|
||
|
@* copies the array of width 'wd' and height 'ht' from the location pointed
|
||
|
@* by 'src' to the location pointed by 'dst'
|
||
|
@*
|
||
|
@* @param[in] pu1_src
|
||
|
@* uword8 pointer to the source
|
||
|
@*
|
||
|
@* @param[out] pu1_dst
|
||
|
@* uword8 pointer to the destination
|
||
|
@*
|
||
|
@* @param[in] src_strd
|
||
|
@* integer source stride
|
||
|
@*
|
||
|
@* @param[in] dst_strd
|
||
|
@* integer destination stride
|
||
|
@*
|
||
|
@* @param[in] pi1_coeff
|
||
|
@* word8 pointer to the filter coefficients
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @returns
|
||
|
@*
|
||
|
@* @remarks
|
||
|
@* none
|
||
|
@*
|
||
|
@*******************************************************************************
|
||
|
@*/
|
||
|
|
||
|
@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
|
||
|
@ uword8 *pu1_dst,
|
||
|
@ word32 src_strd,
|
||
|
@ word32 dst_strd,
|
||
|
@ word8 *pi1_coeff,
|
||
|
@ word32 ht,
|
||
|
@ word32 wd)
|
||
|
@**************variables vs registers*****************************************
|
||
|
@ r0 => *pu1_src
|
||
|
@ r1 => *pu1_dst
|
||
|
@ r2 => src_strd
|
||
|
@ r3 => dst_strd
|
||
|
@ r4 => *pi1_coeff
|
||
|
@ r5 => ht
|
||
|
@ r6 => wd
|
||
|
|
||
|
.equ ht_offset, 44
|
||
|
.equ wd_offset, 48
|
||
|
|
||
|
.text
|
||
|
.align 4
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
.globl ihevc_inter_pred_chroma_copy_a9q
|
||
|
|
||
|
.type ihevc_inter_pred_chroma_copy_a9q, %function
|
||
|
|
||
|
ihevc_inter_pred_chroma_copy_a9q:
|
||
|
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
|
||
|
ldr r12,[sp,#wd_offset] @loads wd
|
||
|
lsl r12,r12,#1
|
||
|
ldr r7,[sp,#ht_offset] @loads ht
|
||
|
cmp r7,#0 @checks ht == 0
|
||
|
ble end_loops
|
||
|
and r8,r7,#3 @check ht for mul of 2
|
||
|
sub r7,r7,r8 @check the rounded height value
|
||
|
tst r12,#15 @checks wd for multiples for 4 & 8
|
||
|
beq core_loop_wd_16
|
||
|
tst r12,#7 @checks wd for multiples for 4 & 8
|
||
|
beq core_loop_wd_8
|
||
|
|
||
|
sub r11,r12,#4
|
||
|
cmp r7,#0
|
||
|
beq outer_loop_wd_4_ht_2
|
||
|
|
||
|
outer_loop_wd_4:
|
||
|
subs r4,r12,#0 @checks wd == 0
|
||
|
ble end_inner_loop_wd_4
|
||
|
|
||
|
inner_loop_wd_4:
|
||
|
vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
|
||
|
add r5,r0,r2 @pu1_src_tmp += src_strd
|
||
|
add r6,r1,r3 @pu1_dst_tmp += dst_strd
|
||
|
vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
|
||
|
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
|
||
|
add r0,r0,#4 @pu1_src += 4
|
||
|
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
|
||
|
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
|
||
|
subs r4,r4,#4 @(wd -4)
|
||
|
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
|
||
|
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
|
||
|
add r1,r1,#4 @pu1_dst += 4
|
||
|
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
|
||
|
bgt inner_loop_wd_4
|
||
|
|
||
|
end_inner_loop_wd_4:
|
||
|
subs r7,r7,#4 @ht - 4
|
||
|
sub r0,r5,r11 @pu1_src = pu1_src_tmp
|
||
|
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
|
||
|
bgt outer_loop_wd_4
|
||
|
cmp r8,#0
|
||
|
bgt outer_loop_wd_4_ht_2
|
||
|
|
||
|
end_loops:
|
||
|
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
|
||
|
|
||
|
|
||
|
outer_loop_wd_4_ht_2:
|
||
|
subs r4,r12,#0 @checks wd == 0
|
||
|
ble end_loops
|
||
|
|
||
|
inner_loop_wd_4_ht_2:
|
||
|
vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
|
||
|
add r5,r0,r2 @pu1_src_tmp += src_strd
|
||
|
add r6,r1,r3 @pu1_dst_tmp += dst_strd
|
||
|
vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
|
||
|
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
|
||
|
add r0,r0,#4 @pu1_src += 4
|
||
|
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
|
||
|
subs r4,r4,#4 @(wd -4)
|
||
|
add r1,r1,#4 @pu1_dst += 4
|
||
|
bgt inner_loop_wd_4_ht_2
|
||
|
b end_loops
|
||
|
|
||
|
core_loop_wd_8:
|
||
|
sub r11,r12,#8
|
||
|
cmp r7,#0
|
||
|
beq outer_loop_wd_8_ht_2
|
||
|
|
||
|
outer_loop_wd_8:
|
||
|
subs r4,r12,#0 @checks wd
|
||
|
ble end_inner_loop_wd_8
|
||
|
|
||
|
inner_loop_wd_8:
|
||
|
add r5,r0,r2 @pu1_src_tmp += src_strd
|
||
|
vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
|
||
|
add r6,r1,r3 @pu1_dst_tmp += dst_strd
|
||
|
vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
subs r4,r4,#8 @wd - 8(loop condition)
|
||
|
vld1.8 {d2},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {d2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
vld1.8 {d3},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {d3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
bgt inner_loop_wd_8
|
||
|
|
||
|
end_inner_loop_wd_8:
|
||
|
subs r7,r7,#4 @ht -= 4
|
||
|
sub r0,r5,r11 @pu1_src = pu1_src_tmp
|
||
|
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
|
||
|
bgt outer_loop_wd_8
|
||
|
cmp r8,#0
|
||
|
bgt outer_loop_wd_8_ht_2
|
||
|
b end_loops
|
||
|
|
||
|
outer_loop_wd_8_ht_2:
|
||
|
subs r4,r12,#0 @checks wd
|
||
|
ble end_loops
|
||
|
|
||
|
inner_loop_wd_8_ht_2:
|
||
|
add r5,r0,r2 @pu1_src_tmp += src_strd
|
||
|
vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
|
||
|
add r6,r1,r3 @pu1_dst_tmp += dst_strd
|
||
|
vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
@subs r4,r4,#8 @wd - 8(loop condition)
|
||
|
@bgt inner_loop_wd_8_ht_2
|
||
|
b end_loops
|
||
|
|
||
|
core_loop_wd_16:
|
||
|
sub r11,r12,#16
|
||
|
cmp r7,#0
|
||
|
beq outer_loop_wd_16_ht_2
|
||
|
|
||
|
outer_loop_wd_16:
|
||
|
subs r4,r12,#0 @checks wd
|
||
|
ble end_inner_loop_wd_16
|
||
|
|
||
|
inner_loop_wd_16:
|
||
|
add r5,r0,r2 @pu1_src_tmp += src_strd
|
||
|
vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
|
||
|
add r6,r1,r3 @pu1_dst_tmp += dst_strd
|
||
|
vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
subs r4,r4,#16 @wd - 16(loop condition)
|
||
|
vld1.8 {q2},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {q2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
vld1.8 {q3},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {q3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
bgt inner_loop_wd_16
|
||
|
|
||
|
end_inner_loop_wd_16:
|
||
|
subs r7,r7,#4 @ht -= 4
|
||
|
sub r0,r5,r11 @pu1_src = pu1_src_tmp
|
||
|
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
|
||
|
bgt outer_loop_wd_16
|
||
|
cmp r8,#0
|
||
|
bgt outer_loop_wd_16_ht_2
|
||
|
b end_loops
|
||
|
|
||
|
outer_loop_wd_16_ht_2:
|
||
|
subs r4,r12,#0 @checks wd
|
||
|
ble end_loops
|
||
|
|
||
|
inner_loop_wd_16_ht_2:
|
||
|
add r5,r0,r2 @pu1_src_tmp += src_strd
|
||
|
vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
|
||
|
add r6,r1,r3 @pu1_dst_tmp += dst_strd
|
||
|
vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
|
||
|
vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
|
||
|
@subs r4,r4,#16 @wd - 16(loop condition)
|
||
|
@bgt inner_loop_wd_16_ht_2
|
||
|
|
||
|
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|