aosp12/external/libhevc/common/arm/ihevc_inter_pred_chroma_copy.s

@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction  interpolation.
@* functions are coded using neon  intrinsics and can be compiled using

@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*   chroma interprediction filter for copy
@*
@* @par description:
@*    copies the array of width 'wd' and height 'ht' from the  location pointed
@*    by 'src' to the location pointed by 'dst'
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
@                                   uword8 *pu1_dst,
@                                   word32 src_strd,
@                                   word32 dst_strd,
@                                   word8 *pi1_coeff,
@                                   word32 ht,
@                                   word32 wd)
@**************variables vs registers*****************************************
@               r0 => *pu1_src
@               r1 => *pu1_dst
@               r2 =>  src_strd
@               r3 =>  dst_strd
@               r4 => *pi1_coeff
@               r5 =>  ht
@               r6 =>  wd

.equ    ht_offset,      44
.equ    wd_offset,      48

.text
.align 4


.globl ihevc_inter_pred_chroma_copy_a9q

.type ihevc_inter_pred_chroma_copy_a9q, %function

ihevc_inter_pred_chroma_copy_a9q:
    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    ldr         r12,[sp,#wd_offset]         @loads wd
    lsl         r12,r12,#1
    ldr         r7,[sp,#ht_offset]          @loads ht
    cmp         r7,#0                       @checks ht == 0
    ble         end_loops
    and         r8,r7,#3                    @check ht for mul of 2
    sub         r7,r7,r8                    @check the rounded height value
    tst         r12,#15                     @checks wd for multiples for 4 & 8
    beq         core_loop_wd_16
    tst         r12,#7                      @checks wd for multiples for 4 & 8
    beq         core_loop_wd_8

    sub         r11,r12,#4
    cmp         r7,#0
    beq         outer_loop_wd_4_ht_2

outer_loop_wd_4:
    subs        r4,r12,#0                   @checks wd == 0
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    subs        r4,r4,#4                    @(wd -4)
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r1,r1,#4                    @pu1_dst += 4
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4                    @ht - 4
    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
    bgt         outer_loop_wd_4
    cmp         r8,#0
    bgt         outer_loop_wd_4_ht_2

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp


outer_loop_wd_4_ht_2:
    subs        r4,r12,#0                   @checks wd == 0
    ble         end_loops

inner_loop_wd_4_ht_2:
    vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    subs        r4,r4,#4                    @(wd -4)
    add         r1,r1,#4                    @pu1_dst += 4
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

core_loop_wd_8:
    sub         r11,r12,#8
    cmp         r7,#0
    beq         outer_loop_wd_8_ht_2

outer_loop_wd_8:
    subs        r4,r12,#0                   @checks wd
    ble         end_inner_loop_wd_8

inner_loop_wd_8:
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    subs        r4,r4,#8                    @wd - 8(loop condition)
    vld1.8      {d2},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {d3},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    bgt         inner_loop_wd_8

end_inner_loop_wd_8:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
    bgt         outer_loop_wd_8
    cmp         r8,#0
    bgt         outer_loop_wd_8_ht_2
    b           end_loops

outer_loop_wd_8_ht_2:
    subs        r4,r12,#0                   @checks wd
    ble         end_loops

inner_loop_wd_8_ht_2:
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    @subs     r4,r4,#8                      @wd - 8(loop condition)
    @bgt      inner_loop_wd_8_ht_2
    b           end_loops

core_loop_wd_16:
    sub         r11,r12,#16
    cmp         r7,#0
    beq         outer_loop_wd_16_ht_2

outer_loop_wd_16:
    subs        r4,r12,#0                   @checks wd
    ble         end_inner_loop_wd_16

inner_loop_wd_16:
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {q0},[r0]!                  @vld1_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {q0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {q1},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {q1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    subs        r4,r4,#16                   @wd - 16(loop condition)
    vld1.8      {q2},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {q2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {q3},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {q3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    bgt         inner_loop_wd_16

end_inner_loop_wd_16:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
    bgt         outer_loop_wd_16
    cmp         r8,#0
    bgt         outer_loop_wd_16_ht_2
    b           end_loops

outer_loop_wd_16_ht_2:
    subs        r4,r12,#0                   @checks wd
    ble         end_loops

inner_loop_wd_16_ht_2:
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {q0},[r0]!                  @vld1_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {q0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {q1},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {q1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    @subs     r4,r4,#16                     @wd - 16(loop condition)
    @bgt      inner_loop_wd_16_ht_2

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
init from android-12.1.0_r8 2023-01-09 17:11:35 +08:00			`@/*****************************************************************************`
			`@*`
			`@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore`
			`@*`
			`@* Licensed under the Apache License, Version 2.0 (the "License");`
			`@* you may not use this file except in compliance with the License.`
			`@* You may obtain a copy of the License at:`
			`@*`
			`@* http://www.apache.org/licenses/LICENSE-2.0`
			`@*`
			`@* Unless required by applicable law or agreed to in writing, software`
			`@* distributed under the License is distributed on an "AS IS" BASIS,`
			`@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`@* See the License for the specific language governing permissions and`
			`@* limitations under the License.`
			`@*`
			`@*****************************************************************************/`
			`@/**`
			`@*******************************************************************************`
			`@* @file`
			`@* ihevc_inter_pred_chroma_copy_neon.s`
			`@*`
			`@* @brief`
			`@* contains function definitions for inter prediction interpolation.`
			`@* functions are coded using neon intrinsics and can be compiled using`

			`@* rvct`
			`@*`
			`@* @author`
			`@* yogeswaran rs`
			`@*`
			`@* @par list of functions:`
			`@*`
			`@*`
			`@* @remarks`
			`@* none`
			`@*`
			`@*******************************************************************************`
			`@*/`
			`@/**`
			`@*******************************************************************************`
			`@*`
			`@* @brief`
			`@* chroma interprediction filter for copy`
			`@*`
			`@* @par description:`
			`@* copies the array of width 'wd' and height 'ht' from the location pointed`
			`@* by 'src' to the location pointed by 'dst'`
			`@*`
			`@* @param[in] pu1_src`
			`@* uword8 pointer to the source`
			`@*`
			`@* @param[out] pu1_dst`
			`@* uword8 pointer to the destination`
			`@*`
			`@* @param[in] src_strd`
			`@* integer source stride`
			`@*`
			`@* @param[in] dst_strd`
			`@* integer destination stride`
			`@*`
			`@* @param[in] pi1_coeff`
			`@* word8 pointer to the filter coefficients`
			`@*`
			`@* @param[in] ht`
			`@* integer height of the array`
			`@*`
			`@* @param[in] wd`
			`@* integer width of the array`
			`@*`
			`@* @returns`
			`@*`
			`@* @remarks`
			`@* none`
			`@*`
			`@*******************************************************************************`
			`@*/`

			`@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,`
			`@ uword8 *pu1_dst,`
			`@ word32 src_strd,`
			`@ word32 dst_strd,`
			`@ word8 *pi1_coeff,`
			`@ word32 ht,`
			`@ word32 wd)`
			`@************variables vs registers***************************************`
			`@ r0 => *pu1_src`
			`@ r1 => *pu1_dst`
			`@ r2 => src_strd`
			`@ r3 => dst_strd`
			`@ r4 => *pi1_coeff`
			`@ r5 => ht`
			`@ r6 => wd`

			`.equ ht_offset, 44`
			`.equ wd_offset, 48`

			`.text`
			`.align 4`




			`.globl ihevc_inter_pred_chroma_copy_a9q`

			`.type ihevc_inter_pred_chroma_copy_a9q, %function`

			`ihevc_inter_pred_chroma_copy_a9q:`
			`stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments`
			`ldr r12,[sp,#wd_offset] @loads wd`
			`lsl r12,r12,#1`
			`ldr r7,[sp,#ht_offset] @loads ht`
			`cmp r7,#0 @checks ht == 0`
			`ble end_loops`
			`and r8,r7,#3 @check ht for mul of 2`
			`sub r7,r7,r8 @check the rounded height value`
			`tst r12,#15 @checks wd for multiples for 4 & 8`
			`beq core_loop_wd_16`
			`tst r12,#7 @checks wd for multiples for 4 & 8`
			`beq core_loop_wd_8`

			`sub r11,r12,#4`
			`cmp r7,#0`
			`beq outer_loop_wd_4_ht_2`

			`outer_loop_wd_4:`
			`subs r4,r12,#0 @checks wd == 0`
			`ble end_inner_loop_wd_4`

			`inner_loop_wd_4:`
			`vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)`
			`add r5,r0,r2 @pu1_src_tmp += src_strd`
			`add r6,r1,r3 @pu1_dst_tmp += dst_strd`
			`vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)`
			`vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)`
			`add r0,r0,#4 @pu1_src += 4`
			`vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)`
			`vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)`
			`subs r4,r4,#4 @(wd -4)`
			`vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)`
			`vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)`
			`add r1,r1,#4 @pu1_dst += 4`
			`vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)`
			`bgt inner_loop_wd_4`

			`end_inner_loop_wd_4:`
			`subs r7,r7,#4 @ht - 4`
			`sub r0,r5,r11 @pu1_src = pu1_src_tmp`
			`sub r1,r6,r11 @pu1_dst = pu1_dst_tmp`
			`bgt outer_loop_wd_4`
			`cmp r8,#0`
			`bgt outer_loop_wd_4_ht_2`

			`end_loops:`
			`ldmfd sp!,{r4-r12,r15} @reload the registers from sp`


			`outer_loop_wd_4_ht_2:`
			`subs r4,r12,#0 @checks wd == 0`
			`ble end_loops`

			`inner_loop_wd_4_ht_2:`
			`vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)`
			`add r5,r0,r2 @pu1_src_tmp += src_strd`
			`add r6,r1,r3 @pu1_dst_tmp += dst_strd`
			`vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)`
			`vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)`
			`add r0,r0,#4 @pu1_src += 4`
			`vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)`
			`subs r4,r4,#4 @(wd -4)`
			`add r1,r1,#4 @pu1_dst += 4`
			`bgt inner_loop_wd_4_ht_2`
			`b end_loops`

			`core_loop_wd_8:`
			`sub r11,r12,#8`
			`cmp r7,#0`
			`beq outer_loop_wd_8_ht_2`

			`outer_loop_wd_8:`
			`subs r4,r12,#0 @checks wd`
			`ble end_inner_loop_wd_8`

			`inner_loop_wd_8:`
			`add r5,r0,r2 @pu1_src_tmp += src_strd`
			`vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)`
			`add r6,r1,r3 @pu1_dst_tmp += dst_strd`
			`vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)`
			`vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`subs r4,r4,#8 @wd - 8(loop condition)`
			`vld1.8 {d2},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {d2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`vld1.8 {d3},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {d3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`bgt inner_loop_wd_8`

			`end_inner_loop_wd_8:`
			`subs r7,r7,#4 @ht -= 4`
			`sub r0,r5,r11 @pu1_src = pu1_src_tmp`
			`sub r1,r6,r11 @pu1_dst = pu1_dst_tmp`
			`bgt outer_loop_wd_8`
			`cmp r8,#0`
			`bgt outer_loop_wd_8_ht_2`
			`b end_loops`

			`outer_loop_wd_8_ht_2:`
			`subs r4,r12,#0 @checks wd`
			`ble end_loops`

			`inner_loop_wd_8_ht_2:`
			`add r5,r0,r2 @pu1_src_tmp += src_strd`
			`vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)`
			`add r6,r1,r3 @pu1_dst_tmp += dst_strd`
			`vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)`
			`vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`@subs r4,r4,#8 @wd - 8(loop condition)`
			`@bgt inner_loop_wd_8_ht_2`
			`b end_loops`

			`core_loop_wd_16:`
			`sub r11,r12,#16`
			`cmp r7,#0`
			`beq outer_loop_wd_16_ht_2`

			`outer_loop_wd_16:`
			`subs r4,r12,#0 @checks wd`
			`ble end_inner_loop_wd_16`

			`inner_loop_wd_16:`
			`add r5,r0,r2 @pu1_src_tmp += src_strd`
			`vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)`
			`add r6,r1,r3 @pu1_dst_tmp += dst_strd`
			`vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)`
			`vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`subs r4,r4,#16 @wd - 16(loop condition)`
			`vld1.8 {q2},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {q2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`vld1.8 {q3},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {q3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`bgt inner_loop_wd_16`

			`end_inner_loop_wd_16:`
			`subs r7,r7,#4 @ht -= 4`
			`sub r0,r5,r11 @pu1_src = pu1_src_tmp`
			`sub r1,r6,r11 @pu1_dst = pu1_dst_tmp`
			`bgt outer_loop_wd_16`
			`cmp r8,#0`
			`bgt outer_loop_wd_16_ht_2`
			`b end_loops`

			`outer_loop_wd_16_ht_2:`
			`subs r4,r12,#0 @checks wd`
			`ble end_loops`

			`inner_loop_wd_16_ht_2:`
			`add r5,r0,r2 @pu1_src_tmp += src_strd`
			`vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)`
			`add r6,r1,r3 @pu1_dst_tmp += dst_strd`
			`vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)`
			`vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)`
			`vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)`
			`@subs r4,r4,#16 @wd - 16(loop condition)`
			`@bgt inner_loop_wd_16_ht_2`

			`ldmfd sp!,{r4-r12,r15} @reload the registers from sp`