/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
|
|
/**
*******************************************************************************
* @file
*  ihevce_itrans_recon_neon.c
*
* @brief
*  Contains functions that perform the inverse transform and add the residue
*  to the prediction
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ihevce_itrans_recon_dc_neon
*
* @remarks
*  None
*
********************************************************************************
*/
|
|
|
|
/*****************************************************************************/
|
|
/* File Includes */
|
|
/*****************************************************************************/
|
|
/* System include files */
|
|
#include <string.h>
|
|
#include <arm_neon.h>
|
|
|
|
/* User include files */
|
|
#include "ihevc_typedefs.h"
|
|
#include "itt_video_api.h"
|
|
#include "ihevc_cmn_utils_neon.h"
|
|
#include "ihevce_cmn_utils_instr_set_router.h"
|
|
#include "ihevc_defs.h"
|
|
#include "ihevc_macros.h"
|
|
|
|
/*****************************************************************************/
|
|
/* Function Definitions */
|
|
/*****************************************************************************/
|
|
/**
 * @brief DC-only reconstruction of a 4x4 luma block.
 *
 * Adds dc_value to 16 prediction samples, saturates each sum to the unsigned
 * 8-bit range and writes the result as four rows of four bytes.
 *
 * @param[in]  pu1_pred   Prediction samples; read through load_unaligned_u8q()
 *                        (presumably gathers four 4-byte rows spaced pred_strd
 *                        apart into one vector - confirm against
 *                        ihevc_cmn_utils_neon.h).
 * @param[in]  pred_strd  Stride handed to load_unaligned_u8q().
 * @param[out] pu1_dst    Destination; 4 bytes are written at each of
 *                        pu1_dst + k * dst_strd for k = 0..3.
 * @param[in]  dst_strd   Byte distance between destination rows.
 * @param[in]  dc_value   Value added to every sample (broadcast as a 16-bit
 *                        lane value).
 */
static INLINE void ihevce_itrans_recon_dc_4x4_luma_neon(
    UWORD8 *pu1_pred, WORD32 pred_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 dc_value)
{
    const uint8x16_t pred = load_unaligned_u8q(pu1_pred, pred_strd);
    const int16x8_t dc = vdupq_n_s16(dc_value);

    /* Widen to 16 bits so that a negative DC can be applied before clamping. */
    const int16x8_t sum_lo = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))), dc);
    const int16x8_t sum_hi = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))), dc);

    /* Saturating narrow clamps every sum to [0, 255]; each 32-bit lane then */
    /* holds one 4-sample output row.                                        */
    const uint32x2_t rows01 = vreinterpret_u32_u8(vqmovun_s16(sum_lo));
    const uint32x2_t rows23 = vreinterpret_u32_u8(vqmovun_s16(sum_hi));

    uint32_t row;

    /* The destination is a byte buffer with an arbitrary stride, so the rows */
    /* are written with memcpy instead of a cast 32-bit store: this avoids    */
    /* misaligned / type-punned accesses while producing the same bytes.      */
    row = vget_lane_u32(rows01, 0);
    memcpy(pu1_dst, &row, sizeof(row));

    row = vget_lane_u32(rows01, 1);
    memcpy(pu1_dst + dst_strd, &row, sizeof(row));

    row = vget_lane_u32(rows23, 0);
    memcpy(pu1_dst + 2 * dst_strd, &row, sizeof(row));

    row = vget_lane_u32(rows23, 1);
    memcpy(pu1_dst + 3 * dst_strd, &row, sizeof(row));
}
|
|
|
|
static INLINE void ihevce_itrans_recon_dc_4x4_chroma_neon(
|
|
UWORD8 *pu1_pred,
|
|
WORD32 pred_strd,
|
|
UWORD8 *pu1_dst,
|
|
WORD32 dst_strd,
|
|
WORD32 trans_size,
|
|
WORD32 dc_value,
|
|
CHROMA_PLANE_ID_T e_chroma_plane)
|
|
{
|
|
WORD32 i;
|
|
int16x8_t a0, a1;
|
|
uint8x8_t a2, a3;
|
|
uint16x4_t select = vdup_n_u16(0xff << (e_chroma_plane << 3));
|
|
|
|
a0 = vdupq_n_s16(dc_value);
|
|
for(i = 0; i < trans_size; i++)
|
|
{
|
|
a1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_pred + i * pred_strd)));
|
|
a2 = vqmovun_s16(vaddq_s16(a0, a1));
|
|
a3 = vld1_u8(pu1_dst + i * dst_strd);
|
|
a3 = vbsl_u8(vreinterpret_u8_u16(select), a2, a3);
|
|
vst1_u8(pu1_dst + i * dst_strd, a3);
|
|
}
|
|
}
|
|
|
|
static INLINE void ihevce_itrans_recon_dc_8x8_luma_neon(
|
|
UWORD8 *pu1_pred,
|
|
WORD32 pred_strd,
|
|
UWORD8 *pu1_dst,
|
|
WORD32 dst_strd,
|
|
WORD32 trans_size,
|
|
WORD32 dc_value)
|
|
{
|
|
WORD32 i;
|
|
uint8x16_t a1, a4, a5;
|
|
uint8x8_t a0, a2, a3;
|
|
|
|
a0 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
|
|
: vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
|
|
a1 = vcombine_u8(a0, a0);
|
|
for(i = 0; i < trans_size; i += 2)
|
|
{
|
|
a2 = vld1_u8(pu1_pred + i * pred_strd);
|
|
a3 = vld1_u8(pu1_pred + (i + 1) * pred_strd);
|
|
a4 = vcombine_u8(a2, a3);
|
|
a5 = (dc_value >= 0) ? vqaddq_u8(a1, a4) : vqsubq_u8(a4, a1);
|
|
vst1_u8(pu1_dst + i * dst_strd, vget_low_u8(a5));
|
|
vst1_u8(pu1_dst + (i + 1) * dst_strd, vget_high_u8(a5));
|
|
}
|
|
}
|
|
|
|
static INLINE void ihevce_itrans_recon_dc_8x8_chroma_neon(
|
|
UWORD8 *pu1_pred,
|
|
WORD32 pred_strd,
|
|
UWORD8 *pu1_dst,
|
|
WORD32 dst_strd,
|
|
WORD32 trans_size,
|
|
WORD32 dc_value,
|
|
CHROMA_PLANE_ID_T e_chroma_plane)
|
|
{
|
|
WORD32 i;
|
|
uint8x16_t a4, a0, a5;
|
|
uint8x8_t a1, a2, a3;
|
|
uint8x16x2_t a6;
|
|
|
|
a1 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
|
|
: vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
|
|
a0 = vcombine_u8(a1, a1);
|
|
for(i = 0; i < trans_size; i += 2)
|
|
{
|
|
a2 = vld2_u8(pu1_pred + i * pred_strd).val[e_chroma_plane];
|
|
a3 = vld2_u8(pu1_pred + (i + 1) * pred_strd).val[e_chroma_plane];
|
|
a4 = vcombine_u8(a2, a3);
|
|
a4 = (dc_value >= 0) ? vqaddq_u8(a0, a4) : vqsubq_u8(a4, a0);
|
|
a2 = vld2_u8(pu1_dst + i * dst_strd).val[!e_chroma_plane];
|
|
a3 = vld2_u8(pu1_dst + (i + 1) * dst_strd).val[!e_chroma_plane];
|
|
a5 = vcombine_u8(a2, a3);
|
|
a6 = (e_chroma_plane == 0) ? vzipq_u8(a4, a5) : vzipq_u8(a5, a4);
|
|
vst1q_u8(pu1_dst + i * dst_strd, a6.val[0]);
|
|
vst1q_u8(pu1_dst + (i + 1) * dst_strd, a6.val[1]);
|
|
}
|
|
}
|
|
|
|
static INLINE void ihevce_itrans_recon_dc_16x16_luma_neon(
|
|
UWORD8 *pu1_pred,
|
|
WORD32 pred_strd,
|
|
UWORD8 *pu1_dst,
|
|
WORD32 dst_strd,
|
|
WORD32 trans_size,
|
|
WORD32 dc_value)
|
|
{
|
|
WORD32 i;
|
|
uint8x16_t a1, a3, a2;
|
|
uint8x8_t a0;
|
|
|
|
a0 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
|
|
: vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
|
|
a1 = vcombine_u8(a0, a0);
|
|
for(i = 0; i < trans_size; i++)
|
|
{
|
|
a2 = vld1q_u8(pu1_pred + i * pred_strd);
|
|
a3 = (dc_value >= 0) ? vqaddq_u8(a2, a1) : vqsubq_u8(a2, a1);
|
|
vst1q_u8(pu1_dst + i * dst_strd, a3);
|
|
}
|
|
}
|
|
|
|
static INLINE void ihevce_itrans_recon_dc_16x16_chroma_neon(
|
|
UWORD8 *pu1_pred,
|
|
WORD32 pred_strd,
|
|
UWORD8 *pu1_dst,
|
|
WORD32 dst_strd,
|
|
WORD32 trans_size,
|
|
WORD32 dc_value,
|
|
CHROMA_PLANE_ID_T e_chroma_plane)
|
|
{
|
|
WORD32 i;
|
|
uint8x8_t a0;
|
|
uint8x16_t a1, a2, a3;
|
|
uint8x16x2_t a4;
|
|
|
|
a0 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
|
|
: vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
|
|
a1 = vcombine_u8(a0, a0);
|
|
for(i = 0; i < trans_size; i++)
|
|
{
|
|
a2 = vld2q_u8(pu1_pred + i * pred_strd).val[e_chroma_plane];
|
|
a2 = (dc_value >= 0) ? vqaddq_u8(a2, a1) : vqsubq_u8(a2, a1);
|
|
a3 = vld2q_u8(pu1_dst + i * dst_strd).val[!e_chroma_plane];
|
|
a4 = (e_chroma_plane == 0) ? vzipq_u8(a2, a3) : vzipq_u8(a3, a2);
|
|
vst1q_u8(pu1_dst + i * dst_strd, a4.val[0]);
|
|
vst1q_u8(pu1_dst + i * dst_strd + 16, a4.val[1]);
|
|
}
|
|
}
|
|
|
|
void ihevce_itrans_recon_dc_neon(
|
|
UWORD8 *pu1_pred,
|
|
WORD32 pred_strd,
|
|
UWORD8 *pu1_dst,
|
|
WORD32 dst_strd,
|
|
WORD32 trans_size,
|
|
WORD16 i2_deq_value,
|
|
CHROMA_PLANE_ID_T e_chroma_plane)
|
|
{
|
|
WORD32 add, shift;
|
|
WORD32 dc_value;
|
|
|
|
shift = IT_SHIFT_STAGE_1;
|
|
add = 1 << (shift - 1);
|
|
dc_value = CLIP_S16((i2_deq_value * 64 + add) >> shift);
|
|
shift = IT_SHIFT_STAGE_2;
|
|
add = 1 << (shift - 1);
|
|
dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
|
|
|
|
switch(trans_size)
|
|
{
|
|
case 4:
|
|
if(NULL_PLANE == e_chroma_plane)
|
|
{
|
|
ihevce_itrans_recon_dc_4x4_luma_neon(pu1_pred, pred_strd, pu1_dst, dst_strd, dc_value);
|
|
}
|
|
else
|
|
{
|
|
ihevce_itrans_recon_dc_4x4_chroma_neon(
|
|
pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
|
|
}
|
|
break;
|
|
|
|
case 8:
|
|
if(NULL_PLANE == e_chroma_plane)
|
|
{
|
|
ihevce_itrans_recon_dc_8x8_luma_neon(
|
|
pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value);
|
|
}
|
|
else
|
|
{
|
|
ihevce_itrans_recon_dc_8x8_chroma_neon(
|
|
pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
|
|
}
|
|
break;
|
|
|
|
case 16:
|
|
if(NULL_PLANE == e_chroma_plane)
|
|
{
|
|
ihevce_itrans_recon_dc_16x16_luma_neon(
|
|
pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value);
|
|
}
|
|
else
|
|
{
|
|
ihevce_itrans_recon_dc_16x16_chroma_neon(
|
|
pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
|
|
}
|
|
break;
|
|
|
|
case 32:
|
|
if(NULL_PLANE == e_chroma_plane)
|
|
{
|
|
WORD32 b16;
|
|
|
|
for(b16 = 0; b16 < 4; b16++)
|
|
{
|
|
ihevce_itrans_recon_dc_16x16_luma_neon(
|
|
pu1_pred + ((b16 >> 1) * pred_strd * 16) + ((b16 & 1) * 16),
|
|
pred_strd,
|
|
pu1_dst + ((b16 >> 1) * dst_strd * 16) + ((b16 & 1) * 16),
|
|
dst_strd,
|
|
trans_size >> 1,
|
|
dc_value);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
WORD32 b16;
|
|
|
|
for(b16 = 0; b16 < 4; b16++)
|
|
{
|
|
ihevce_itrans_recon_dc_16x16_chroma_neon(
|
|
pu1_pred + ((b16 >> 1) * pred_strd * 16) + ((b16 & 1) * 32),
|
|
pred_strd,
|
|
pu1_dst + ((b16 >> 1) * dst_strd * 16) + ((b16 & 1) * 32),
|
|
dst_strd,
|
|
trans_size >> 1,
|
|
dc_value,
|
|
e_chroma_plane);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|