252 lines
8.3 KiB
C
252 lines
8.3 KiB
C
/******************************************************************************
|
|
*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at:
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
*****************************************************************************
|
|
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
*/
|
|
/**
|
|
*******************************************************************************
|
|
* @file
|
|
* ihevce_scale_by_2_neon.c
|
|
*
|
|
* @brief
|
|
* Contains definitions of functions for scale by 2
|
|
*
|
|
* @author
|
|
* Ittiam
|
|
*
|
|
* @par List of Functions:
|
|
*
|
|
* @remarks
|
|
* None
|
|
*
|
|
********************************************************************************
|
|
*/
|
|
/*****************************************************************************/
|
|
/* File Includes */
|
|
/*****************************************************************************/
|
|
/* System include files */
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <arm_neon.h>
|
|
|
|
/* System user files */
|
|
#include "ihevc_typedefs.h"
|
|
#include "ihevc_macros.h"
|
|
#include "itt_video_api.h"
|
|
#include "ihevce_ipe_instr_set_router.h"
|
|
|
|
/*****************************************************************************/
|
|
/* Constant Macros */
|
|
/*****************************************************************************/
|
|
#define FILT_TAP_Q 7
|
|
|
|
/*****************************************************************************/
|
|
/* Function Definitions */
|
|
/*****************************************************************************/
|
|
|
|
static void ihevce_horz_scale_neon_w16(
|
|
UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht)
|
|
{
|
|
const int16x8_t prec = vdupq_n_s16(8192);
|
|
const int16x8_t inv_prec = vdupq_n_s16(64);
|
|
const uint8x8_t wt_0 = vdup_n_u8(66);
|
|
const int8_t wt_1 = 40;
|
|
const int8_t wt_2 = 9;
|
|
WORD32 i, j;
|
|
|
|
for(j = 0; j < ht; j++)
|
|
{
|
|
UWORD8 *pu1_src_tmp = pu1_src + j * src_strd - 3;
|
|
UWORD8 *pu1_dst_tmp = pu1_dst + j * dst_strd;
|
|
|
|
for(i = 0; i < wd;)
|
|
{
|
|
uint8x16x2_t src = vld2q_u8(pu1_src_tmp);
|
|
uint8x8_t c, l0, r0, r3;
|
|
int16x8_t p, q, r;
|
|
int16x8_t sum;
|
|
|
|
c = vext_u8(vget_low_u8(src.val[1]), vget_high_u8(src.val[1]), 1);
|
|
l0 = vext_u8(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]), 1);
|
|
r0 = vext_u8(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]), 2);
|
|
r3 = vext_u8(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]), 3);
|
|
|
|
p = vreinterpretq_s16_u16(vmull_u8(c, wt_0)); // a[0] * 66
|
|
q = vreinterpretq_s16_u16(vaddl_u8(l0, r0));
|
|
q = vmulq_n_s16(q, wt_1); // (a[-1] + a[1]) * 40
|
|
r = vreinterpretq_s16_u16(vaddl_u8(r3, vget_low_u8(src.val[0])));
|
|
r = vmulq_n_s16(r, wt_2); // (a[-3] + a[3]) * 9
|
|
|
|
// a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9
|
|
sum = vsubq_s16(p, prec);
|
|
q = vsubq_s16(q, r);
|
|
sum = vaddq_s16(q, sum);
|
|
sum = vrshrq_n_s16(sum, FILT_TAP_Q);
|
|
sum = vaddq_s16(sum, inv_prec);
|
|
|
|
// result
|
|
c = vqmovun_s16(sum);
|
|
vst1_u8(pu1_dst_tmp, c);
|
|
|
|
i += 16;
|
|
pu1_src_tmp += 16;
|
|
pu1_dst_tmp += 8;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void ihevce_vert_scale_neon_w16(
|
|
UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht)
|
|
{
|
|
const int16x8_t prec = vdupq_n_s16(8192);
|
|
const int16x8_t inv_prec = vdupq_n_s16(64);
|
|
const uint8x8_t wt_0 = vdup_n_u8(66);
|
|
const int8_t wt_1 = 40;
|
|
const int8_t wt_2 = 9;
|
|
WORD32 i, j;
|
|
|
|
#define LOAD_ROW() \
|
|
{ \
|
|
src[mod8] = vld1q_u8(pu1_src_tmp); \
|
|
pu1_src_tmp += src_strd; \
|
|
mod8++; \
|
|
mod8 &= 7; \
|
|
}
|
|
|
|
for(i = 0; i < wd; i += 16)
|
|
{
|
|
UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd + i;
|
|
WORD32 lut_id = 0;
|
|
UWORD8 mod8 = 0;
|
|
uint8x16_t src[8];
|
|
|
|
LOAD_ROW() // r[-3]
|
|
LOAD_ROW() // r[-2]
|
|
LOAD_ROW() // r[-1]
|
|
LOAD_ROW() // r[0]
|
|
LOAD_ROW() // r[1]
|
|
|
|
for(j = 0; j < ht; j += 2)
|
|
{
|
|
UWORD8 *pu1_dst_tmp = pu1_dst + (j >> 1) * dst_strd + i;
|
|
UWORD8 c, t1, b1, t2, b2;
|
|
int16x8_t p, q, r;
|
|
int16x8_t sum;
|
|
|
|
LOAD_ROW() // r[2]
|
|
LOAD_ROW() // r[3]
|
|
|
|
t2 = (lut_id & 7);
|
|
t1 = (lut_id + 2) & 7;
|
|
c = (lut_id + 3) & 7;
|
|
b1 = (lut_id + 4) & 7;
|
|
b2 = (lut_id + 6) & 7;
|
|
lut_id += 2;
|
|
|
|
// a[0] * 66
|
|
p = vreinterpretq_s16_u16(vmull_u8(vget_low_u8(src[c]), wt_0));
|
|
// (a[-1] + a[1]) * 40
|
|
q = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(src[t1]), vget_low_u8(src[b1])));
|
|
q = vmulq_n_s16(q, wt_1);
|
|
// (a[-3] + a[3]) * 9
|
|
r = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(src[t2]), vget_low_u8(src[b2])));
|
|
r = vmulq_n_s16(r, wt_2);
|
|
|
|
// a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9
|
|
sum = vsubq_s16(p, prec);
|
|
q = vsubq_s16(q, r);
|
|
sum = vaddq_s16(q, sum);
|
|
sum = vrshrq_n_s16(sum, FILT_TAP_Q);
|
|
sum = vaddq_s16(sum, inv_prec);
|
|
|
|
vst1_u8(pu1_dst_tmp, vqmovun_s16(sum));
|
|
|
|
// a[0] * 66
|
|
p = vreinterpretq_s16_u16(vmull_u8(vget_high_u8(src[c]), wt_0));
|
|
// (a[-1] + a[1]) * 40
|
|
q = vreinterpretq_s16_u16(vaddl_u8(vget_high_u8(src[t1]), vget_high_u8(src[b1])));
|
|
q = vmulq_n_s16(q, wt_1);
|
|
// (a[-3] + a[3]) * 9
|
|
r = vreinterpretq_s16_u16(vaddl_u8(vget_high_u8(src[t2]), vget_high_u8(src[b2])));
|
|
r = vmulq_n_s16(r, wt_2);
|
|
|
|
// a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9
|
|
sum = vsubq_s16(p, prec);
|
|
q = vsubq_s16(q, r);
|
|
sum = vaddq_s16(q, sum);
|
|
sum = vrshrq_n_s16(sum, FILT_TAP_Q);
|
|
sum = vaddq_s16(sum, inv_prec);
|
|
|
|
vst1_u8(pu1_dst_tmp + 8, vqmovun_s16(sum));
|
|
|
|
pu1_dst_tmp += 16;
|
|
}
|
|
}
|
|
}
|
|
|
|
void ihevce_scaling_filter_mxn_neon(
|
|
UWORD8 *pu1_src,
|
|
WORD32 src_strd,
|
|
UWORD8 *pu1_scrtch,
|
|
WORD32 scrtch_strd,
|
|
UWORD8 *pu1_dst,
|
|
WORD32 dst_strd,
|
|
WORD32 ht,
|
|
WORD32 wd)
|
|
{
|
|
WORD32 i, j;
|
|
|
|
assert(wd >= 16 && wd % 16 == 0);
|
|
assert(ht % 2 == 0);
|
|
for(j = 0; j < ht;)
|
|
{
|
|
UWORD8 *pu1_src_tmp = pu1_src + j * src_strd;
|
|
UWORD8 *pu1_dst_tmp = pu1_dst + (j >> 1) * dst_strd;
|
|
WORD32 rows = MIN(64, (ht - j));
|
|
|
|
for(i = 0; i < wd;)
|
|
{
|
|
WORD32 cols;
|
|
|
|
if((wd - i) >= 64)
|
|
cols = 64;
|
|
else if((wd - i) >= 32)
|
|
cols = 32;
|
|
else
|
|
cols = 16;
|
|
|
|
ihevce_horz_scale_neon_w16(
|
|
pu1_src_tmp - 3 * src_strd + i,
|
|
src_strd,
|
|
pu1_scrtch,
|
|
scrtch_strd,
|
|
cols,
|
|
(3 + rows + 2));
|
|
ihevce_vert_scale_neon_w16(
|
|
pu1_scrtch + 3 * scrtch_strd,
|
|
scrtch_strd,
|
|
pu1_dst_tmp + (i >> 1),
|
|
dst_strd,
|
|
(cols >> 1),
|
|
rows);
|
|
i += cols;
|
|
}
|
|
j += rows;
|
|
}
|
|
}
|