/*****************************************************************************
 * quant.c: ppc quantization
 *****************************************************************************
 * Copyright (C) 2007-2025 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
|
|
#include "ppccommon.h"
|
|
#include "quant.h"
|
|
|
|
#if !HIGH_BIT_DEPTH
|
|
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
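/* Per coefficient this computes, in vector form (a scalar sketch of the same
 * math; cf. the C reference in common/quant.c):
 *
 *     sign   = dct[i] < 0;
 *     level  = ( abs(dct[i]) + bias[i] ) * mf[i] >> 16;
 *     dct[i] = sign ? -level : level;
 *
 * The sign is restored branchlessly: XORing with the "negative" compare mask
 * flips all bits of the affected lanes, and the masked +1 completes the two's
 * complement negation.  `nz` accumulates the OR of all results so the caller
 * can tell whether any coefficient survived quantization. */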
#define QUANT_16_U( idx0, idx1 )                                \
{                                                               \
    temp1v = vec_ld((idx0), dct);                               \
    temp2v = vec_ld((idx1), dct);                               \
    mfvA = vec_ld((idx0), mf);                                  \
    mfvB = vec_ld((idx1), mf);                                  \
    biasvA = vec_ld((idx0), bias);                              \
    biasvB = vec_ld((idx1), bias);                              \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    mskB = vec_cmplt(temp2v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_abs( temp1v );                      \
    coefvB = (vec_u16_t)vec_abs( temp2v );                      \
    coefvA = vec_adds(coefvA, biasvA);                          \
    coefvB = vec_adds(coefvB, biasvB);                          \
    multEvenvA = vec_mule(coefvA, mfvA);                        \
    multOddvA = vec_mulo(coefvA, mfvA);                         \
    multEvenvB = vec_mule(coefvB, mfvB);                        \
    multOddvB = vec_mulo(coefvB, mfvB);                         \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                  \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                    \
    temp1v = (vec_s16_t) vec_packs( multEvenvA, multOddvA );    \
    tmpv = xxpermdi( temp1v, temp1v, 2 );                       \
    temp1v = vec_mergeh( temp1v, tmpv );                        \
    temp2v = (vec_s16_t) vec_packs( multEvenvB, multOddvB );    \
    tmpv = xxpermdi( temp2v, temp2v, 2 );                       \
    temp2v = vec_mergeh( temp2v, tmpv );                        \
    temp1v = vec_xor(temp1v, mskA);                             \
    temp2v = vec_xor(temp2v, mskB);                             \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));              \
    vec_st(temp1v, (idx0), dct);                                \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));              \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                    \
    vec_st(temp2v, (idx1), dct);                                \
}

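/* Quantize one 4x4 block of DCT coefficients in place.  Returns nonzero if at
 * least one quantized coefficient is nonzero, zero otherwise. */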
int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv = vec_splats( (uint32_t)16 );
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v, tmpv;

    QUANT_16_U( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

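/* Quantize four consecutive 4x4 blocks that share the same mf/bias tables.
 * Returns a 4-bit mask with bit i set if block i has any nonzero coefficient. */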
int x264_quant_4x4x4_altivec( dctcoef dcta[4][16], udctcoef mf[16], udctcoef bias[16] )
{
    LOAD_ZERO;
    vec_u32_t i_qbitsv = vec_splats( (uint32_t)16 );
    vec_s16_t one = vec_splat_s16( 1 );
    vec_s16_t nz0, nz1, nz2, nz3;

    vector bool short mskA0;
    vec_u16_t coefvA0;
    vec_u32_t multEvenvA0, multOddvA0;
    vec_u16_t mfvA0;
    vec_u16_t biasvA0;
    vector bool short mskB0;
    vec_u16_t coefvB0;
    vec_u32_t multEvenvB0, multOddvB0;
    vec_u16_t mfvB0;
    vec_u16_t biasvB0;

    vector bool short mskA1;
    vec_u16_t coefvA1;
    vec_u32_t multEvenvA1, multOddvA1;
    vec_u16_t mfvA1;
    vec_u16_t biasvA1;
    vector bool short mskB1;
    vec_u16_t coefvB1;
    vec_u32_t multEvenvB1, multOddvB1;
    vec_u16_t mfvB1;
    vec_u16_t biasvB1;

    vector bool short mskA2;
    vec_u16_t coefvA2;
    vec_u32_t multEvenvA2, multOddvA2;
    vec_u16_t mfvA2;
    vec_u16_t biasvA2;
    vector bool short mskB2;
    vec_u16_t coefvB2;
    vec_u32_t multEvenvB2, multOddvB2;
    vec_u16_t mfvB2;
    vec_u16_t biasvB2;

    vector bool short mskA3;
    vec_u16_t coefvA3;
    vec_u32_t multEvenvA3, multOddvA3;
    vec_u16_t mfvA3;
    vec_u16_t biasvA3;
    vector bool short mskB3;
    vec_u16_t coefvB3;
    vec_u32_t multEvenvB3, multOddvB3;
    vec_u16_t mfvB3;
    vec_u16_t biasvB3;

    vec_s16_t temp1v, temp2v;
    vec_s16_t tmpv0;
    vec_s16_t tmpv1;

    dctcoef *dct0 = dcta[0];
    dctcoef *dct1 = dcta[1];
    dctcoef *dct2 = dcta[2];
    dctcoef *dct3 = dcta[3];

    temp1v = vec_ld( 0, dct0 );
    temp2v = vec_ld( 16, dct0 );
    mfvA0 = vec_ld( 0, mf );
    mfvB0 = vec_ld( 16, mf );
    biasvA0 = vec_ld( 0, bias );
    biasvB0 = vec_ld( 16, bias );
    mskA0 = vec_cmplt( temp1v, zero_s16v );
    mskB0 = vec_cmplt( temp2v, zero_s16v );
    coefvA0 = (vec_u16_t)vec_abs( temp1v );
    coefvB0 = (vec_u16_t)vec_abs( temp2v );
    temp1v = vec_ld( 0, dct1 );
    temp2v = vec_ld( 16, dct1 );
    mfvA1 = vec_ld( 0, mf );
    mfvB1 = vec_ld( 16, mf );
    biasvA1 = vec_ld( 0, bias );
    biasvB1 = vec_ld( 16, bias );
    mskA1 = vec_cmplt( temp1v, zero_s16v );
    mskB1 = vec_cmplt( temp2v, zero_s16v );
    coefvA1 = (vec_u16_t)vec_abs( temp1v );
    coefvB1 = (vec_u16_t)vec_abs( temp2v );
    temp1v = vec_ld( 0, dct2 );
    temp2v = vec_ld( 16, dct2 );
    mfvA2 = vec_ld( 0, mf );
    mfvB2 = vec_ld( 16, mf );
    biasvA2 = vec_ld( 0, bias );
    biasvB2 = vec_ld( 16, bias );
    mskA2 = vec_cmplt( temp1v, zero_s16v );
    mskB2 = vec_cmplt( temp2v, zero_s16v );
    coefvA2 = (vec_u16_t)vec_abs( temp1v );
    coefvB2 = (vec_u16_t)vec_abs( temp2v );
    temp1v = vec_ld( 0, dct3 );
    temp2v = vec_ld( 16, dct3 );
    mfvA3 = vec_ld( 0, mf );
    mfvB3 = vec_ld( 16, mf );
    biasvA3 = vec_ld( 0, bias );
    biasvB3 = vec_ld( 16, bias );
    mskA3 = vec_cmplt( temp1v, zero_s16v );
    mskB3 = vec_cmplt( temp2v, zero_s16v );
    coefvA3 = (vec_u16_t)vec_abs( temp1v );
    coefvB3 = (vec_u16_t)vec_abs( temp2v );

    coefvA0 = vec_adds( coefvA0, biasvA0 );
    coefvB0 = vec_adds( coefvB0, biasvB0 );
    coefvA1 = vec_adds( coefvA1, biasvA1 );
    coefvB1 = vec_adds( coefvB1, biasvB1 );
    coefvA2 = vec_adds( coefvA2, biasvA2 );
    coefvB2 = vec_adds( coefvB2, biasvB2 );
    coefvA3 = vec_adds( coefvA3, biasvA3 );
    coefvB3 = vec_adds( coefvB3, biasvB3 );

    multEvenvA0 = vec_mule( coefvA0, mfvA0 );
    multOddvA0 = vec_mulo( coefvA0, mfvA0 );
    multEvenvB0 = vec_mule( coefvB0, mfvB0 );
    multOddvB0 = vec_mulo( coefvB0, mfvB0 );
    multEvenvA0 = vec_sr( multEvenvA0, i_qbitsv );
    multOddvA0 = vec_sr( multOddvA0, i_qbitsv );
    multEvenvB0 = vec_sr( multEvenvB0, i_qbitsv );
    multOddvB0 = vec_sr( multOddvB0, i_qbitsv );
    temp1v = (vec_s16_t)vec_packs( multEvenvA0, multOddvA0 );
    temp2v = (vec_s16_t)vec_packs( multEvenvB0, multOddvB0 );
    tmpv0 = xxpermdi( temp1v, temp1v, 2 );
    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
    temp1v = vec_mergeh( temp1v, tmpv0 );
    temp2v = vec_mergeh( temp2v, tmpv1 );
    temp1v = vec_xor( temp1v, mskA0 );
    temp2v = vec_xor( temp2v, mskB0 );
    temp1v = vec_adds( temp1v, vec_and( mskA0, one ) );
    temp2v = vec_adds( temp2v, vec_and( mskB0, one ) );
    vec_st( temp1v, 0, dct0 );
    vec_st( temp2v, 16, dct0 );
    nz0 = vec_or( temp1v, temp2v );

    multEvenvA1 = vec_mule( coefvA1, mfvA1 );
    multOddvA1 = vec_mulo( coefvA1, mfvA1 );
    multEvenvB1 = vec_mule( coefvB1, mfvB1 );
    multOddvB1 = vec_mulo( coefvB1, mfvB1 );
    multEvenvA1 = vec_sr( multEvenvA1, i_qbitsv );
    multOddvA1 = vec_sr( multOddvA1, i_qbitsv );
    multEvenvB1 = vec_sr( multEvenvB1, i_qbitsv );
    multOddvB1 = vec_sr( multOddvB1, i_qbitsv );
    temp1v = (vec_s16_t)vec_packs( multEvenvA1, multOddvA1 );
    temp2v = (vec_s16_t)vec_packs( multEvenvB1, multOddvB1 );
    tmpv0 = xxpermdi( temp1v, temp1v, 2 );
    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
    temp1v = vec_mergeh( temp1v, tmpv0 );
    temp2v = vec_mergeh( temp2v, tmpv1 );
    temp1v = vec_xor( temp1v, mskA1 );
    temp2v = vec_xor( temp2v, mskB1 );
    temp1v = vec_adds( temp1v, vec_and( mskA1, one ) );
    temp2v = vec_adds( temp2v, vec_and( mskB1, one ) );
    vec_st( temp1v, 0, dct1 );
    vec_st( temp2v, 16, dct1 );
    nz1 = vec_or( temp1v, temp2v );

    multEvenvA2 = vec_mule( coefvA2, mfvA2 );
    multOddvA2 = vec_mulo( coefvA2, mfvA2 );
    multEvenvB2 = vec_mule( coefvB2, mfvB2 );
    multOddvB2 = vec_mulo( coefvB2, mfvB2 );
    multEvenvA2 = vec_sr( multEvenvA2, i_qbitsv );
    multOddvA2 = vec_sr( multOddvA2, i_qbitsv );
    multEvenvB2 = vec_sr( multEvenvB2, i_qbitsv );
    multOddvB2 = vec_sr( multOddvB2, i_qbitsv );
    temp1v = (vec_s16_t)vec_packs( multEvenvA2, multOddvA2 );
    temp2v = (vec_s16_t)vec_packs( multEvenvB2, multOddvB2 );
    tmpv0 = xxpermdi( temp1v, temp1v, 2 );
    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
    temp1v = vec_mergeh( temp1v, tmpv0 );
    temp2v = vec_mergeh( temp2v, tmpv1 );
    temp1v = vec_xor( temp1v, mskA2 );
    temp2v = vec_xor( temp2v, mskB2 );
    temp1v = vec_adds( temp1v, vec_and( mskA2, one ) );
    temp2v = vec_adds( temp2v, vec_and( mskB2, one ) );
    vec_st( temp1v, 0, dct2 );
    vec_st( temp2v, 16, dct2 );
    nz2 = vec_or( temp1v, temp2v );

    multEvenvA3 = vec_mule( coefvA3, mfvA3 );
    multOddvA3 = vec_mulo( coefvA3, mfvA3 );
    multEvenvB3 = vec_mule( coefvB3, mfvB3 );
    multOddvB3 = vec_mulo( coefvB3, mfvB3 );
    multEvenvA3 = vec_sr( multEvenvA3, i_qbitsv );
    multOddvA3 = vec_sr( multOddvA3, i_qbitsv );
    multEvenvB3 = vec_sr( multEvenvB3, i_qbitsv );
    multOddvB3 = vec_sr( multOddvB3, i_qbitsv );
    temp1v = (vec_s16_t)vec_packs( multEvenvA3, multOddvA3 );
    temp2v = (vec_s16_t)vec_packs( multEvenvB3, multOddvB3 );
    tmpv0 = xxpermdi( temp1v, temp1v, 2 );
    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
    temp1v = vec_mergeh( temp1v, tmpv0 );
    temp2v = vec_mergeh( temp2v, tmpv1 );
    temp1v = vec_xor( temp1v, mskA3 );
    temp2v = vec_xor( temp2v, mskB3 );
    temp1v = vec_adds( temp1v, vec_and( mskA3, one ) );
    temp2v = vec_adds( temp2v, vec_and( mskB3, one ) );
    vec_st( temp1v, 0, dct3 );
    vec_st( temp2v, 16, dct3 );
    nz3 = vec_or( temp1v, temp2v );

    return (vec_any_ne( nz0, zero_s16v ) << 0) | (vec_any_ne( nz1, zero_s16v ) << 1) |
           (vec_any_ne( nz2, zero_s16v ) << 2) | (vec_any_ne( nz3, zero_s16v ) << 3);
}

// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
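/* The DC variants differ from QUANT_16_U in that a single scalar mf/bias pair
 * is splatted across the whole vector: every DC coefficient of the block is
 * quantized with the same factor and rounding offset. */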
#define QUANT_16_U_DC( idx0, idx1 )                             \
{                                                               \
    temp1v = vec_ld((idx0), dct);                               \
    temp2v = vec_ld((idx1), dct);                               \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    mskB = vec_cmplt(temp2v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \
    coefvA = vec_add(coefvA, biasv);                            \
    coefvB = vec_add(coefvB, biasv);                            \
    multEvenvA = vec_mule(coefvA, mfv);                         \
    multOddvA = vec_mulo(coefvA, mfv);                          \
    multEvenvB = vec_mule(coefvB, mfv);                         \
    multOddvB = vec_mulo(coefvB, mfv);                          \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                  \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                    \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                             \
    temp2v = vec_xor(temp2v, mskB);                             \
    temp1v = vec_add(temp1v, vec_and(mskA, one));               \
    vec_st(temp1v, (idx0), dct);                                \
    temp2v = vec_add(temp2v, vec_and(mskB, one));               \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                    \
    vec_st(temp2v, (idx1), dct);                                \
}

int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    mfv = vec_splats( (uint16_t)mf );
    i_qbitsv = vec_splats( (uint32_t)16 );
    biasv = vec_splats( (uint16_t)bias );

    QUANT_16_U_DC( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

// DC quant of a whole 2x2 block
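/* Only the first four 16-bit lanes hold the 2x2 DC coefficients: vec_sel with
 * `sel` writes back just those lanes, and the caller's vec_and against mask2
 * limits the nonzero test to them as well. */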
#define QUANT_4_U_DC( idx0 )                                    \
{                                                               \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);  \
    temp1v = vec_ld((idx0), dct);                               \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
    coefvA = vec_add(coefvA, biasv);                            \
    multEvenvA = vec_mule(coefvA, mfv);                         \
    multOddvA = vec_mulo(coefvA, mfv);                          \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                             \
    temp2v = vec_add(temp2v, vec_and(mskA, one));               \
    temp1v = vec_sel(temp1v, temp2v, sel);                      \
    nz = vec_or(nz, temp1v);                                    \
    vec_st(temp1v, (idx0), dct);                                \
}

int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;
    static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    mfv = vec_splats( (uint16_t)mf );
    i_qbitsv = vec_splats( (uint32_t)16 );
    biasv = vec_splats( (uint16_t)bias );

    QUANT_4_U_DC(0);
    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
}

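/* Quantize a full 8x8 block: four passes of QUANT_16_U cover the 64
 * coefficients, 16 at a time (two vectors per pass). */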
int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v, tmpv;

    i_qbitsv = vec_splats( (uint32_t)16 );

    for( int i = 0; i < 4; i++ )
        QUANT_16_U( i*2*16, i*2*16+16 );
    return vec_any_ne(nz, zero_s16v);
}

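/* Dequant for i_qbits >= 0: dct[i] = ( dct[i] * dequant_mf[i_mf][i] ) << i_qbits.
 * The 32-bit dequant factors are narrowed to 16 bits with vec_packs before the
 * 16x16-bit multiply; the products are then repacked into coefficient order. */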
#define DEQUANT_SHL()                                           \
{                                                               \
    dctv = vec_ld(8*y, dct);                                    \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                      \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                   \
    mfv = vec_packs(mf1v, mf2v);                                \
                                                                \
    multEvenvA = vec_mule(dctv, mfv);                           \
    multOddvA = vec_mulo(dctv, mfv);                            \
    dctv = (vec_s16_t) vec_packs( multEvenvA, multOddvA );      \
    tmpv = xxpermdi( dctv, dctv, 2 );                           \
    dctv = vec_mergeh( dctv, tmpv );                            \
    dctv = vec_sl(dctv, i_qbitsv);                              \
    vec_st(dctv, 8*y, dct);                                     \
}

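/* vec_mule/vec_mulo pair even/odd elements using big-endian lane numbering.
 * On little-endian targets the two intrinsics effectively swap roles, so they
 * are remapped here to keep the product reassembly in DEQUANT_SHR in memory
 * order. */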
#ifdef WORDS_BIGENDIAN
#define VEC_MULE vec_mule
#define VEC_MULO vec_mulo
#else
#define VEC_MULE vec_mulo
#define VEC_MULO vec_mule
#endif

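/* Dequant for i_qbits < 0: dct[i] = ( dct[i] * dequant_mf[i_mf][i] + f ) >> (-i_qbits),
 * with f = 1 << (-i_qbits-1) as the rounding offset.  The 32-bit product of the
 * 16-bit coefficient and the 32-bit factor is rebuilt from the high and low
 * halfword products as (hi << 16) + lo. */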
#define DEQUANT_SHR()                                           \
{                                                               \
    dctv = vec_ld(8*y, dct);                                    \
    dct1v = vec_mergeh(dctv, dctv);                             \
    dct2v = vec_mergel(dctv, dctv);                             \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                      \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                   \
                                                                \
    multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v);              \
    multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v);               \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);  \
    temp1v = vec_add(temp1v, fv);                               \
    temp1v = vec_sra(temp1v, i_qbitsv);                         \
                                                                \
    multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v);              \
    multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v);               \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);  \
    temp2v = vec_add(temp2v, fv);                               \
    temp2v = vec_sra(temp2v, i_qbitsv);                         \
                                                                \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);                \
    vec_st(dctv, y*8, dct);                                     \
}

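/* Dequantize a 4x4 block in place.  i_qbits = i_qp/6 - 4: for large enough QP
 * the scale is a pure left shift (DEQUANT_SHL), otherwise a rounded right
 * shift (DEQUANT_SHR).  Each loop iteration handles one 8-coefficient vector
 * (two rows). */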
void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 4;

    vec_s16_t dctv, tmpv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        i_qbitsv = vec_splats( (uint16_t)i_qbits );

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        fv = vec_splats( f );

        vec_u32_t i_qbitsv;
        i_qbitsv = vec_splats( (uint32_t)-i_qbits );

        vec_u32_t sixteenv;
        sixteenv = vec_splats( (uint32_t)16 );

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHR();
    }
}

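/* Same as the 4x4 version, but with i_qbits = i_qp/6 - 6 and 64 coefficients:
 * the loop covers eight 8-coefficient vectors per pass. */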
void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 6;

    vec_s16_t dctv, tmpv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        i_qbitsv = vec_splats( (uint16_t)i_qbits );

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        fv = vec_splats( f );

        vec_u32_t i_qbitsv;
        i_qbitsv = vec_splats( (uint32_t)-i_qbits );

        vec_u32_t sixteenv;
        sixteenv = vec_splats( (uint32_t)16 );

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHR();
    }
}

#endif // !HIGH_BIT_DEPTH