/*****************************************************************************
 * mc-a.S: LoongArch motion compensation
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "loongson_asm.S"
|
|
#include "loongson_util.S"
|
|
|
|
const ch_shuf
|
|
.byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9
|
|
.byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9
|
|
endconst
|
|
|
|
const pw_1024
|
|
.rept 16
|
|
.short 1024
|
|
.endr
|
|
endconst
|
|
|
|
const filt_mul20
|
|
.rept 32
|
|
.byte 20
|
|
.endr
|
|
endconst
|
|
|
|
const filt_mul15
|
|
.rept 16
|
|
.byte 1, -5
|
|
.endr
|
|
endconst
|
|
|
|
const filt_mul51
|
|
.rept 16
|
|
.byte -5, 1
|
|
.endr
|
|
endconst
|
|
|
|
const hpel_shuf
|
|
.rept 2
|
|
.byte 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_12
|
|
.rept 2
|
|
.byte 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_14
|
|
.rept 2
|
|
.byte 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_15
|
|
.rept 2
|
|
.byte 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_1
|
|
.rept 2
|
|
.byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_2
|
|
.rept 2
|
|
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_3
|
|
.rept 2
|
|
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_4
|
|
.rept 2
|
|
.byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
|
|
.endr
|
|
endconst
|
|
|
|
const shuf_6
|
|
.rept 2
|
|
.byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
.endr
|
|
endconst
|
|
|
|
#if !HIGH_BIT_DEPTH
|
|
|
|
.macro MC_CHROMA_START
    srai.d t0, a5, 3
    srai.d t1, a6, 3
    slli.d t0, t0, 1
    mul.d  t1, t1, a4
    add.d  t1, t1, t0
    add.d  a3, a3, t1 /* src += (m_vy >> 3) * i_src_stride + (m_vx >> 3) * 2 */
.endm

/*
 * void mc_chroma( uint8_t *p_dst_u, uint8_t *p_dst_v,
 *                 intptr_t i_dst_stride,
 *                 uint8_t *p_src, intptr_t i_src_stride,
 *                 int32_t m_vx, int32_t m_vy,
 *                 int32_t i_width, int32_t i_height )
 */
function_x264 mc_chroma_lasx
    MC_CHROMA_START
    andi    a5, a5, 0x07 /* m_vx & 0x07 */
    andi    a6, a6, 0x07 /* m_vy & 0x07 */
    move    t0, a5
    slli.d  t0, t0, 8
    sub.d   t0, t0, a5
    li.d    a5, 8
    addi.d  t0, t0, 8
    sub.d   a5, a5, a6
    mul.d   a6, a6, t0 /* (x * 255 + 8) * y */
    mul.d   a5, a5, t0 /* (x * 255 + 8) * (8 - y) */
    xvreplgr2vr.h xr6, a6 /* cD cC ... cD cC */
    xvreplgr2vr.h xr7, a5 /* cB cA ... cB cA */
    la.local t0, ch_shuf
    xvld    xr5, t0, 0
    addi.d  t0, a7, -4
    ldptr.w a7, sp, 0 /* a7 = i_height */
    slli.d  t1, a4, 1
    blt     zero, t0, .L_WIDTH8
.L_LOOP4:
    vld     vr0, a3, 0
    vldx    vr1, a3, a4
    vldx    vr2, a3, t1
    xvpermi.q  xr0, xr1, 0x02
    xvpermi.q  xr1, xr2, 0x02
    xvshuf.b   xr0, xr0, xr0, xr5
    xvshuf.b   xr1, xr1, xr1, xr5
    xvdp2.h.bu xr2, xr0, xr7
    xvdp2.h.bu xr3, xr1, xr6
    xvadd.h    xr0, xr2, xr3
    xvssrlrni.bu.h xr0, xr0, 6
    xvstelm.w  xr0, a0, 0, 0
    xvstelm.w  xr0, a1, 0, 1
    add.d   a0, a0, a2
    add.d   a1, a1, a2
    xvstelm.w  xr0, a0, 0, 4
    xvstelm.w  xr0, a1, 0, 5
    add.d   a0, a0, a2
    add.d   a1, a1, a2
    add.d   a3, a3, t1
    addi.d  a7, a7, -2
    blt     zero, a7, .L_LOOP4
    b       .ENDFUNC
.L_WIDTH8:
    xvld       xr0, a3, 0
    xvpermi.d  xr0, xr0, 0x94
    xvshuf.b   xr0, xr0, xr0, xr5
.L_LOOP8:
    xvldx      xr3, a3, a4
    xvpermi.d  xr3, xr3, 0x94
    xvshuf.b   xr3, xr3, xr3, xr5
    xvdp2.h.bu xr1, xr0, xr7
    xvdp2.h.bu xr2, xr3, xr6
    xvdp2.h.bu xr8, xr3, xr7

    xvldx      xr0, a3, t1
    xvpermi.d  xr0, xr0, 0x94
    xvshuf.b   xr0, xr0, xr0, xr5
    xvdp2.h.bu xr4, xr0, xr6
    xvadd.h    xr1, xr1, xr2
    xvadd.h    xr3, xr8, xr4

    xvssrlrni.bu.h xr3, xr1, 6

    xvpermi.q  xr4, xr3, 0x01
    xvpackev.w xr8, xr4, xr3
    xvpackod.w xr9, xr4, xr3
    vstelm.d   vr8, a0, 0, 0
    vstelm.d   vr9, a1, 0, 0
    add.d   a0, a0, a2
    add.d   a1, a1, a2
    vstelm.d   vr8, a0, 0, 1
    vstelm.d   vr9, a1, 0, 1

    addi.d  a7, a7, -2
    add.d   a0, a0, a2
    add.d   a1, a1, a2
    add.d   a3, a3, t1
    blt     zero, a7, .L_LOOP8
.ENDFUNC:
endfunc_x264
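
/*
 * Reference sketch (not part of the build): the coefficient setup above packs
 * the four bilinear weights cA = (8-x)(8-y), cB = x(8-y), cC = (8-x)y and
 * cD = xy into halfwords, using (255x + 8)(8-y) = cA + (cB << 8) and
 * (255x + 8)y = cC + (cD << 8).  In plain C the routine computes, for an
 * interleaved (U,V) chroma source, roughly the following; the function and
 * variable names are illustrative, not taken from the x264 sources.
 *
 *   void mc_chroma_ref( uint8_t *dst_u, uint8_t *dst_v, intptr_t i_dst_stride,
 *                       uint8_t *src, intptr_t i_src_stride,
 *                       int mvx, int mvy, int width, int height )
 *   {
 *       int x = mvx & 7, y = mvy & 7;
 *       int cA = (8-x)*(8-y), cB = x*(8-y), cC = (8-x)*y, cD = x*y;
 *       src += (mvy >> 3) * i_src_stride + (mvx >> 3) * 2;
 *       for( int i = 0; i < height; i++ )
 *       {
 *           for( int j = 0; j < width; j++ )
 *           {
 *               const uint8_t *s = src + 2*j;
 *               dst_u[j] = ( cA*s[0] + cB*s[2]
 *                          + cC*s[i_src_stride] + cD*s[i_src_stride+2] + 32 ) >> 6;
 *               dst_v[j] = ( cA*s[1] + cB*s[3]
 *                          + cC*s[i_src_stride+1] + cD*s[i_src_stride+3] + 32 ) >> 6;
 *           }
 *           src   += i_src_stride;
 *           dst_u += i_dst_stride;
 *           dst_v += i_dst_stride;
 *       }
 *   }
 */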
|
|
|
|
.macro PIXEL_AVG_START
|
|
slli.d t0, a3, 1
|
|
add.w t1, t0, a3
|
|
slli.d t2, a3, 2
|
|
slli.d t3, a5, 1
|
|
add.w t4, t3, a5
|
|
slli.d t5, a5, 2
|
|
slli.d t6, a1, 1
|
|
add.w t7, t6, a1
|
|
slli.d t8, a1, 2
|
|
.endm
|
|
|
|
.macro BIWEIGHT_AVG_START
|
|
addi.d t0, zero, 64
|
|
sub.d t0, t0, a6
|
|
xvreplgr2vr.b xr0, a6
|
|
xvreplgr2vr.b xr1, t0
|
|
xvpackev.b xr8, xr1, xr0
|
|
xvxor.v xr9, xr9, xr9
|
|
xvaddi.hu xr9, xr9, 6
|
|
.endm
|
|
|
|
.macro BIWEIGHT_AVG_CORE a, b
|
|
xvpermi.d \a, \a, 0x50
|
|
xvpermi.d \b, \b, 0x50
|
|
xvilvl.b \a, \b, \a
|
|
xvmulwev.h.bu.b \b, \a, xr8
|
|
xvmaddwod.h.bu.b \b, \a, xr8
|
|
xvssrarn.bu.h \b, \b, xr9
|
|
xvpermi.d \b, \b, 0x08
|
|
.endm
|
|
|
|
.macro PIXEL_AVG_START_W8
|
|
slli.d t0, a3, 1
|
|
add.w t1, t0, a3
|
|
slli.d t3, a5, 1
|
|
add.w t4, t3, a5
|
|
.endm
|
|
|
|
function_x264 pixel_avg_weight_w4_lasx
|
|
addi.d t0, zero, 64
|
|
sub.d t0, t0, a6
|
|
vreplgr2vr.b vr0, a6
|
|
vreplgr2vr.b vr1, t0
|
|
vpackev.b vr8, vr1, vr0
|
|
.LOOP_HEIGHT_W4_1:
|
|
fld.s f0, a2, 0
|
|
fldx.s f1, a2, a3
|
|
fld.s f2, a4, 0
|
|
fldx.s f3, a4, a5
|
|
vilvl.w vr0, vr1, vr0
|
|
vilvl.w vr2, vr3, vr2
|
|
vilvl.b vr0, vr2, vr0
|
|
vmulwev.h.bu.b vr1, vr0, vr8
|
|
vmaddwod.h.bu.b vr1, vr0, vr8
|
|
vssrarni.bu.h vr1, vr1, 6
|
|
fst.s f1, a0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.w vr1, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a5, a4, 1
|
|
addi.w a7, a7, -2
|
|
bnez a7, .LOOP_HEIGHT_W4_1
|
|
endfunc_x264
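
/*
 * Reference sketch (illustrative): the pixel_avg_weight_w* kernels blend the
 * two predictions with weights i_weight and 64 - i_weight, round by 32 and
 * shift by 6, which is what the vmulwev/vmaddwod + vssrarni.bu.h sequence
 * above computes per byte.  Names below are illustrative.
 *
 *   static inline uint8_t clip_u8( int v ) { return v < 0 ? 0 : v > 255 ? 255 : v; }
 *
 *   void pixel_avg_weight_ref( uint8_t *dst,  intptr_t i_dst,
 *                              uint8_t *src1, intptr_t i_src1,
 *                              uint8_t *src2, intptr_t i_src2,
 *                              int width, int height, int i_weight )
 *   {
 *       for( int y = 0; y < height; y++ )
 *       {
 *           for( int x = 0; x < width; x++ )
 *               dst[x] = clip_u8( ( src1[x]*i_weight + src2[x]*(64 - i_weight) + 32 ) >> 6 );
 *           dst  += i_dst;
 *           src1 += i_src1;
 *           src2 += i_src2;
 *       }
 *   }
 */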
|
|
|
|
function_x264 pixel_avg_w4_lasx
|
|
.LOOP_HEIGHT_W4:
|
|
fld.s f0, a2, 0
|
|
fldx.s f1, a2, a3
|
|
fld.s f4, a4, 0
|
|
fldx.s f5, a4, a5
|
|
vilvl.w vr0, vr1, vr0
|
|
vilvl.w vr4, vr5, vr4
|
|
vavgr.bu vr0, vr0, vr4
|
|
fst.s f0, a0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.w vr0, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a5, a4, 1
|
|
addi.w a7, a7, -2
|
|
bnez a7, .LOOP_HEIGHT_W4
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_weight_w8_lasx
|
|
addi.d t0, zero, 64
|
|
sub.d t0, t0, a6
|
|
xvreplgr2vr.b xr0, a6
|
|
xvreplgr2vr.b xr1, t0
|
|
xvpackev.b xr8, xr1, xr0
|
|
PIXEL_AVG_START_W8
|
|
.LOOP_HEIGHT_W8_1:
|
|
fld.d f0, a2, 0
|
|
fldx.d f1, a2, a3
|
|
fldx.d f2, a2, t0
|
|
fldx.d f3, a2, t1
|
|
fld.d f4, a4, 0
|
|
fldx.d f5, a4, a5
|
|
fldx.d f6, a4, t3
|
|
fldx.d f7, a4, t4
|
|
vilvl.b vr0, vr4, vr0
|
|
vilvl.b vr1, vr5, vr1
|
|
vilvl.b vr2, vr6, vr2
|
|
vilvl.b vr3, vr7, vr3
|
|
xvpermi.q xr1, xr0, 0x20
|
|
xvpermi.q xr3, xr2, 0x20
|
|
xvmulwev.h.bu.b xr2, xr1, xr8
|
|
xvmaddwod.h.bu.b xr2, xr1, xr8
|
|
xvmulwev.h.bu.b xr4, xr3, xr8
|
|
xvmaddwod.h.bu.b xr4, xr3, xr8
|
|
xvssrarni.bu.h xr4, xr2, 6
|
|
fst.d f4, a0, 0
|
|
add.d a0, a0, a1
|
|
xvstelm.d xr4, a0, 0, 2
|
|
add.d a0, a0, a1
|
|
xvstelm.d xr4, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
xvstelm.d xr4, a0, 0, 3
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 2
|
|
alsl.d a4, a5, a4, 2
|
|
addi.w a7, a7, -4
|
|
bnez a7, .LOOP_HEIGHT_W8_1
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_w8_lasx
|
|
PIXEL_AVG_START_W8
|
|
.LOOP_HEIGHT_W8:
|
|
fld.d f0, a2, 0
|
|
fldx.d f1, a2, a3
|
|
fldx.d f2, a2, t0
|
|
fldx.d f3, a2, t1
|
|
fld.d f4, a4, 0
|
|
fldx.d f5, a4, a5
|
|
fldx.d f6, a4, t3
|
|
fldx.d f7, a4, t4
|
|
vilvl.d vr0, vr1, vr0
|
|
vilvl.d vr2, vr3, vr2
|
|
vilvl.d vr4, vr5, vr4
|
|
vilvl.d vr6, vr7, vr6
|
|
vavgr.bu vr0, vr0, vr4
|
|
vavgr.bu vr2, vr2, vr6
|
|
fst.d f0, a0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.d vr0, a0, 0, 1
|
|
fstx.d f2, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
vstelm.d vr2, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 2
|
|
alsl.d a4, a5, a4, 2
|
|
addi.w a7, a7, -4
|
|
bnez a7, .LOOP_HEIGHT_W8
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_weight_w16_lasx
|
|
BIWEIGHT_AVG_START
|
|
PIXEL_AVG_START
|
|
.L_HEIGHT_LOOP_T:
|
|
LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3
|
|
LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7
|
|
BIWEIGHT_AVG_CORE xr0, xr4
|
|
BIWEIGHT_AVG_CORE xr1, xr5
|
|
vst vr4, a0, 0
|
|
vstx vr5, a0, a1
|
|
BIWEIGHT_AVG_CORE xr2, xr6
|
|
BIWEIGHT_AVG_CORE xr3, xr7
|
|
vstx vr6, a0, t6
|
|
vstx vr7, a0, t7
|
|
add.d a2, a2, t2
|
|
add.d a4, a4, t5
|
|
add.d a0, a0, t8
|
|
addi.d a7, a7, -4
|
|
bnez a7, .L_HEIGHT_LOOP_T
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_w16_lasx
|
|
PIXEL_AVG_START
|
|
.L_HEIGHT_LOOP:
|
|
vld vr0, a2, 0
|
|
vldx vr1, a2, a3
|
|
vldx vr2, a2, t0
|
|
vldx vr3, a2, t1
|
|
vld vr4, a4, 0
|
|
vldx vr5, a4, a5
|
|
vldx vr6, a4, t3
|
|
vldx vr7, a4, t4
|
|
vavgr.bu vr0, vr0, vr4
|
|
vavgr.bu vr1, vr1, vr5
|
|
vavgr.bu vr2, vr2, vr6
|
|
vavgr.bu vr3, vr3, vr7
|
|
vst vr0, a0, 0
|
|
vstx vr1, a0, a1
|
|
vstx vr2, a0, t6
|
|
vstx vr3, a0, t7
|
|
add.d a0, a0, t8
|
|
add.d a2, a2, t2
|
|
add.d a4, a4, t5
|
|
|
|
vld vr0, a2, 0
|
|
vldx vr1, a2, a3
|
|
vldx vr2, a2, t0
|
|
vldx vr3, a2, t1
|
|
vld vr4, a4, 0
|
|
vldx vr5, a4, a5
|
|
vldx vr6, a4, t3
|
|
vldx vr7, a4, t4
|
|
vavgr.bu vr0, vr0, vr4
|
|
vavgr.bu vr1, vr1, vr5
|
|
vavgr.bu vr2, vr2, vr6
|
|
vavgr.bu vr3, vr3, vr7
|
|
vst vr0, a0, 0
|
|
vstx vr1, a0, a1
|
|
vstx vr2, a0, t6
|
|
vstx vr3, a0, t7
|
|
add.d a2, a2, t2
|
|
add.d a4, a4, t5
|
|
add.d a0, a0, t8
|
|
addi.d a7, a7, -8
|
|
bnez a7, .L_HEIGHT_LOOP
|
|
endfunc_x264
|
|
|
|
.macro FILT_PACK_LASX s1, s2, s3
|
|
xvmulwev.w.h xr16, \s1, \s3
|
|
xvmulwev.w.h xr17, \s2, \s3
|
|
xvsrarni.h.w xr17, xr16, 15
|
|
xvmaxi.h xr17, xr17, 0
|
|
xvsat.hu xr17, xr17, 7
|
|
xvmulwod.w.h xr18, \s1, \s3
|
|
xvmulwod.w.h xr19, \s2, \s3
|
|
xvsrarni.h.w xr19, xr18, 15
|
|
xvmaxi.h xr19, xr19, 0
|
|
xvsat.hu xr19, xr19, 7
|
|
xvpackev.b \s1, xr19, xr17
|
|
.endm
|
|
|
|
/* s3: temp, s4: UNUSED, s5: imm */
|
|
.macro DO_FILT_V_LASX s1, s2, s3, s4, s5
|
|
alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */
|
|
alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */
|
|
xvld xr1, a3, 0
|
|
xvldx xr2, a3, a2
|
|
xvld \s3, t2, 0
|
|
xvld xr3, a1, 0
|
|
xvldx \s1, a1, a2
|
|
xvld \s2, t1, 0
|
|
xvilvh.b xr16, xr2, xr1
|
|
xvilvl.b xr17, xr2, xr1
|
|
xvilvh.b xr18, \s2, \s1
|
|
xvilvl.b xr19, \s2, \s1
|
|
xvilvh.b xr20, \s3, xr3
|
|
xvilvl.b xr21, \s3, xr3
|
|
xvdp2.h.bu.b xr1, xr17, xr12
|
|
xvdp2.h.bu.b xr4, xr16, xr12
|
|
xvdp2.h.bu.b \s1, xr19, xr0
|
|
xvdp2.h.bu.b xr2, xr18, xr0
|
|
xvdp2.h.bu.b xr3, xr21, xr14
|
|
xvdp2.h.bu.b \s2, xr20, xr14
|
|
xvadd.h xr1, xr1, \s1
|
|
xvadd.h xr4, xr4, xr2
|
|
xvadd.h xr1, xr1, xr3
|
|
xvadd.h xr4, xr4, \s2
|
|
xmov \s1, xr1
|
|
xmov \s2, xr1
|
|
addi.d a3, a3, 32
|
|
addi.d a1, a1, 32
|
|
xvpermi.q \s1, xr4, 0x2
|
|
xvpermi.q \s2, xr4, 0x13
|
|
FILT_PACK_LASX xr1, xr4, xr15
|
|
addi.d t1, a4, \s5
|
|
xvstx xr1, t0, t1
|
|
.endm
|
|
|
|
.macro FILT_H s1, s2, s3
|
|
xvsub.h \s1, \s1, \s2
|
|
xvsrai.h \s1, \s1, 2
|
|
xvsub.h \s1, \s1, \s2
|
|
xvadd.h \s1, \s1, \s3
|
|
xvsrai.h \s1, \s1, 2
|
|
xvadd.h \s1, \s1, \s3
|
|
.endm
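
/*
 * Note on FILT_H (illustrative): with s1, s2 and s3 holding the three
 * symmetric tap-pair sums of the 6-tap filter (outer, middle and inner pair
 * respectively), the shift sequence above evaluates, up to the intermediate
 * arithmetic shifts,
 *
 *   ((((s1 - s2) >> 2) - s2 + s3) >> 2) + s3  ==  (s1 - 5*s2 + 20*s3) >> 4
 *
 * i.e. the (1, -5, 20) weighting without leaving the 16-bit lane range.
 */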
|
|
|
|
.macro FILT_C s1, s2, s3
|
|
xmov xr3, \s1
|
|
xvpermi.q xr3, \s2, 0x03
|
|
xvshuf.b xr1, \s2, xr3, xr23
|
|
xvshuf.b xr2, \s2, xr3, xr24
|
|
xmov \s1, \s2
|
|
xvpermi.q \s1, \s3, 0x03
|
|
xvshuf.b xr3, \s1, \s2, xr29
|
|
xvshuf.b xr4, \s1, \s2, xr27
|
|
xvadd.h xr3, xr2, xr3
|
|
xmov xr2, \s1
|
|
xmov \s1, \s3
|
|
xvshuf.b \s3, xr2, \s2, xr30
|
|
xvadd.h xr4, xr4, \s2
|
|
xvadd.h \s3, \s3, xr1
|
|
FILT_H \s3, xr3, xr4
|
|
.endm
|
|
|
|
.macro DO_FILT_C_LASX s1, s2, s3, s4
|
|
FILT_C \s1, \s2, \s3
|
|
FILT_C \s2, \s1, \s4
|
|
FILT_PACK_LASX \s3, \s4, xr15
|
|
xvpermi.d \s3, \s3, 0xd8
|
|
xvstx \s3, a5, a4
|
|
.endm
|
|
|
|
.macro DO_FILT_H_LASX s1, s2, s3
|
|
xmov xr3, \s1
|
|
xvpermi.q xr3, \s2, 0x03
|
|
xvshuf.b xr1, \s2, xr3, xr24
|
|
xvshuf.b xr2, \s2, xr3, xr25
|
|
xmov xr3, \s2
|
|
xvpermi.q xr3, \s3, 0x03
|
|
xvshuf.b xr4, xr3, \s2, xr26
|
|
xvshuf.b xr5, xr3, \s2, xr27
|
|
xvshuf.b xr6, xr3, \s2, xr28
|
|
xmov \s1, \s2
|
|
xvdp2.h.bu.b xr16, xr1, xr12
|
|
xvdp2.h.bu.b xr17, xr2, xr12
|
|
xvdp2.h.bu.b xr18, \s2, xr14
|
|
xvdp2.h.bu.b xr19, xr4, xr14
|
|
xvdp2.h.bu.b xr20, xr5, xr0
|
|
xvdp2.h.bu.b xr21, xr6, xr0
|
|
xvadd.h xr1, xr16, xr18
|
|
xvadd.h xr2, xr17, xr19
|
|
xvadd.h xr1, xr1, xr20
|
|
xvadd.h xr2, xr2, xr21
|
|
FILT_PACK_LASX xr1, xr2, xr15
|
|
xvshuf.b xr1, xr1, xr1, xr22
|
|
xvstx xr1, a0, a4
|
|
xmov \s2, \s3
|
|
.endm
|
|
|
|
/*
|
|
* void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
|
|
* uint8_t *src, intptr_t stride, int width, int height )
|
|
*/
|
|
function_x264 hpel_filter_lasx
|
|
addi.d sp, sp, -56
|
|
fst.d f24, sp, 0
|
|
fst.d f25, sp, 8
|
|
fst.d f26, sp, 16
|
|
fst.d f27, sp, 24
|
|
fst.d f28, sp, 32
|
|
fst.d f29, sp, 40
|
|
fst.d f30, sp, 48
|
|
|
|
move a7, a3
|
|
addi.d a5, a5, -32
|
|
move t0, a1
|
|
andi a7, a7, 31
|
|
sub.d a3, a3, a7
|
|
add.d a0, a0, a5
|
|
add.d t0, t0, a5
|
|
add.d a7, a7, a5
|
|
add.d a5, a5, a2
|
|
move a2, a4
|
|
sub.d a7, zero, a7
|
|
add.d a1, a3, a2
|
|
sub.d a3, a3, a2
|
|
sub.d a3, a3, a2
|
|
move a4, a7
|
|
la.local t1, filt_mul51
|
|
xvld xr0, t1, 0
|
|
la.local t2, filt_mul15
|
|
xvld xr12, t2, 0
|
|
la.local t3, filt_mul20
|
|
xvld xr14, t3, 0
|
|
la.local t4, pw_1024
|
|
xvld xr15, t4, 0
|
|
la.local t1, hpel_shuf
|
|
xvld xr22, t1, 0
|
|
la.local t2, shuf_12
|
|
xvld xr23, t2, 0
|
|
la.local t3, shuf_1
|
|
xvld xr26, t3, 0
|
|
xvaddi.bu xr24, xr23, 2 /* shuf_14 */
|
|
xvaddi.bu xr25, xr23, 3 /* shuf_15 */
|
|
xvaddi.bu xr27, xr26, 1 /* shuf_2 */
|
|
xvaddi.bu xr28, xr26, 2 /* shuf_3 */
|
|
xvaddi.bu xr29, xr26, 3 /* shuf_4 */
|
|
xvaddi.bu xr30, xr26, 5 /* shuf_6 */
|
|
xvxor.v xr9, xr9, xr9
|
|
xvxor.v xr10, xr10, xr10
|
|
.LOOPY:
|
|
DO_FILT_V_LASX xr8, xr7, xr13, xr12, 0
|
|
.LOOPX:
|
|
DO_FILT_V_LASX xr6, xr5, xr11, xr12, 32
|
|
.LASTX:
|
|
xvsrli.h xr15, xr15, 1
|
|
DO_FILT_C_LASX xr9, xr8, xr7, xr6
|
|
xvadd.h xr15, xr15, xr15
|
|
xmov xr7, xr5
|
|
DO_FILT_H_LASX xr10, xr13, xr11
|
|
addi.d a4, a4, 32
|
|
blt a4, zero, .LOOPX
|
|
addi.d t1, a4, -32
|
|
blt t1, zero, .LASTX
|
|
//setup regs for next y
|
|
sub.d a4, a4, a7
|
|
sub.d a4, a4, a2
|
|
sub.d a1, a1, a4
|
|
sub.d a3, a3, a4
|
|
add.d a0, a0, a2
|
|
add.d t0, t0, a2
|
|
add.d a5, a5, a2
|
|
move a4, a7
|
|
addi.d a6, a6, -1
|
|
blt zero, a6, .LOOPY
|
|
fld.d f24, sp, 0
|
|
fld.d f25, sp, 8
|
|
fld.d f26, sp, 16
|
|
fld.d f27, sp, 24
|
|
fld.d f28, sp, 32
|
|
fld.d f29, sp, 40
|
|
fld.d f30, sp, 48
|
|
addi.d sp, sp, 56
|
|
endfunc_x264
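
/*
 * Reference sketch (illustrative) of what hpel_filter produces.  All three
 * half-pel planes use the 6-tap filter ( 1, -5, 20, 20, -5, 1 ): dstv filters
 * vertically, dsth horizontally, and dstc applies the horizontal filter to
 * the unrounded 16-bit vertical sums.  clip_u8() clamps to 0..255; names and
 * the buf[] intermediate are illustrative.
 *
 *   #define TAP( p, s ) ( (p)[-2*(s)] - 5*(p)[-(s)] + 20*(p)[0] \
 *                       + 20*(p)[(s)] - 5*(p)[2*(s)] + (p)[3*(s)] )
 *
 *   // one row; buf[] holds width+5 intermediate vertical sums
 *   for( int x = -2; x < width + 3; x++ )
 *   {
 *       int v = TAP( &src[x], stride );
 *       dstv[x]  = clip_u8( (v + 16) >> 5 );
 *       buf[x+2] = v;
 *   }
 *   for( int x = 0; x < width; x++ )
 *       dstc[x] = clip_u8( ( TAP( &buf[x+2], 1 ) + 512 ) >> 10 );
 *   for( int x = 0; x < width; x++ )
 *       dsth[x] = clip_u8( ( TAP( &src[x], 1 ) + 16 ) >> 5 );
 */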
|
|
|
|
/*
|
|
* void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
|
|
* pixel *src2, intptr_t src2_stride, int weight);
|
|
*/
|
|
.macro PIXEL_AVG w, h
|
|
function_x264 pixel_avg_\w\()x\h\()_lasx
|
|
addi.d t0, a6, -32
|
|
addi.d a7, zero, \h
|
|
bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lasx
|
|
b x264_8_pixel_avg_w\w\()_lasx
|
|
endfunc_x264
|
|
.endm
|
|
|
|
PIXEL_AVG 16, 8
|
|
PIXEL_AVG 8, 16
|
|
PIXEL_AVG 8, 8
|
|
PIXEL_AVG 8, 4
|
|
PIXEL_AVG 4, 16
|
|
PIXEL_AVG 4, 8
|
|
PIXEL_AVG 4, 4
|
|
PIXEL_AVG 4, 2
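
/*
 * Dispatch note: pixel_avg_WxH receives the bi-prediction weight in a6.  A
 * weight of 32 is the even 50/50 case, so the macro above branches to the
 * plain rounding-average kernel and only uses the weighted kernel when
 * a6 != 32.  In C terms (illustrative, clip_u8() clamps to 0..255):
 *
 *   if( weight == 32 )
 *       dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
 *   else
 *       dst[x] = clip_u8( ( src1[x]*weight + src2[x]*(64 - weight) + 32 ) >> 6 );
 */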
|
|
|
|
function_x264 mc_weight_w20_noden_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.b xr0, a4, 36 // scale
|
|
.LOOP_WEIGHTW20_NODEN:
|
|
xvld xr3, a2, 0
|
|
xvldx xr4, a2, a3
|
|
xvmulwev.h.bu.b xr7, xr3, xr0
|
|
xvmulwev.h.bu.b xr8, xr4, xr0
|
|
xvmulwod.h.bu.b xr3, xr3, xr0
|
|
xvmulwod.h.bu.b xr4, xr4, xr0
|
|
xvadd.h xr7, xr7, xr1
|
|
xvadd.h xr8, xr8, xr1
|
|
xvadd.h xr3, xr3, xr1
|
|
xvadd.h xr4, xr4, xr1
|
|
xvssrarni.bu.h xr8, xr7, 0
|
|
xvssrarni.bu.h xr4, xr3, 0
|
|
xvilvl.b xr3, xr4, xr8
|
|
xvilvh.b xr4, xr4, xr8
|
|
vst vr3, a0, 0
|
|
xvstelm.w xr3, a0, 16, 4
|
|
add.d a0, a0, a1
|
|
vst vr4, a0, 0
|
|
xvstelm.w xr4, a0, 16, 4
|
|
alsl.d a2, a3, a2, 1
|
|
add.d a0, a0, a1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW20_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w16_noden_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.h xr0, a4, 36 // scale
|
|
.LOOP_WEIGHTW16_NODEN:
|
|
vld vr3, a2, 0
|
|
vldx vr4, a2, a3
|
|
vext2xv.hu.bu xr3, xr3
|
|
vext2xv.hu.bu xr4, xr4
|
|
xvmul.h xr3, xr3, xr0
|
|
xvmul.h xr4, xr4, xr0
|
|
xvadd.h xr3, xr3, xr1
|
|
xvadd.h xr4, xr4, xr1
|
|
xvssrarni.bu.h xr4, xr3, 0
|
|
xvpermi.d xr3, xr4, 8
|
|
xvpermi.d xr4, xr4, 13
|
|
vst vr3, a0, 0
|
|
vstx vr4, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a0, a1, a0, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW16_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w8_noden_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.h xr0, a4, 36 // scale
|
|
.LOOP_WEIGHTW8_NODEN:
|
|
fld.d f3, a2, 0
|
|
fldx.d f4, a2, a3
|
|
vilvl.d vr3, vr4, vr3
|
|
vext2xv.hu.bu xr3, xr3
|
|
xvmul.h xr3, xr3, xr0
|
|
xvadd.h xr3, xr3, xr1
|
|
xvssrarni.bu.h xr3, xr3, 0
|
|
xvstelm.d xr3, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
xvstelm.d xr3, a0, 0, 2
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW8_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w4_noden_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.h xr0, a4, 36 // scale
|
|
.LOOP_WEIGHTW4_NODEN:
|
|
fld.s f3, a2, 0
|
|
fldx.s f4, a2, a3
|
|
vilvl.w vr3, vr4, vr3
|
|
vext2xv.hu.bu xr3, xr3
|
|
xvmul.h xr3, xr3, xr0
|
|
xvadd.h xr3, xr3, xr1
|
|
xvssrarni.bu.h xr3, xr3, 0
|
|
xvstelm.w xr3, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
xvstelm.w xr3, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW4_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w20_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.b xr0, a4, 36 // scale
|
|
xvldrepl.h xr2, a4, 32 // denom
|
|
xvsll.h xr1, xr1, xr2
|
|
.LOOP_WEIGHTW20:
|
|
xvld xr3, a2, 0
|
|
xvldx xr4, a2, a3
|
|
xvmulwev.h.bu.b xr7, xr3, xr0
|
|
xvmulwev.h.bu.b xr8, xr4, xr0
|
|
xvmulwod.h.bu.b xr3, xr3, xr0
|
|
xvmulwod.h.bu.b xr4, xr4, xr0
|
|
xvsadd.h xr7, xr7, xr1
|
|
xvsadd.h xr8, xr8, xr1
|
|
xvsadd.h xr3, xr3, xr1
|
|
xvsadd.h xr4, xr4, xr1
|
|
xvssrarn.bu.h xr7, xr7, xr2
|
|
xvssrarn.bu.h xr8, xr8, xr2
|
|
xvssrarn.bu.h xr3, xr3, xr2
|
|
xvssrarn.bu.h xr4, xr4, xr2
|
|
xvilvl.b xr3, xr3, xr7
|
|
xvilvl.b xr4, xr4, xr8
|
|
vst vr3, a0, 0
|
|
xvstelm.w xr3, a0, 16, 4
|
|
add.d a0, a0, a1
|
|
vst vr4, a0, 0
|
|
xvstelm.w xr4, a0, 16, 4
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW20
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w16_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.h xr0, a4, 36 // scale
|
|
xvldrepl.h xr2, a4, 32 // denom
|
|
xvsll.h xr1, xr1, xr2
|
|
.LOOP_WEIGHTW16:
|
|
vld vr3, a2, 0
|
|
vldx vr4, a2, a3
|
|
vext2xv.hu.bu xr3, xr3
|
|
vext2xv.hu.bu xr4, xr4
|
|
xvmul.h xr3, xr3, xr0
|
|
xvmul.h xr4, xr4, xr0
|
|
xvsadd.h xr3, xr3, xr1
|
|
xvsadd.h xr4, xr4, xr1
|
|
xvssrarn.bu.h xr3, xr3, xr2
|
|
xvssrarn.bu.h xr4, xr4, xr2
|
|
xvpermi.d xr3, xr3, 8
|
|
xvpermi.d xr4, xr4, 8
|
|
vst vr3, a0, 0
|
|
vstx vr4, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW16
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w8_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.h xr0, a4, 36 // scale
|
|
xvldrepl.h xr2, a4, 32 // denom
|
|
xvsll.h xr1, xr1, xr2
|
|
.LOOP_WEIGHTW8:
|
|
fld.d f3, a2, 0
|
|
fldx.d f4, a2, a3
|
|
vilvl.d vr3, vr4, vr3
|
|
vext2xv.hu.bu xr3, xr3
|
|
xvmul.h xr3, xr3, xr0
|
|
xvsadd.h xr3, xr3, xr1
|
|
xvssrarn.bu.h xr3, xr3, xr2
|
|
xvstelm.d xr3, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
xvstelm.d xr3, a0, 0, 2
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW8
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w4_lasx
|
|
xvldrepl.h xr1, a4, 40 // offset
|
|
xvldrepl.h xr0, a4, 36 // scale
|
|
xvldrepl.h xr2, a4, 32 // denom
|
|
xvsll.h xr1, xr1, xr2
|
|
.LOOP_WEIGHTW4:
|
|
fld.s f3, a2, 0
|
|
fldx.s f4, a2, a3
|
|
vilvl.w vr3, vr4, vr3
|
|
vext2xv.hu.bu xr3, xr3
|
|
xvmul.h xr3, xr3, xr0
|
|
xvsadd.h xr3, xr3, xr1
|
|
xvssrarn.bu.h xr3, xr3, xr2
|
|
xvstelm.w xr3, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
xvstelm.w xr3, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHTW4
|
|
endfunc_x264
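
/*
 * Reference sketch (illustrative) of the explicit-weighting kernels above.
 * The *_noden variants cover the i_denom == 0 case, where the result is
 * simply clip_u8( src*scale + offset ).  The general variants pre-shift the
 * offset by the denominator (the xvsll.h above) so that a single rounding
 * shift finishes the job:
 *
 *   dst[x] = clip_u8( ( src[x]*scale + (offset << denom) + (1 << (denom - 1)) ) >> denom )
 *          = clip_u8( ( ( src[x]*scale + (1 << (denom - 1)) ) >> denom ) + offset )
 *
 * scale, offset and denom are read from the weight argument at the byte
 * offsets used above (36, 40 and 32 respectively).
 */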
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w4_lasx
|
|
.avg2w4_loop_2:
|
|
addi.d a5, a5, -2
|
|
fld.s f0, a2, 0
|
|
fld.s f1, a4, 0
|
|
fldx.s f2, a2, a3
|
|
fldx.s f3, a4, a3
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a3, a4, 1
|
|
vavgr.bu vr0, vr0, vr1
|
|
vavgr.bu vr1, vr2, vr3
|
|
fst.s f0, a0, 0
|
|
fstx.s f1, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
blt zero, a5, .avg2w4_loop_2
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w8_lasx
|
|
.avg2w8_loop_2:
|
|
addi.d a5, a5, -2
|
|
fld.d f0, a2, 0
|
|
fld.d f1, a4, 0
|
|
fldx.d f2, a2, a3
|
|
fldx.d f3, a4, a3
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a3, a4, 1
|
|
vavgr.bu vr0, vr0, vr1
|
|
vavgr.bu vr1, vr2, vr3
|
|
fst.d f0, a0, 0
|
|
fstx.d f1, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
blt zero, a5, .avg2w8_loop_2
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w16_lasx
|
|
.avg2w16_loop_2:
|
|
addi.d a5, a5, -2
|
|
vld vr0, a2, 0
|
|
vldx vr1, a2, a3
|
|
vld vr2, a4, 0
|
|
vldx vr3, a4, a3
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a3, a4, 1
|
|
vavgr.bu vr0, vr0, vr2
|
|
vavgr.bu vr1, vr1, vr3
|
|
vst vr0, a0, 0
|
|
vstx vr1, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
blt zero, a5, .avg2w16_loop_2
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w20_lasx
|
|
.avg2w20_loop_2:
|
|
addi.d a5, a5, -2
|
|
xvld xr0, a2, 0
|
|
xvldx xr1, a2, a3
|
|
xvld xr2, a4, 0
|
|
xvldx xr3, a4, a3
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a3, a4, 1
|
|
xvavgr.bu xr0, xr0, xr2
|
|
xvavgr.bu xr1, xr1, xr3
|
|
vst vr0, a0, 0
|
|
xvstelm.w xr0, a0, 16, 4
|
|
add.d a0, a0, a1
|
|
vst vr1, a0, 0
|
|
xvstelm.w xr1, a0, 16, 4
|
|
add.d a0, a0, a1
|
|
blt zero, a5, .avg2w20_loop_2
|
|
endfunc_x264
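
/*
 * Reference sketch (illustrative): the pixel_avg2_w* kernels average two
 * reference rows that share one source stride, which is how half-pel
 * positions are combined.  The width parameter below stands in for the fixed
 * 4/8/16/20 of each kernel.
 *
 *   void pixel_avg2_ref( uint8_t *dst, intptr_t i_dst_stride,
 *                        uint8_t *src1, intptr_t i_src_stride,
 *                        uint8_t *src2, int width, int i_height )
 *   {
 *       for( int y = 0; y < i_height; y++ )
 *       {
 *           for( int x = 0; x < width; x++ )
 *               dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
 *           dst  += i_dst_stride;
 *           src1 += i_src_stride;
 *           src2 += i_src_stride;
 *       }
 *   }
 */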
|
|
|
|
/*
|
|
* void mc_copy_width16( uint8_t *p_dst, int32_t i_dst_stride,
|
|
* uint8_t *p_src, int32_t i_src_stride,
|
|
* int32_t i_height )
|
|
*/
|
|
function_x264 mc_copy_w16_lasx
|
|
slli.d t0, a3, 1
|
|
add.d t1, t0, a3
|
|
slli.d t2, a1, 1
|
|
add.d t3, t2, a1
|
|
.LOOP_COPYW16:
|
|
vld vr1, a2, 0
|
|
vldx vr2, a2, a3
|
|
vldx vr3, a2, t0
|
|
vldx vr4, a2, t1
|
|
|
|
vst vr1, a0, 0
|
|
vstx vr2, a0, a1
|
|
vstx vr3, a0, t2
|
|
vstx vr4, a0, t3
|
|
alsl.d a0, a1, a0, 2
|
|
alsl.d a2, a3, a2, 2
|
|
addi.w a4, a4, -4
|
|
blt zero, a4, .LOOP_COPYW16
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void mc_copy_w8( uint8_t *p_dst, intptr_t i_dst_stride,
|
|
* uint8_t *p_src, intptr_t i_src_stride,
|
|
* int32_t i_height )
|
|
*/
|
|
function_x264 mc_copy_w8_lasx
|
|
slli.d t0, a3, 1
|
|
add.d t1, t0, a3
|
|
slli.d t2, a1, 1
|
|
add.d t3, t2, a1
|
|
.LOOP_COPYW8:
|
|
fld.d f0, a2, 0
|
|
fldx.d f1, a2, a3
|
|
fldx.d f2, a2, t0
|
|
fldx.d f3, a2, t1
|
|
|
|
fst.d f0, a0, 0
|
|
fstx.d f1, a0, a1
|
|
fstx.d f2, a0, t2
|
|
fstx.d f3, a0, t3
|
|
alsl.d a0, a1, a0, 2
|
|
alsl.d a2, a3, a2, 2
|
|
addi.w a4, a4, -4
|
|
blt zero, a4, .LOOP_COPYW8
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void mc_copy_w4( uint8_t *p_dst, intptr_t i_dst_stride,
|
|
* uint8_t *p_src, intptr_t i_src_stride,
|
|
* int32_t i_height )
|
|
*/
|
|
function_x264 mc_copy_w4_lasx
|
|
slli.d t0, a3, 1
|
|
add.d t1, t0, a3
|
|
slli.d t2, a1, 1
|
|
add.d t3, t2, a1
|
|
.LOOP_COPYW4:
|
|
fld.s f0, a2, 0
|
|
fldx.s f1, a2, a3
|
|
fldx.s f2, a2, t0
|
|
fldx.s f3, a2, t1
|
|
|
|
fst.s f0, a0, 0
|
|
fstx.s f1, a0, a1
|
|
fstx.s f2, a0, t2
|
|
fstx.s f3, a0, t3
|
|
alsl.d a0, a1, a0, 2
|
|
alsl.d a2, a3, a2, 2
|
|
addi.w a4, a4, -4
|
|
blt zero, a4, .LOOP_COPYW4
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void memzero_aligned( void *p_dst, size_t n )
|
|
*/
|
|
function_x264 memzero_aligned_lasx
|
|
xvxor.v xr1, xr1, xr1
|
|
.memzero_loop:
|
|
addi.d a1, a1, -128
|
|
.rept 4
|
|
xvst xr1, a0, 0
|
|
addi.d a0, a0, 32
|
|
.endr
|
|
blt zero, a1, .memzero_loop
|
|
endfunc_x264
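
/*
 * Reference behaviour (illustrative): the buffer is assumed 32-byte aligned
 * and n is processed in 128-byte blocks (rounded up), each cleared with four
 * 256-bit stores:
 *
 *   void memzero_aligned_ref( void *p_dst, size_t n )
 *   {
 *       uint8_t *p = p_dst;
 *       for( size_t i = 0; i < n; i += 128 )
 *           memset( p + i, 0, 128 );   // 4 x 32-byte xvst in the loop above
 *   }
 */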
|
|
|
|
/*
|
|
* void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth,
|
|
* pixel *dstv, pixel *dstc, intptr_t src_stride,
|
|
* intptr_t dst_stride, int width, int height )
|
|
*/
|
|
function_x264 frame_init_lowres_core_lasx
|
|
andi t1, a7, 15
|
|
sub.w t0, a7, t1
|
|
slli.d t2, a5, 1
|
|
ldptr.w a7, sp, 0 // use a7 as height variable
|
|
|
|
.height_loop:
|
|
add.d t4, zero, t0
|
|
addi.d t3, a0, 0
|
|
addi.d t5, a1, 0
|
|
addi.d t6, a2, 0
|
|
addi.d t7, a3, 0
|
|
addi.d t8, a4, 0
|
|
.width16_loop:
|
|
xvld xr0, t3, 0
|
|
xvldx xr1, t3, a5
|
|
xvldx xr2, t3, t2
|
|
xvavgr.bu xr3, xr0, xr1
|
|
xvavgr.bu xr4, xr1, xr2
|
|
xvhaddw.hu.bu xr5, xr3, xr3
|
|
xvhaddw.hu.bu xr6, xr4, xr4
|
|
xvssrarni.bu.h xr6, xr5, 1
|
|
xvpermi.d xr7, xr6, 0xd8
|
|
vst vr7, t5, 0
|
|
xvpermi.q xr7, xr7, 0x11
|
|
vst vr7, t7, 0
|
|
|
|
addi.d t3, t3, 1
|
|
xvld xr0, t3, 0
|
|
xvldx xr1, t3, a5
|
|
xvldx xr2, t3, t2
|
|
xvavgr.bu xr3, xr0, xr1
|
|
xvavgr.bu xr4, xr1, xr2
|
|
xvhaddw.hu.bu xr5, xr3, xr3
|
|
xvhaddw.hu.bu xr6, xr4, xr4
|
|
xvssrarni.bu.h xr6, xr5, 1
|
|
xvpermi.d xr7, xr6, 0xd8
|
|
vst vr7, t6, 0
|
|
xvpermi.q xr7, xr7, 0x11
|
|
vst vr7, t8, 0
|
|
addi.d t3, t3, 31
|
|
addi.d t5, t5, 16
|
|
addi.d t6, t6, 16
|
|
addi.d t7, t7, 16
|
|
addi.d t8, t8, 16
|
|
addi.w t4, t4, -16
|
|
blt zero, t4, .width16_loop
|
|
|
|
beqz t1, .width16_end
|
|
vld vr0, t3, 0
|
|
vldx vr1, t3, a5
|
|
vldx vr2, t3, t2
|
|
vavgr.bu vr3, vr0, vr1
|
|
vavgr.bu vr4, vr1, vr2
|
|
vhaddw.hu.bu vr5, vr3, vr3
|
|
vhaddw.hu.bu vr6, vr4, vr4
|
|
vssrarni.bu.h vr6, vr5, 1
|
|
fst.d f6, t5, 0
|
|
vstelm.d vr6, t7, 0, 1
|
|
|
|
addi.d t3, t3, 1
|
|
vld vr0, t3, 0
|
|
vldx vr1, t3, a5
|
|
vldx vr2, t3, t2
|
|
vavgr.bu vr3, vr0, vr1
|
|
vavgr.bu vr4, vr1, vr2
|
|
vhaddw.hu.bu vr5, vr3, vr3
|
|
vhaddw.hu.bu vr6, vr4, vr4
|
|
vssrarni.bu.h vr6, vr5, 1
|
|
fst.d f6, t6, 0
|
|
vstelm.d vr6, t8, 0, 1
|
|
|
|
.width16_end:
|
|
add.d a0, a0, t2
|
|
add.d a1, a1, a6
|
|
add.d a2, a2, a6
|
|
add.d a3, a3, a6
|
|
add.d a4, a4, a6
|
|
addi.w a7, a7, -1
|
|
blt zero, a7, .height_loop
|
|
endfunc_x264
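
/*
 * Reference sketch (illustrative): every lowres output pixel is a rounding
 * average of two rounding averages, matching the vavgr.bu + vhaddw.hu.bu +
 * vssrarni.bu.h sequence above.  The four planes differ only by a horizontal
 * and/or vertical one-pixel offset into the source:
 *
 *   #define RAVG( a, b )       ( ( (a) + (b) + 1 ) >> 1 )
 *   #define FILT( a, b, c, d ) ( ( RAVG( a, b ) + RAVG( c, d ) + 1 ) >> 1 )
 *
 *   // src1 = src0 + src_stride, src2 = src1 + src_stride
 *   dst0[x] = FILT( src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1] );
 *   dsth[x] = FILT( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
 *   dstv[x] = FILT( src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1] );
 *   dstc[x] = FILT( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
 */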
|
|
|
|
/*
|
|
* void mc_chroma(uint8_t *p_dst_u, uint8_t *p_dst_v,
|
|
* intptr_t i_dst_stride,
|
|
* uint8_t *p_src, intptr_t i_src_stride,
|
|
* int32_t m_vx, int32_t m_vy,
|
|
* int32_t i_width, int32_t i_height)
|
|
*/
|
|
|
|
function_x264 mc_chroma_lsx
|
|
MC_CHROMA_START
|
|
andi a5, a5, 0x07 /* m_vx & 0x07 */
|
|
andi a6, a6, 0x07 /* m_vy & 0x07 */
|
|
li.d t8, 8
|
|
sub.d t1, t8, a5 // 8-d8x
|
|
sub.d t2, t8, a6 // 8-d8y
|
|
mul.d t3, t1, t2 // CA
|
|
mul.d t4, a5, t2 // CB
|
|
mul.d t5, t1, a6 // CC
|
|
mul.d t6, a5, a6 // CD
|
|
vreplgr2vr.b vr0, t3
|
|
vreplgr2vr.b vr1, t4
|
|
vreplgr2vr.b vr2, t5
|
|
vreplgr2vr.b vr3, t6
|
|
|
|
add.d t0, a3, a4
|
|
ldptr.w t1, sp, 0 /* i_height */
|
|
move t3, t0
|
|
addi.d t4, zero, 1
|
|
addi.d t5, zero, 3
|
|
addi.d t6, zero, 7
|
|
bge t6, a7, .ENDLOOP_W8
|
|
.LOOP_W8:
|
|
vld vr4, a3, 0
|
|
vld vr5, t0, 0
|
|
vld vr6, a3, 2
|
|
vld vr7, t0, 2
|
|
vmulwev.h.bu vr8, vr4, vr0
|
|
vmulwod.h.bu vr9, vr4, vr0
|
|
vmulwev.h.bu vr10, vr5, vr2
|
|
vmulwod.h.bu vr11, vr5, vr2
|
|
vmaddwev.h.bu vr8, vr6, vr1
|
|
vmaddwod.h.bu vr9, vr6, vr1
|
|
vmaddwev.h.bu vr10, vr7, vr3
|
|
vmaddwod.h.bu vr11, vr7, vr3
|
|
vadd.h vr12, vr8, vr10
|
|
vadd.h vr13, vr9, vr11
|
|
vssrarni.bu.h vr13, vr12, 6
|
|
vstelm.d vr13, a0, 0, 0
|
|
vstelm.d vr13, a1, 0, 1
|
|
|
|
add.d a0, a0, a2
|
|
add.d a1, a1, a2
|
|
addi.d t1, t1, -1
|
|
move a3, t3
|
|
add.d t3, t3, a4
|
|
move t0, t3
|
|
blt zero, t1, .LOOP_W8
|
|
b .ENDLOOP_W2
|
|
.ENDLOOP_W8:
|
|
bge t5, a7, .ENDLOOP_W4
|
|
.LOOP_W4:
|
|
vld vr4, a3, 0
|
|
vld vr5, t0, 0
|
|
vld vr6, a3, 2
|
|
vld vr7, t0, 2
|
|
vmulwev.h.bu vr8, vr4, vr0
|
|
vmulwod.h.bu vr9, vr4, vr0
|
|
vmulwev.h.bu vr10, vr5, vr2
|
|
vmulwod.h.bu vr11, vr5, vr2
|
|
vmaddwev.h.bu vr8, vr6, vr1
|
|
vmaddwod.h.bu vr9, vr6, vr1
|
|
vmaddwev.h.bu vr10, vr7, vr3
|
|
vmaddwod.h.bu vr11, vr7, vr3
|
|
vadd.h vr12, vr8, vr10
|
|
vadd.h vr13, vr9, vr11
|
|
vssrarni.bu.h vr13, vr12, 6
|
|
vstelm.w vr13, a0, 0, 0
|
|
vstelm.w vr13, a1, 0, 2
|
|
|
|
add.d a0, a0, a2
|
|
add.d a1, a1, a2
|
|
move a3, t3
|
|
add.d t3, t3, a4
|
|
move t0, t3
|
|
addi.d t1, t1, -1
|
|
blt zero, t1, .LOOP_W4
|
|
b .ENDLOOP_W2
|
|
.ENDLOOP_W4:
|
|
bge t4, a7, .ENDLOOP_W2
|
|
.LOOP_W2:
|
|
vld vr4, a3, 0
|
|
vld vr5, t0, 0
|
|
vld vr6, a3, 2
|
|
vld vr7, t0, 2
|
|
vmulwev.h.bu vr8, vr4, vr0
|
|
vmulwod.h.bu vr9, vr4, vr0
|
|
vmulwev.h.bu vr10, vr5, vr2
|
|
vmulwod.h.bu vr11, vr5, vr2
|
|
vmaddwev.h.bu vr8, vr6, vr1
|
|
vmaddwod.h.bu vr9, vr6, vr1
|
|
vmaddwev.h.bu vr10, vr7, vr3
|
|
vmaddwod.h.bu vr11, vr7, vr3
|
|
vadd.h vr12, vr8, vr10
|
|
vadd.h vr13, vr9, vr11
|
|
vssrarni.bu.h vr13, vr12, 6
|
|
vstelm.h vr13, a0, 0, 0
|
|
vstelm.h vr13, a1, 0, 4
|
|
|
|
add.d a0, a0, a2
|
|
add.d a1, a1, a2
|
|
move a3, t3
|
|
add.d t3, t3, a4
|
|
move t0, t3
|
|
addi.d t1, t1, -1
|
|
blt zero, t1, .LOOP_W2
|
|
.ENDLOOP_W2:
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_weight_w4_lsx
|
|
addi.d t0, zero, 64
|
|
sub.d t0, t0, a6
|
|
vreplgr2vr.b vr0, a6
|
|
vreplgr2vr.b vr1, t0
|
|
vpackev.b vr8, vr1, vr0
|
|
.LOOP_AVG_WEIGHT_W4:
|
|
fld.s f0, a2, 0
|
|
fldx.s f1, a2, a3
|
|
fld.s f2, a4, 0
|
|
fldx.s f3, a4, a5
|
|
vilvl.w vr0, vr1, vr0
|
|
vilvl.w vr2, vr3, vr2
|
|
vilvl.b vr0, vr2, vr0
|
|
vmulwev.h.bu.b vr1, vr0, vr8
|
|
vmaddwod.h.bu.b vr1, vr0, vr8
|
|
vssrarni.bu.h vr1, vr1, 6
|
|
fst.s f1, a0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.w vr1, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a5, a4, 1
|
|
addi.w a7, a7, -2
|
|
bnez a7, .LOOP_AVG_WEIGHT_W4
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_w4_lsx
|
|
.LOOP_AVG_W4:
|
|
fld.s f0, a2, 0
|
|
fldx.s f1, a2, a3
|
|
fld.s f4, a4, 0
|
|
fldx.s f5, a4, a5
|
|
vilvl.w vr0, vr1, vr0
|
|
vilvl.w vr4, vr5, vr4
|
|
vavgr.bu vr0, vr0, vr4
|
|
fst.s f0, a0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.w vr0, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a5, a4, 1
|
|
addi.w a7, a7, -2
|
|
bnez a7, .LOOP_AVG_W4
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_weight_w8_lsx
|
|
addi.d t0, zero, 64
|
|
sub.d t0, t0, a6
|
|
slli.d t5, a1, 1
|
|
add.d t6, a1, t5
|
|
add.d t7, a1, t6
|
|
vreplgr2vr.b vr0, a6
|
|
vreplgr2vr.b vr1, t0
|
|
vpackev.b vr8, vr1, vr0
|
|
PIXEL_AVG_START_W8
|
|
.LOOP_AVG_HEIGHT_W8:
|
|
fld.d f0, a2, 0
|
|
fldx.d f1, a2, a3
|
|
fldx.d f2, a2, t0
|
|
fldx.d f3, a2, t1
|
|
fld.d f4, a4, 0
|
|
fldx.d f5, a4, a5
|
|
fldx.d f6, a4, t3
|
|
fldx.d f7, a4, t4
|
|
vilvl.b vr0, vr4, vr0
|
|
vilvl.b vr1, vr5, vr1
|
|
vilvl.b vr2, vr6, vr2
|
|
vilvl.b vr3, vr7, vr3
|
|
vmulwev.h.bu.b vr4, vr0, vr8
|
|
vmulwev.h.bu.b vr5, vr1, vr8
|
|
vmulwev.h.bu.b vr6, vr2, vr8
|
|
vmulwev.h.bu.b vr7, vr3, vr8
|
|
vmaddwod.h.bu.b vr4, vr0, vr8
|
|
vmaddwod.h.bu.b vr5, vr1, vr8
|
|
vmaddwod.h.bu.b vr6, vr2, vr8
|
|
vmaddwod.h.bu.b vr7, vr3, vr8
|
|
vssrarni.bu.h vr4, vr4, 6
|
|
vssrarni.bu.h vr5, vr5, 6
|
|
vssrarni.bu.h vr6, vr6, 6
|
|
vssrarni.bu.h vr7, vr7, 6
|
|
fst.d f4, a0, 0
|
|
fstx.d f5, a0, a1
|
|
fstx.d f6, a0, t5
|
|
fstx.d f7, a0, t6
|
|
add.d a0, a0, t7
|
|
alsl.d a2, a3, a2, 2
|
|
alsl.d a4, a5, a4, 2
|
|
addi.w a7, a7, -4
|
|
bnez a7, .LOOP_AVG_HEIGHT_W8
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_w8_lsx
|
|
PIXEL_AVG_START_W8
|
|
.LOOP_AVG_W8:
|
|
fld.d f0, a2, 0
|
|
fldx.d f1, a2, a3
|
|
fldx.d f2, a2, t0
|
|
fldx.d f3, a2, t1
|
|
fld.d f4, a4, 0
|
|
fldx.d f5, a4, a5
|
|
fldx.d f6, a4, t3
|
|
fldx.d f7, a4, t4
|
|
vilvl.d vr0, vr1, vr0
|
|
vilvl.d vr2, vr3, vr2
|
|
vilvl.d vr4, vr5, vr4
|
|
vilvl.d vr6, vr7, vr6
|
|
vavgr.bu vr0, vr0, vr4
|
|
vavgr.bu vr2, vr2, vr6
|
|
fst.d f0, a0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.d vr0, a0, 0, 1
|
|
fstx.d f2, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
vstelm.d vr2, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 2
|
|
alsl.d a4, a5, a4, 2
|
|
addi.w a7, a7, -4
|
|
bnez a7, .LOOP_AVG_W8
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_weight_w16_lsx
|
|
addi.d t0, zero, 64
|
|
sub.d t0, t0, a6
|
|
vreplgr2vr.b vr8, a6
|
|
vreplgr2vr.b vr9, t0
|
|
PIXEL_AVG_START
|
|
.LOOP_AVG_HEIGHT_W16:
|
|
LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3
|
|
LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7
|
|
|
|
vmulwev.h.bu.b vr10, vr0, vr8
|
|
vmulwev.h.bu.b vr11, vr1, vr8
|
|
vmulwev.h.bu.b vr12, vr2, vr8
|
|
vmulwev.h.bu.b vr13, vr3, vr8
|
|
vmulwod.h.bu.b vr14, vr0, vr8
|
|
vmulwod.h.bu.b vr15, vr1, vr8
|
|
vmulwod.h.bu.b vr16, vr2, vr8
|
|
vmulwod.h.bu.b vr17, vr3, vr8
|
|
vmaddwev.h.bu.b vr10, vr4, vr9
|
|
vmaddwev.h.bu.b vr11, vr5, vr9
|
|
vmaddwev.h.bu.b vr12, vr6, vr9
|
|
vmaddwev.h.bu.b vr13, vr7, vr9
|
|
vmaddwod.h.bu.b vr14, vr4, vr9
|
|
vmaddwod.h.bu.b vr15, vr5, vr9
|
|
vmaddwod.h.bu.b vr16, vr6, vr9
|
|
vmaddwod.h.bu.b vr17, vr7, vr9
|
|
vssrarni.bu.h vr11, vr10, 6
|
|
vssrarni.bu.h vr13, vr12, 6
|
|
vssrarni.bu.h vr15, vr14, 6
|
|
vssrarni.bu.h vr17, vr16, 6
|
|
vilvl.b vr10, vr15, vr11
|
|
vilvh.b vr11, vr15, vr11
|
|
vilvl.b vr12, vr17, vr13
|
|
vilvh.b vr13, vr17, vr13
|
|
|
|
vst vr10, a0, 0
|
|
vstx vr11, a0, a1
|
|
vstx vr12, a0, t6
|
|
vstx vr13, a0, t7
|
|
add.d a2, a2, t2
|
|
add.d a4, a4, t5
|
|
add.d a0, a0, t8
|
|
addi.d a7, a7, -4
|
|
bnez a7, .LOOP_AVG_HEIGHT_W16
|
|
endfunc_x264
|
|
|
|
function_x264 pixel_avg_w16_lsx
|
|
PIXEL_AVG_START
|
|
.LOOP_AVG_W16:
|
|
vld vr0, a2, 0
|
|
vldx vr1, a2, a3
|
|
vldx vr2, a2, t0
|
|
vldx vr3, a2, t1
|
|
vld vr4, a4, 0
|
|
vldx vr5, a4, a5
|
|
vldx vr6, a4, t3
|
|
vldx vr7, a4, t4
|
|
vavgr.bu vr0, vr0, vr4
|
|
vavgr.bu vr1, vr1, vr5
|
|
vavgr.bu vr2, vr2, vr6
|
|
vavgr.bu vr3, vr3, vr7
|
|
vst vr0, a0, 0
|
|
vstx vr1, a0, a1
|
|
vstx vr2, a0, t6
|
|
vstx vr3, a0, t7
|
|
add.d a0, a0, t8
|
|
add.d a2, a2, t2
|
|
add.d a4, a4, t5
|
|
|
|
vld vr0, a2, 0
|
|
vldx vr1, a2, a3
|
|
vldx vr2, a2, t0
|
|
vldx vr3, a2, t1
|
|
vld vr4, a4, 0
|
|
vldx vr5, a4, a5
|
|
vldx vr6, a4, t3
|
|
vldx vr7, a4, t4
|
|
vavgr.bu vr0, vr0, vr4
|
|
vavgr.bu vr1, vr1, vr5
|
|
vavgr.bu vr2, vr2, vr6
|
|
vavgr.bu vr3, vr3, vr7
|
|
vst vr0, a0, 0
|
|
vstx vr1, a0, a1
|
|
vstx vr2, a0, t6
|
|
vstx vr3, a0, t7
|
|
add.d a2, a2, t2
|
|
add.d a4, a4, t5
|
|
add.d a0, a0, t8
|
|
addi.d a7, a7, -8
|
|
bnez a7, .LOOP_AVG_W16
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
|
|
* pixel *src2, intptr_t src2_stride, int weight);
|
|
*/
|
|
.macro PIXEL_AVG_LSX w, h
|
|
function_x264 pixel_avg_\w\()x\h\()_lsx
|
|
addi.d t0, a6, -32
|
|
addi.d a7, zero, \h
|
|
bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lsx
|
|
b x264_8_pixel_avg_w\w\()_lsx
|
|
endfunc_x264
|
|
.endm
|
|
|
|
PIXEL_AVG_LSX 16, 16
|
|
PIXEL_AVG_LSX 16, 8
|
|
PIXEL_AVG_LSX 8, 16
|
|
PIXEL_AVG_LSX 8, 8
|
|
PIXEL_AVG_LSX 8, 4
|
|
PIXEL_AVG_LSX 4, 16
|
|
PIXEL_AVG_LSX 4, 8
|
|
PIXEL_AVG_LSX 4, 4
|
|
PIXEL_AVG_LSX 4, 2
|
|
|
|
function_x264 mc_weight_w20_noden_lsx
|
|
vldrepl.b vr0, a4, 36 // scale
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
.LOOP_WEIGHT_W20_NODEN:
|
|
vld vr3, a2, 0
|
|
vld vr4, a2, 16
|
|
add.d a2, a2, a3
|
|
vld vr5, a2, 0
|
|
vld vr6, a2, 16
|
|
vilvl.w vr4, vr6, vr4
|
|
vmulwev.h.bu.b vr7, vr3, vr0
|
|
vmulwod.h.bu.b vr8, vr3, vr0
|
|
vmulwev.h.bu.b vr9, vr4, vr0
|
|
vmulwod.h.bu.b vr10, vr4, vr0
|
|
vmulwev.h.bu.b vr11, vr5, vr0
|
|
vmulwod.h.bu.b vr12, vr5, vr0
|
|
vadd.h vr7, vr7, vr1
|
|
vadd.h vr8, vr8, vr1
|
|
vadd.h vr9, vr9, vr1
|
|
vadd.h vr10, vr10, vr1
|
|
vadd.h vr11, vr11, vr1
|
|
vadd.h vr12, vr12, vr1
|
|
vssrani.bu.h vr11, vr7, 0
|
|
vssrani.bu.h vr12, vr8, 0
|
|
vssrani.bu.h vr9, vr9, 0
|
|
vssrani.bu.h vr10, vr10, 0
|
|
vilvl.b vr7, vr12, vr11
|
|
vilvl.b vr9, vr10, vr9
|
|
vilvh.b vr11, vr12, vr11
|
|
|
|
vst vr7, a0, 0
|
|
vstelm.w vr9, a0, 16, 0
|
|
add.d a0, a0, a1
|
|
vst vr11, a0, 0
|
|
vstelm.w vr9, a0, 16, 1
|
|
add.d a0, a0, a1
|
|
add.d a2, a2, a3
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W20_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w16_noden_lsx
|
|
vldrepl.b vr0, a4, 36 // scale
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
.LOOP_WEIGHT_W16_NODEN:
|
|
vld vr3, a2, 0
|
|
vldx vr4, a2, a3
|
|
vmulwev.h.bu.b vr5, vr3, vr0
|
|
vmulwod.h.bu.b vr6, vr3, vr0
|
|
vmulwev.h.bu.b vr7, vr4, vr0
|
|
vmulwod.h.bu.b vr8, vr4, vr0
|
|
vadd.h vr5, vr5, vr1
|
|
vadd.h vr6, vr6, vr1
|
|
vadd.h vr7, vr7, vr1
|
|
vadd.h vr8, vr8, vr1
|
|
vssrani.bu.h vr7, vr5, 0
|
|
vssrani.bu.h vr8, vr6, 0
|
|
vilvl.b vr5, vr8, vr7
|
|
vilvh.b vr7, vr8, vr7
|
|
vst vr5, a0, 0
|
|
vstx vr7, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a0, a1, a0, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W16_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w8_noden_lsx
|
|
vldrepl.b vr0, a4, 36 // scale
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
.LOOP_WEIGHT_W8_NODEN:
|
|
fld.d f3, a2, 0
|
|
fldx.d f4, a2, a3
|
|
vilvl.d vr3, vr4, vr3
|
|
vmulwev.h.bu.b vr5, vr3, vr0
|
|
vmulwod.h.bu.b vr6, vr3, vr0
|
|
vadd.h vr5, vr5, vr1
|
|
vadd.h vr6, vr6, vr1
|
|
vssrani.bu.h vr5, vr5, 0
|
|
vssrani.bu.h vr6, vr6, 0
|
|
vilvl.b vr7, vr6, vr5
|
|
vstelm.d vr7, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.d vr7, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W8_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w4_noden_lsx
|
|
vldrepl.h vr0, a4, 36 // scale
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
.LOOP_WEIGHT_W4_NODEN:
|
|
fld.s f3, a2, 0
|
|
fldx.s f4, a2, a3
|
|
vilvl.w vr3, vr4, vr3
|
|
vsllwil.hu.bu vr3, vr3, 0
|
|
vmul.h vr3, vr3, vr0
|
|
vadd.h vr3, vr3, vr1
|
|
vssrani.bu.h vr3, vr3, 0
|
|
vstelm.w vr3, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.w vr3, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W4_NODEN
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w20_lsx
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
vldrepl.b vr0, a4, 36 // scale
|
|
vldrepl.h vr2, a4, 32 // denom
|
|
vsll.h vr1, vr1, vr2
|
|
.LOOP_WEIGHT_W20:
|
|
vld vr3, a2, 0
|
|
vld vr4, a2, 16
|
|
add.d a2, a2, a3
|
|
vld vr5, a2, 0
|
|
vld vr6, a2, 16
|
|
vilvl.w vr4, vr6, vr4
|
|
|
|
vmulwev.h.bu.b vr7, vr3, vr0
|
|
vmulwod.h.bu.b vr8, vr3, vr0
|
|
vmulwev.h.bu.b vr9, vr4, vr0
|
|
vmulwod.h.bu.b vr10, vr4, vr0
|
|
vmulwev.h.bu.b vr11, vr5, vr0
|
|
vmulwod.h.bu.b vr12, vr5, vr0
|
|
vsadd.h vr7, vr7, vr1
|
|
vsadd.h vr8, vr8, vr1
|
|
vsadd.h vr9, vr9, vr1
|
|
vsadd.h vr10, vr10, vr1
|
|
vsadd.h vr11, vr11, vr1
|
|
vsadd.h vr12, vr12, vr1
|
|
vssrarn.bu.h vr7, vr7, vr2
|
|
vssrarn.bu.h vr8, vr8, vr2
|
|
vssrarn.bu.h vr9, vr9, vr2
|
|
vssrarn.bu.h vr10, vr10, vr2
|
|
vssrarn.bu.h vr11, vr11, vr2
|
|
vssrarn.bu.h vr12, vr12, vr2
|
|
vilvl.b vr7, vr8, vr7
|
|
vilvl.b vr9, vr10, vr9
|
|
vilvl.b vr11, vr12, vr11
|
|
|
|
vst vr7, a0, 0
|
|
vstelm.w vr9, a0, 16, 0
|
|
add.d a0, a0, a1
|
|
vst vr11, a0, 0
|
|
vstelm.w vr9, a0, 16, 1
|
|
add.d a0, a0, a1
|
|
add.d a2, a2, a3
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W20
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w16_lsx
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
vldrepl.b vr0, a4, 36 // scale
|
|
vldrepl.h vr2, a4, 32 // denom
|
|
vsll.h vr1, vr1, vr2
|
|
.LOOP_WEIGHT_W16:
|
|
vld vr3, a2, 0
|
|
vldx vr4, a2, a3
|
|
vmulwev.h.bu.b vr5, vr3, vr0
|
|
vmulwod.h.bu.b vr6, vr3, vr0
|
|
vmulwev.h.bu.b vr7, vr4, vr0
|
|
vmulwod.h.bu.b vr8, vr4, vr0
|
|
vsadd.h vr5, vr5, vr1
|
|
vsadd.h vr6, vr6, vr1
|
|
vsadd.h vr7, vr7, vr1
|
|
vsadd.h vr8, vr8, vr1
|
|
vssrarn.bu.h vr5, vr5, vr2
|
|
vssrarn.bu.h vr6, vr6, vr2
|
|
vssrarn.bu.h vr7, vr7, vr2
|
|
vssrarn.bu.h vr8, vr8, vr2
|
|
vilvl.b vr5, vr6, vr5
|
|
vilvl.b vr7, vr8, vr7
|
|
vst vr5, a0, 0
|
|
vstx vr7, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a0, a1, a0, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W16
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w8_lsx
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
vldrepl.b vr0, a4, 36 // scale
|
|
vldrepl.h vr2, a4, 32 // denom
|
|
vsll.h vr1, vr1, vr2
|
|
.LOOP_WEIGHT_W8:
|
|
fld.d f3, a2, 0
|
|
fldx.d f4, a2, a3
|
|
vilvl.d vr3, vr4, vr3
|
|
vmulwev.h.bu.b vr5, vr3, vr0
|
|
vmulwod.h.bu.b vr6, vr3, vr0
|
|
vsadd.h vr5, vr5, vr1
|
|
vsadd.h vr6, vr6, vr1
|
|
vssrarn.bu.h vr5, vr5, vr2
|
|
vssrarn.bu.h vr6, vr6, vr2
|
|
vilvl.b vr7, vr6, vr5
|
|
vstelm.d vr7, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.d vr7, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W8
|
|
endfunc_x264
|
|
|
|
function_x264 mc_weight_w4_lsx
|
|
vldrepl.h vr1, a4, 40 // offset
|
|
vldrepl.h vr0, a4, 36 // scale
|
|
vldrepl.h vr2, a4, 32 // denom
|
|
vsll.h vr1, vr1, vr2
|
|
.LOOP_WEIGHT_W4:
|
|
fld.s f3, a2, 0
|
|
fldx.s f4, a2, a3
|
|
vilvl.w vr3, vr4, vr3
|
|
vsllwil.hu.bu vr3, vr3, 0
|
|
vmul.h vr3, vr3, vr0
|
|
vsadd.h vr3, vr3, vr1
|
|
vssrarn.bu.h vr3, vr3, vr2
|
|
vstelm.w vr3, a0, 0, 0
|
|
add.d a0, a0, a1
|
|
vstelm.w vr3, a0, 0, 1
|
|
add.d a0, a0, a1
|
|
alsl.d a2, a3, a2, 1
|
|
addi.w a5, a5, -2
|
|
blt zero, a5, .LOOP_WEIGHT_W4
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w4_lsx
|
|
.LOOP_AVG2_W4:
|
|
addi.d a5, a5, -2
|
|
fld.s f0, a2, 0
|
|
fld.s f1, a4, 0
|
|
fldx.s f2, a2, a3
|
|
fldx.s f3, a4, a3
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a3, a4, 1
|
|
vavgr.bu vr0, vr0, vr1
|
|
vavgr.bu vr1, vr2, vr3
|
|
fst.s f0, a0, 0
|
|
fstx.s f1, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
blt zero, a5, .LOOP_AVG2_W4
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w8_lsx
|
|
.LOOP_AVG2_W8:
|
|
addi.d a5, a5, -2
|
|
fld.d f0, a2, 0
|
|
fld.d f1, a4, 0
|
|
fldx.d f2, a2, a3
|
|
fldx.d f3, a4, a3
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a3, a4, 1
|
|
vavgr.bu vr0, vr0, vr1
|
|
vavgr.bu vr1, vr2, vr3
|
|
fst.d f0, a0, 0
|
|
fstx.d f1, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
blt zero, a5, .LOOP_AVG2_W8
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w16_lsx
|
|
.LOOP_AVG2_W16:
|
|
addi.d a5, a5, -2
|
|
vld vr0, a2, 0
|
|
vldx vr1, a2, a3
|
|
vld vr2, a4, 0
|
|
vldx vr3, a4, a3
|
|
alsl.d a2, a3, a2, 1
|
|
alsl.d a4, a3, a4, 1
|
|
vavgr.bu vr0, vr0, vr2
|
|
vavgr.bu vr1, vr1, vr3
|
|
vst vr0, a0, 0
|
|
vstx vr1, a0, a1
|
|
alsl.d a0, a1, a0, 1
|
|
blt zero, a5, .LOOP_AVG2_W16
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
|
|
* intptr_t i_src_stride, uint8_t *src2, int i_height)
|
|
*/
|
|
function_x264 pixel_avg2_w20_lsx
|
|
.LOOP_AVG2_W20:
|
|
addi.d a5, a5, -2
|
|
vld vr0, a2, 0
|
|
vld vr1, a2, 16
|
|
vld vr2, a4, 0
|
|
vld vr3, a4, 16
|
|
add.d a2, a2, a3
|
|
add.d a4, a4, a3
|
|
vld vr4, a2, 0
|
|
vld vr5, a2, 16
|
|
vld vr6, a4, 0
|
|
vld vr7, a4, 16
|
|
vavgr.bu vr0, vr0, vr2
|
|
vavgr.bu vr1, vr1, vr3
|
|
vavgr.bu vr4, vr4, vr6
|
|
vavgr.bu vr5, vr5, vr7
|
|
|
|
vst vr0, a0, 0
|
|
vstelm.w vr1, a0, 16, 0
|
|
add.d a0, a0, a1
|
|
vst vr4, a0, 0
|
|
vstelm.w vr5, a0, 16, 0
|
|
add.d a2, a2, a3
|
|
add.d a4, a4, a3
|
|
add.d a0, a0, a1
|
|
blt zero, a5, .LOOP_AVG2_W20
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void mc_copy_width16( uint8_t *p_dst, int32_t i_dst_stride,
|
|
* uint8_t *p_src, int32_t i_src_stride,
|
|
* int32_t i_height )
|
|
*/
|
|
function_x264 mc_copy_w16_lsx
|
|
slli.d t0, a3, 1
|
|
add.d t1, t0, a3
|
|
slli.d t2, a1, 1
|
|
add.d t3, t2, a1
|
|
.LOOP_COPY_W16:
|
|
vld vr1, a2, 0
|
|
vldx vr2, a2, a3
|
|
vldx vr3, a2, t0
|
|
vldx vr4, a2, t1
|
|
|
|
vst vr1, a0, 0
|
|
vstx vr2, a0, a1
|
|
vstx vr3, a0, t2
|
|
vstx vr4, a0, t3
|
|
alsl.d a0, a1, a0, 2
|
|
alsl.d a2, a3, a2, 2
|
|
addi.w a4, a4, -4
|
|
blt zero, a4, .LOOP_COPY_W16
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void mc_copy_w8(uint8_t *p_dst, intptr_t i_dst_stride,
|
|
* uint8_t *p_src, intptr_t i_src_stride,
|
|
* int32_t i_height)
|
|
*/
|
|
function_x264 mc_copy_w8_lsx
|
|
slli.d t0, a3, 1
|
|
add.d t1, t0, a3
|
|
slli.d t2, a1, 1
|
|
add.d t3, t2, a1
|
|
.LOOP_COPY_W8:
|
|
fld.d f0, a2, 0
|
|
fldx.d f1, a2, a3
|
|
fldx.d f2, a2, t0
|
|
fldx.d f3, a2, t1
|
|
|
|
fst.d f0, a0, 0
|
|
fstx.d f1, a0, a1
|
|
fstx.d f2, a0, t2
|
|
fstx.d f3, a0, t3
|
|
alsl.d a0, a1, a0, 2
|
|
alsl.d a2, a3, a2, 2
|
|
addi.w a4, a4, -4
|
|
blt zero, a4, .LOOP_COPY_W8
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void mc_copy_w4(uint8_t *p_dst, intptr_t i_dst_stride,
|
|
* uint8_t *p_src, intptr_t i_src_stride,
|
|
* int32_t i_height)
|
|
*/
|
|
function_x264 mc_copy_w4_lsx
|
|
slli.d t0, a3, 1
|
|
add.d t1, t0, a3
|
|
slli.d t2, a1, 1
|
|
add.d t3, t2, a1
|
|
.LOOP_COPY_W4:
|
|
fld.s f0, a2, 0
|
|
fldx.s f1, a2, a3
|
|
fldx.s f2, a2, t0
|
|
fldx.s f3, a2, t1
|
|
|
|
fst.s f0, a0, 0
|
|
fstx.s f1, a0, a1
|
|
fstx.s f2, a0, t2
|
|
fstx.s f3, a0, t3
|
|
alsl.d a0, a1, a0, 2
|
|
alsl.d a2, a3, a2, 2
|
|
addi.w a4, a4, -4
|
|
blt zero, a4, .LOOP_COPY_W4
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void store_interleave_chroma(uint8_t *p_dst, intptr_t i_dst_stride,
|
|
* uint8_t *p_src0, uint8_t *p_src1,
|
|
* int32_t i_height)
|
|
*/
|
|
function_x264 store_interleave_chroma_lsx
|
|
.loop_interleave_chroma:
|
|
fld.d f0, a2, 0
|
|
fld.d f1, a3, 0
|
|
addi.d a2, a2, FDEC_STRIDE
|
|
addi.d a3, a3, FDEC_STRIDE
|
|
vilvl.b vr0, vr1, vr0
|
|
vst vr0, a0, 0
|
|
add.d a0, a0, a1
|
|
addi.w a4, a4, -1
|
|
blt zero, a4, .loop_interleave_chroma
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void load_deinterleave_chroma_fenc(pixel *dst, pixel *src,
|
|
* intptr_t i_src, int height)
|
|
*/
|
|
function_x264 load_deinterleave_chroma_fenc_lsx
|
|
addi.d t0, a0, FENC_STRIDE/2
|
|
andi t1, a3, 1
|
|
sub.w t2, a3, t1
|
|
.loop_deinterleave_fenc:
|
|
vld vr0, a1, 0
|
|
vldx vr1, a1, a2
|
|
vpickev.b vr2, vr1, vr0
|
|
vpickod.b vr3, vr1, vr0
|
|
fst.d f2, a0, 0
|
|
fst.d f3, t0, 0
|
|
vstelm.d vr2, a0, FENC_STRIDE, 1
|
|
vstelm.d vr3, t0, FENC_STRIDE, 1
|
|
addi.d a0, a0, FENC_STRIDE * 2
|
|
addi.d t0, t0, FENC_STRIDE * 2
|
|
alsl.d a1, a2, a1, 1
|
|
addi.w t2, t2, -2
|
|
blt zero, t2, .loop_deinterleave_fenc
|
|
|
|
beqz t1, .loop_deinterleave_fenc_end
|
|
vld vr0, a1, 0
|
|
vpickev.b vr1, vr0, vr0
|
|
vpickod.b vr2, vr0, vr0
|
|
fst.d f1, a0, 0
|
|
fst.d f2, t0, 0
|
|
.loop_deinterleave_fenc_end:
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void load_deinterleave_chroma_fdec(pixel *dst, pixel *src,
|
|
* intptr_t i_src, int height)
|
|
*/
|
|
function_x264 load_deinterleave_chroma_fdec_lsx
|
|
addi.d t0, a0, FDEC_STRIDE/2
|
|
andi t1, a3, 1
|
|
sub.w t2, a3, t1
|
|
.loop_deinterleave_fdec:
|
|
vld vr0, a1, 0
|
|
vldx vr1, a1, a2
|
|
vpickev.b vr2, vr1, vr0
|
|
vpickod.b vr3, vr1, vr0
|
|
fst.d f2, a0, 0
|
|
fst.d f3, t0, 0
|
|
vstelm.d vr2, a0, FDEC_STRIDE, 1
|
|
vstelm.d vr3, t0, FDEC_STRIDE, 1
|
|
addi.d a0, a0, FDEC_STRIDE * 2
|
|
addi.d t0, t0, FDEC_STRIDE * 2
|
|
alsl.d a1, a2, a1, 1
|
|
addi.w t2, t2, -2
|
|
blt zero, t2, .loop_deinterleave_fdec
|
|
|
|
beqz t1, .loop_deinterleave_fdec_end
|
|
vld vr0, a1, 0
|
|
vpickev.b vr1, vr0, vr0
|
|
vpickod.b vr2, vr0, vr0
|
|
fst.d f1, a0, 0
|
|
fst.d f2, t0, 0
|
|
.loop_deinterleave_fdec_end:
|
|
endfunc_x264
|
|
|
|
/*
|
|
* x264_plane_copy_interleave(pixel *dst, intptr_t i_dst,
|
|
* pixel *srcu, intptr_t i_srcu,
|
|
* pixel *srcv, intptr_t i_srcv, int w, int h)
|
|
*/
|
|
function_x264 plane_copy_interleave_core_lsx
|
|
.loop_h:
|
|
add.d t0, a0, zero
|
|
add.d t2, a2, zero
|
|
add.d t4, a4, zero
|
|
add.d t6, a6, zero
|
|
.loop_copy_interleavew16:
|
|
vld vr0, t2, 0
|
|
vld vr1, t4, 0
|
|
vilvl.b vr2, vr1, vr0
|
|
vilvh.b vr3, vr1, vr0
|
|
vst vr2, t0, 0
|
|
vst vr3, t0, 16
|
|
addi.d t2, t2, 16
|
|
addi.d t4, t4, 16
|
|
addi.d t0, t0, 32
|
|
addi.w t6, t6, -16
|
|
blt zero, t6, .loop_copy_interleavew16
|
|
|
|
add.d a2, a2, a3
|
|
add.d a4, a4, a5
|
|
add.d a0, a0, a1
|
|
addi.w a7, a7, -1
|
|
blt zero, a7, .loop_h
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void x264_plane_copy_deinterleave(pixel *dsta, intptr_t i_dsta,
|
|
* pixel *dstb, intptr_t i_dstb,
|
|
* pixel *src, intptr_t i_src, int w, int h)
|
|
*/
|
|
function_x264 plane_copy_deinterleave_lsx
|
|
.LOOP_PLANE_COPY_H:
|
|
add.d t0, a0, zero
|
|
add.d t2, a2, zero
|
|
add.d t4, a4, zero
|
|
add.d t6, a6, zero
|
|
.LOOP_PLANE_COPY_W16:
|
|
vld vr0, t4, 0
|
|
vld vr1, t4, 16
|
|
vpickev.b vr2, vr1, vr0
|
|
vpickod.b vr3, vr1, vr0
|
|
vst vr2, t0, 0
|
|
vst vr3, t2, 0
|
|
addi.d t4, t4, 32
|
|
addi.d t0, t0, 16
|
|
addi.d t2, t2, 16
|
|
addi.w t6, t6, -16
|
|
blt zero, t6, .LOOP_PLANE_COPY_W16
|
|
|
|
add.d a2, a2, a3
|
|
add.d a4, a4, a5
|
|
add.d a0, a0, a1
|
|
addi.w a7, a7, -1
|
|
blt zero, a7, .LOOP_PLANE_COPY_H
|
|
endfunc_x264
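
/*
 * Reference sketch (illustrative): plane_copy_deinterleave splits an
 * interleaved chroma plane (U0 V0 U1 V1 ...) into two planar outputs; the
 * vpickev.b / vpickod.b pair above selects the even and odd bytes of each
 * chunk.  plane_copy_interleave_core above is the inverse operation.
 *
 *   for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, src += i_src )
 *       for( int x = 0; x < w; x++ )
 *       {
 *           dsta[x] = src[2*x    ];   // even bytes -> first plane
 *           dstb[x] = src[2*x + 1];   // odd bytes  -> second plane
 *       }
 */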
|
|
|
|
function_x264 plane_copy_deinterleave_lasx
|
|
.LOOP_PLANE_COPY_H_LASX:
|
|
add.d t0, a0, zero
|
|
add.d t2, a2, zero
|
|
add.d t4, a4, zero
|
|
add.d t6, a6, zero
|
|
.LOOP_PLANE_COPY_W32_LASX:
|
|
xvld xr0, t4, 0
|
|
xvld xr1, t4, 32
|
|
xvpickev.b xr2, xr1, xr0
|
|
xvpickod.b xr3, xr1, xr0
|
|
xvpermi.d xr2, xr2, 0xd8
|
|
xvpermi.d xr3, xr3, 0xd8
|
|
xvst xr2, t0, 0
|
|
xvst xr3, t2, 0
|
|
addi.d t4, t4, 64
|
|
addi.d t0, t0, 32
|
|
addi.d t2, t2, 32
|
|
addi.w t6, t6, -32
|
|
blt zero, t6, .LOOP_PLANE_COPY_W32_LASX
|
|
|
|
add.d a2, a2, a3
|
|
add.d a4, a4, a5
|
|
add.d a0, a0, a1
|
|
addi.w a7, a7, -1
|
|
blt zero, a7, .LOOP_PLANE_COPY_H_LASX
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void prefetch_ref(uint8_t *pix, intptr_t stride, int32_t parity)
|
|
*/
|
|
function_x264 prefetch_ref_lsx
|
|
addi.d a2, a2, -1
|
|
addi.d a0, a0, 64
|
|
and a2, a2, a1
|
|
alsl.d t1, a2, a0, 3
|
|
alsl.d a2, a1, a1, 1
|
|
preld 0, t1, 0
|
|
add.d t2, t1, a1
|
|
preld 0, t2, 0
|
|
add.d t2, t2, a1
|
|
preld 0, t2, 0
|
|
add.d t1, t1, a2
|
|
preld 0, t1, 0
|
|
alsl.d a0, a1, t2, 1
|
|
preld 0, a0, 0
|
|
add.d t1, a0, a1
|
|
preld 0, t1, 0
|
|
add.d t1, t1, a1
|
|
preld 0, t1, 0
|
|
add.d a0, a0, a2
|
|
preld 0, a0, 0
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void prefetch_fenc_422(uint8_t *pix_y, intptr_t stride_y,
|
|
* uint8_t *pix_uv, intptr_t stride_uv,
|
|
* int32_t mb_x)
|
|
*/
|
|
function_x264 prefetch_fenc_422_lsx
|
|
andi t0, a4, 3
|
|
mul.d t0, t0, a1
|
|
andi a4, a4, 6
|
|
mul.d t1, a4, a3
|
|
addi.d a0, a0, 64
|
|
addi.d a2, a2, 64
|
|
alsl.d a0, t0, a0, 2
|
|
preld 0, a0, 0
|
|
add.d t2, a0, a1
|
|
preld 0, t2, 0
|
|
add.d a0, t2, a1
|
|
preld 0, a0, 0
|
|
add.d a0, a0, a1
|
|
preld 0, a0, 0
|
|
alsl.d a2, t1, a2, 2
|
|
preld 0, a2, 0
|
|
add.d t3, a2, a3
|
|
preld 0, t3, 0
|
|
add.d a2, t3, a3
|
|
preld 0, a2, 0
|
|
add.d a2, a2, a3
|
|
preld 0, a2, 0
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void prefetch_fenc_420(uint8_t *pix_y, intptr_t stride_y,
|
|
* uint8_t *pix_uv, intptr_t stride_uv,
|
|
* int32_t mb_x)
|
|
*/
|
|
function_x264 prefetch_fenc_420_lsx
|
|
andi t0, a4, 3
|
|
mul.d t0, t0, a1
|
|
andi a4, a4, 6
|
|
mul.d t1, a4, a3
|
|
addi.d a0, a0, 64
|
|
addi.d a2, a2, 64
|
|
alsl.d a0, t0, a0, 2
|
|
preld 0, a0, 0
|
|
add.d t2, a0, a1
|
|
preld 0, t2, 0
|
|
add.d a0, t2, a1
|
|
preld 0, a0, 0
|
|
add.d a0, a0, a1
|
|
preld 0, a0, 0
|
|
alsl.d a2, t1, a2, 2
|
|
preld 0, a2, 0
|
|
add.d a2, a2, a3
|
|
preld 0, a2, 0
|
|
endfunc_x264
|
|
|
|
/*
|
|
* void *memcpy_aligned(void *dst, const void *src, size_t n)
|
|
*/
|
|
function_x264 memcpy_aligned_lsx
|
|
andi t0, a2, 16
|
|
beqz t0, 2f
|
|
addi.d a2, a2, -16
|
|
vld vr0, a1, 0
|
|
vst vr0, a0, 0
|
|
addi.d a1, a1, 16
|
|
addi.d a0, a0, 16
|
|
2:
|
|
andi t0, a2, 32
|
|
beqz t0, 3f
|
|
addi.d a2, a2, -32
|
|
vld vr0, a1, 0
|
|
vld vr1, a1, 16
|
|
vst vr0, a0, 0
|
|
vst vr1, a0, 16
|
|
addi.d a1, a1, 32
|
|
addi.d a0, a0, 32
|
|
3:
|
|
beqz a2, 5f
|
|
4:
|
|
addi.d a2, a2, -64
|
|
vld vr0, a1, 48
|
|
vld vr1, a1, 32
|
|
vld vr2, a1, 16
|
|
vld vr3, a1, 0
|
|
vst vr0, a0, 48
|
|
vst vr1, a0, 32
|
|
vst vr2, a0, 16
|
|
vst vr3, a0, 0
|
|
addi.d a1, a1, 64
|
|
addi.d a0, a0, 64
|
|
blt zero, a2, 4b
|
|
5:
|
|
endfunc_x264
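
/*
 * Reference behaviour (illustrative): n is assumed to be a multiple of 16 and
 * the buffers aligned; a 16-byte chunk is peeled off if bit 4 of n is set,
 * then a 32-byte chunk if bit 5 is set, and the remainder is copied in
 * 64-byte blocks:
 *
 *   uint8_t *d = dst;
 *   const uint8_t *s = src;
 *   if( n & 16 ) { memcpy( d, s, 16 ); d += 16; s += 16; n -= 16; }
 *   if( n & 32 ) { memcpy( d, s, 32 ); d += 32; s += 32; n -= 32; }
 *   while( n )   { memcpy( d, s, 64 ); d += 64; s += 64; n -= 64; }
 */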
|
|
|
|
/*
|
|
* void memzero_aligned(void *p_dst, size_t n)
|
|
*/
|
|
function_x264 memzero_aligned_lsx
|
|
vxor.v vr1, vr1, vr1
|
|
.loop_memzero:
|
|
addi.d a1, a1, -128
|
|
vst vr1, a0, 0
|
|
vst vr1, a0, 16
|
|
vst vr1, a0, 32
|
|
vst vr1, a0, 48
|
|
vst vr1, a0, 64
|
|
vst vr1, a0, 80
|
|
vst vr1, a0, 96
|
|
vst vr1, a0, 112
|
|
addi.d a0, a0, 128
|
|
blt zero, a1, .loop_memzero
|
|
endfunc_x264
|
|
|
|
.macro FILT_H_LSX s1, s2, s3
|
|
vsub.h \s1, \s1, \s2
|
|
vsrai.h \s1, \s1, 2
|
|
vsub.h \s1, \s1, \s2
|
|
vadd.h \s1, \s1, \s3
|
|
vsrai.h \s1, \s1, 2
|
|
vadd.h \s1, \s1, \s3
|
|
.endm
|
|
|
|
//s1: s1.0, s2: s2.0, s3: s3.0, s4: s1.1 s5: s2.1 s6: s3.1
|
|
.macro FILT_C_LSX s1, s2, s3, s4, s5, s6
|
|
vaddi.bu vr17, vr23, 2 //vr24
|
|
vaddi.bu vr19, vr26, 1 //vr27
|
|
vaddi.bu vr18, vr26, 3 //vr29
|
|
|
|
vshuf.b vr1, \s2, \s4, vr23
|
|
vshuf.b vr2, \s2, \s4, vr17
|
|
vshuf.b vr3, \s5, \s2, vr18
|
|
vshuf.b vr4, \s5, \s2, vr19
|
|
vadd.h vr3, vr2, vr3
|
|
|
|
vshuf.b vr16, \s5, \s2, vr23
|
|
vshuf.b vr17, \s5, \s2, vr17
|
|
vshuf.b vr18, \s3, \s5, vr18
|
|
vshuf.b vr19, \s3, \s5, vr19
|
|
vadd.h vr18, vr17, vr18
|
|
|
|
vmov vr2, \s5
|
|
vmov \s1, \s3
|
|
vmov vr20, \s3
|
|
vmov \s4, \s6
|
|
|
|
vaddi.bu vr17, vr26, 5 //vr30
|
|
|
|
vshuf.b \s3, vr2, \s2, vr17
|
|
vshuf.b \s6, vr20, \s5, vr17
|
|
|
|
vadd.h vr4, vr4, \s2
|
|
vadd.h \s3, \s3, vr1
|
|
vadd.h vr19, vr19, \s5
|
|
vadd.h \s6, \s6, vr16
|
|
|
|
FILT_H_LSX \s3, vr3, vr4
|
|
FILT_H_LSX \s6, vr18, vr19
|
|
.endm
|
|
|
|
.macro FILT_PACK_LSX s1, s2, s3
|
|
vmulwev.w.h vr16, \s1, \s3
|
|
vmulwev.w.h vr17, \s2, \s3
|
|
vsrarni.h.w vr17, vr16, 15
|
|
vmaxi.h vr17, vr17, 0
|
|
vsat.hu vr17, vr17, 7
|
|
vmulwod.w.h vr18, \s1, \s3
|
|
vmulwod.w.h vr19, \s2, \s3
|
|
vsrarni.h.w vr19, vr18, 15
|
|
vmaxi.h vr19, vr19, 0
|
|
vsat.hu vr19, vr19, 7
|
|
vpackev.b \s1, vr19, vr17
|
|
.endm
|
|
|
|
//s1: s1.0, s2: s2.0, s3: s3.0, s4: s4.0
|
|
//s5: s1.1, s6: s2.1, s7: s3.1, s8: s4.1
|
|
|
|
.macro DO_FILT_C_LSX s1, s2, s3, s4, s5, s6, s7, s8
|
|
FILT_C_LSX \s1, \s2, \s3, \s5, \s6, \s7
|
|
FILT_C_LSX \s2, \s1, \s4, \s6, \s5, \s8
|
|
FILT_PACK_LSX \s3, \s4, vr15
|
|
FILT_PACK_LSX \s7, \s8, vr15
|
|
vilvl.d vr16, \s7, \s3
|
|
vilvh.d vr17, \s7, \s3
|
|
addi.d t3, a5, 16
|
|
vstx vr16, a5, a4
|
|
vstx vr17, t3, a4
|
|
.endm
|
|
|
|
// horizontal 6-tap filter on the source bytes; two 16-byte results are stored to dsth
.macro DO_FILT_H_LSX s1, s2, s3, s4, s5, s6
vaddi.bu vr16, vr23, 2 //vr24
vaddi.bu vr17, vr23, 3 //vr25
vaddi.bu vr18, vr26, 1 //vr27
vaddi.bu vr19, vr26, 2 //vr28
vld vr3, t5, 0

vshuf.b vr1, \s2, \s4, vr16
vshuf.b vr2, \s2, \s4, vr17
vshuf.b vr4, \s5, \s2, vr26
vshuf.b vr5, \s5, \s2, vr18
vshuf.b vr6, \s5, \s2, vr19

vdp2.h.bu.b vr16, vr1, vr12
vdp2.h.bu.b vr17, vr2, vr12
vdp2.h.bu.b vr18, \s2, vr14
vdp2.h.bu.b vr19, vr4, vr14
vdp2.h.bu.b vr20, vr5, vr0
vdp2.h.bu.b vr21, vr6, vr0
vadd.h vr1, vr16, vr18
vadd.h vr2, vr17, vr19
vadd.h vr1, vr1, vr20
vadd.h vr2, vr2, vr21
FILT_PACK_LSX vr1, vr2, vr15
vshuf.b vr1, vr1, vr1, vr3
vstx vr1, a0, a4

vaddi.bu vr16, vr23, 2 //vr24
vaddi.bu vr17, vr23, 3 //vr25
vaddi.bu vr18, vr26, 1 //vr27
vaddi.bu vr19, vr26, 2 //vr28

vshuf.b vr1, \s5, \s2, vr16
vshuf.b vr2, \s5, \s2, vr17
vshuf.b vr4, \s3, \s5, vr26
vshuf.b vr5, \s3, \s5, vr18
vshuf.b vr6, \s3, \s5, vr19

vdp2.h.bu.b vr16, vr1, vr12
vdp2.h.bu.b vr17, vr2, vr12
vdp2.h.bu.b vr18, \s5, vr14
vdp2.h.bu.b vr19, vr4, vr14
vdp2.h.bu.b vr20, vr5, vr0
vdp2.h.bu.b vr21, vr6, vr0
vadd.h vr1, vr16, vr18
vadd.h vr2, vr17, vr19
vadd.h vr1, vr1, vr20
vadd.h vr2, vr2, vr21
FILT_PACK_LSX vr1, vr2, vr15
vshuf.b vr1, vr1, vr1, vr3
addi.d a0, a0, 16
vstx vr1, a0, a4
addi.d a0, a0, -16

vmov \s1, \s2
vmov \s2, \s3
vmov \s4, \s5
vmov \s5, \s6
.endm

/* s3: temp, s4: UNUSED, s5: imm */
// vertical 6-tap filter over six source rows for 16 columns; the packed
// bytes are stored to dstv and the 16-bit sums are kept in \s1/\s2
.macro DO_FILT_V0_LSX s1, s2, s3, s4, s5
alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */
alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */
vld vr1, a3, 0
vldx vr2, a3, a2
vld \s3, t2, 0
vld vr3, a1, 0
vldx \s1, a1, a2
vld \s2, t1, 0
vilvh.b vr16, vr2, vr1
vilvl.b vr17, vr2, vr1
vilvh.b vr18, \s2, \s1
vilvl.b vr19, \s2, \s1
vilvh.b vr20, \s3, vr3
vilvl.b vr21, \s3, vr3
vdp2.h.bu.b vr1, vr17, vr12
vdp2.h.bu.b vr4, vr16, vr12
vdp2.h.bu.b \s1, vr19, vr0
vdp2.h.bu.b vr2, vr18, vr0
vdp2.h.bu.b vr3, vr21, vr14
vdp2.h.bu.b \s2, vr20, vr14
vadd.h vr1, vr1, \s1
vadd.h vr4, vr4, vr2
vadd.h vr1, vr1, vr3
vadd.h vr4, vr4, \s2
vmov \s1, vr1
vmov \s2, vr4
addi.d a3, a3, 16
addi.d a1, a1, 16
FILT_PACK_LSX vr1, vr4, vr15
addi.d t3, a4, \s5
vstx vr1, t0, t3
.endm

// same as DO_FILT_V0_LSX, but for the next 16 columns (loads/stores at +16)
.macro DO_FILT_V1_LSX s1, s2, s3, s4, s5
vld vr1, a3, 0
vldx vr2, a3, a2
vld \s3, t2, 16
vld vr3, a1, 0
vldx \s1, a1, a2
vld \s2, t1, 16
vilvh.b vr16, vr2, vr1
vilvl.b vr17, vr2, vr1
vilvh.b vr18, \s2, \s1
vilvl.b vr19, \s2, \s1
vilvh.b vr20, \s3, vr3
vilvl.b vr21, \s3, vr3
vdp2.h.bu.b vr1, vr17, vr12
vdp2.h.bu.b vr4, vr16, vr12
vdp2.h.bu.b \s1, vr19, vr0
vdp2.h.bu.b vr2, vr18, vr0
vdp2.h.bu.b vr3, vr21, vr14
vdp2.h.bu.b \s2, vr20, vr14
vadd.h vr1, vr1, \s1
vadd.h vr4, vr4, vr2
vadd.h vr1, vr1, vr3
vadd.h vr4, vr4, \s2
vmov \s1, vr1
vmov \s2, vr4
addi.d a3, a3, 16
addi.d a1, a1, 16
FILT_PACK_LSX vr1, vr4, vr15
addi.d t3, a4, \s5
addi.d t3, t3, 16
vstx vr1, t0, t3
.endm

/*
 * void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
 *                   uint8_t *src, intptr_t stride, int width, int height )
 */
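/*
 * Scalar sketch of the per-pixel result (illustrative only; TAP6 and
 * clip_uint8 are names assumed here, not taken from the C sources): the
 * H.264 6-tap filter (1,-5,20,20,-5,1) is applied horizontally for dsth,
 * vertically for dstv, and in both directions for dstc.
 *
 *   #define TAP6( p, i, d ) (     (p)[(i)-2*(d)] -  5*(p)[(i)-(d)]   \
 *                            + 20*(p)[i]         + 20*(p)[(i)+(d)]   \
 *                            -  5*(p)[(i)+2*(d)] +    (p)[(i)+3*(d)] )
 *
 *   dsth[x] = clip_uint8( (TAP6( src, x, 1 )      + 16) >> 5 );
 *   dstv[x] = clip_uint8( (TAP6( src, x, stride ) + 16) >> 5 );
 *   // dstc applies the same taps horizontally to the 16-bit vertical
 *   // sums, with a net rounding shift of >> 10 before clipping.
 */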
function_x264 hpel_filter_lsx
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56

move a7, a3
addi.d a5, a5, -32
move t0, a1
andi a7, a7, 31
sub.d a3, a3, a7
add.d a0, a0, a5
add.d t0, t0, a5
add.d a7, a7, a5
add.d a5, a5, a2
move a2, a4
sub.d a7, zero, a7
add.d a1, a3, a2
sub.d a3, a3, a2
sub.d a3, a3, a2
move a4, a7
la.local t1, filt_mul51
vld vr0, t1, 0
la.local t2, filt_mul15
vld vr12, t2, 0
la.local t3, filt_mul20
vld vr14, t3, 0
la.local t4, pw_1024
vld vr15, t4, 0
la.local t5, hpel_shuf
la.local t2, shuf_12
vld vr23, t2, 0
la.local t3, shuf_1
vld vr26, t3, 0
vxor.v vr9, vr9, vr9
vxor.v vr10, vr10, vr10
vxor.v vr11, vr11, vr11
vxor.v vr13, vr13, vr13
.LOOPY_LSX:
DO_FILT_V0_LSX vr24, vr25, vr31, vr12, 0
DO_FILT_V1_LSX vr8, vr7, vr22, vr12, 0
.LOOPX_LSX:
DO_FILT_V0_LSX vr27, vr28, vr29, vr12, 32
DO_FILT_V1_LSX vr6, vr5, vr30, vr12, 32
.LSTX:
vsrli.h vr15, vr15, 1
DO_FILT_C_LSX vr9, vr24, vr8, vr27, vr10, vr25, vr7, vr28
vadd.h vr15, vr15, vr15
vmov vr8, vr6
vmov vr7, vr5

DO_FILT_H_LSX vr11, vr31, vr29, vr13, vr22, vr30
addi.d a4, a4, 32
blt a4, zero, .LOOPX_LSX
addi.d t1, a4, -32
blt t1, zero, .LSTX
// setup regs for next y
sub.d a4, a4, a7
sub.d a4, a4, a2
sub.d a1, a1, a4
sub.d a3, a3, a4
add.d a0, a0, a2
add.d t0, t0, a2
add.d a5, a5, a2
move a4, a7
addi.d a6, a6, -1
blt zero, a6, .LOOPY_LSX
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264

/*
 * void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth,
 *                             pixel *dstv, pixel *dstc, intptr_t src_stride,
 *                             intptr_t dst_stride, int width, int height)
 */
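/*
 * Illustrative scalar model (a sketch, not copied from the C sources;
 * FILTER, src1 and src2 are names assumed here, with src1/src2 being the
 * next two source rows after src0): each output plane is a half-resolution
 * image built from a 2x2 neighbourhood at one of the four half-pel phases,
 * using two rounded averages per sample.
 *
 *   #define FILTER( a, b, c, d ) \
 *       (( (((a)+(b)+1)>>1) + (((c)+(d)+1)>>1) + 1 ) >> 1)
 *
 *   for( int x = 0; x < width; x++ )
 *   {
 *       dst0[x] = FILTER( src0[2*x],   src1[2*x],   src0[2*x+1], src1[2*x+1] );
 *       dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
 *       dstv[x] = FILTER( src1[2*x],   src2[2*x],   src1[2*x+1], src2[2*x+1] );
 *       dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
 *   }
 */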
function_x264 frame_init_lowres_core_lsx
addi.d t0, zero, 15
addi.d t1, zero, 7
addi.d t2, zero, 3
addi.d t3, zero, 1
ld.d t4, sp, 0
addi.d sp, sp, -16
st.d s0, sp, 0
st.d s1, sp, 8
slli.d s0, a5, 1
.LOOPH:
bge zero, t4, .ENDLOOPH
addi.d t4, t4, -1
add.d t5, a0, a5
add.d t7, t5, a5
move t6, a7
.LOOPW16:
bge t0, t6, .LOOPW8
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1
vld vr6, a0, 16
vld vr7, t5, 16
vld vr8, t7, 16
vld vr9, a0, 17
vld vr10, t5, 17
vld vr11, t7, 17

// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vavgr.bu vr16, vr6, vr7
vavgr.bu vr17, vr7, vr8
vavgr.bu vr18, vr9, vr10
vavgr.bu vr19, vr10, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vhaddw.hu.bu vr17, vr17, vr17
vhaddw.hu.bu vr18, vr18, vr18
vhaddw.hu.bu vr19, vr19, vr19
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vssrarni.bu.h vr17, vr16, 1
vssrarni.bu.h vr19, vr18, 1
vilvl.d vr12, vr17, vr13
vilvl.d vr14, vr19, vr15
vilvh.d vr13, vr17, vr13
vilvh.d vr15, vr19, vr15
vst vr12, a1, 0
vst vr14, a2, 0
vst vr13, a3, 0
vst vr15, a4, 0

addi.d a1, a1, 16
addi.d a2, a2, 16
addi.d a3, a3, 16
addi.d a4, a4, 16
addi.d a0, a0, 32
addi.d t5, t5, 32
addi.d t7, t7, 32
addi.d t6, t6, -16
b .LOOPW16
.LOOPW8:
bge t1, t6, .LOOPW4
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1

// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.d vr13, a1, 0, 0
vstelm.d vr15, a2, 0, 0
vstelm.d vr13, a3, 0, 1
vstelm.d vr15, a4, 0, 1

addi.d a1, a1, 8
addi.d a2, a2, 8
addi.d a3, a3, 8
addi.d a4, a4, 8
addi.d a0, a0, 16
addi.d t5, t5, 16
addi.d t7, t7, 16
addi.d t6, t6, -8
b .LOOPW8
.LOOPW4:
bge t2, t6, .LOOPW2
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1

// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.w vr13, a1, 0, 0
vstelm.w vr15, a2, 0, 0
vstelm.w vr13, a3, 0, 2
vstelm.w vr15, a4, 0, 2

addi.d a1, a1, 4
addi.d a2, a2, 4
addi.d a3, a3, 4
addi.d a4, a4, 4
addi.d a0, a0, 8
addi.d t5, t5, 8
addi.d t7, t7, 8
addi.d t6, t6, -4
b .LOOPW4
.LOOPW2:
bge t3, t6, .LOOPW1
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1

// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.h vr13, a1, 0, 0
vstelm.h vr15, a2, 0, 0
vstelm.h vr13, a3, 0, 4
vstelm.h vr15, a4, 0, 4

addi.d a1, a1, 2
addi.d a2, a2, 2
addi.d a3, a3, 2
addi.d a4, a4, 2
addi.d a0, a0, 4
addi.d t5, t5, 4
addi.d t7, t7, 4
addi.d t6, t6, -2
b .LOOPW2
.LOOPW1:
bge zero, t6, .ENDLOOPW1
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1

// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.b vr13, a1, 0, 0
vstelm.b vr15, a2, 0, 0
vstelm.b vr13, a3, 0, 8
vstelm.b vr15, a4, 0, 8
.ENDLOOPW1:
sub.d s1, a7, t6
sub.d a0, a0, s1
sub.d a0, a0, s1
add.d a0, a0, s0
sub.d a1, a1, s1
add.d a1, a1, a6
sub.d a2, a2, s1
add.d a2, a2, a6
sub.d a3, a3, s1
add.d a3, a3, a6
sub.d a4, a4, s1
add.d a4, a4, a6
b .LOOPH
.ENDLOOPH:
ld.d s0, sp, 0
ld.d s1, sp, 8
addi.d sp, sp, 16
endfunc_x264
#endif /* !HIGH_BIT_DEPTH */