/*****************************************************************************
* mc-a.S: LoongArch motion compensation
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "loongson_asm.S"
#include "loongson_util.S"
const ch_shuf
.byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9
.byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9
endconst
const pw_1024
.rept 16
.short 1024
.endr
endconst
const filt_mul20
.rept 32
.byte 20
.endr
endconst
const filt_mul15
.rept 16
.byte 1, -5
.endr
endconst
const filt_mul51
.rept 16
.byte -5, 1
.endr
endconst
const hpel_shuf
.rept 2
.byte 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
.endr
endconst
const shuf_12
.rept 2
.byte 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
.endr
endconst
const shuf_14
.rept 2
.byte 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
.endr
endconst
const shuf_15
.rept 2
.byte 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
.endr
endconst
const shuf_1
.rept 2
.byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
.endr
endconst
const shuf_2
.rept 2
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
.endr
endconst
const shuf_3
.rept 2
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
.endr
endconst
const shuf_4
.rept 2
.byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
.endr
endconst
const shuf_6
.rept 2
.byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
.endr
endconst
#if !HIGH_BIT_DEPTH
.macro MC_CHROMA_START
srai.d t0, a5, 3
srai.d t1, a6, 3
slli.d t0, t0, 1
mul.d t1, t1, a4
add.d t1, t1, t0
add.d a3, a3, t1 /* src += (m_vy >> 3) * i_src_stride + (m_vx >> 3) * 2 */
.endm
/*
* void mc_chroma( uint8_t *p_dst_u, uint8_t *p_dst_v,
* intptr_t i_dst_stride,
* uint8_t *p_src, intptr_t i_src_stride,
* int32_t m_vx, int32_t m_vy,
* int32_t i_width, int32_t i_height )
*/
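/*
 * Not part of the build: a C reference sketch of the bilinear chroma
 * interpolation below, assuming the usual x264 weights for an interleaved
 * (NV12-style) UV source. The asm folds cA..cD into the packed
 * (x * 255 + 8) * y and (x * 255 + 8) * (8 - y) multipliers noted in the
 * register comments.
 *
 *   int d8x = m_vx & 7, d8y = m_vy & 7;
 *   int cA = (8-d8x)*(8-d8y), cB = d8x*(8-d8y);
 *   int cC = (8-d8x)*d8y,     cD = d8x*d8y;
 *   uint8_t *src  = p_src + (m_vy >> 3) * i_src_stride + (m_vx >> 3) * 2;
 *   uint8_t *srcp = src + i_src_stride;
 *   for( int y = 0; y < i_height; y++ )
 *   {
 *       for( int x = 0; x < i_width; x++ )
 *       {
 *           p_dst_u[x] = ( cA*src[2*x]    + cB*src[2*x+2] +
 *                          cC*srcp[2*x]   + cD*srcp[2*x+2] + 32 ) >> 6;
 *           p_dst_v[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
 *                          cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
 *       }
 *       p_dst_u += i_dst_stride;
 *       p_dst_v += i_dst_stride;
 *       src = srcp;
 *       srcp += i_src_stride;
 *   }
 */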
function_x264 mc_chroma_lasx
MC_CHROMA_START
andi a5, a5, 0x07 /* m_vx & 0x07 */
andi a6, a6, 0x07 /* m_vy & 0x07 */
move t0, a5
slli.d t0, t0, 8
sub.d t0, t0, a5
li.d a5, 8
addi.d t0, t0, 8
sub.d a5, a5, a6
mul.d a6, a6, t0 /* (x * 255 + 8) * y */
mul.d a5, a5, t0 /* (x * 255 + 8) * (8 - y) */
xvreplgr2vr.h xr6, a6 /* cD cC ... cD cC */
xvreplgr2vr.h xr7, a5 /* cB cA ... cB cA */
la.local t0, ch_shuf
xvld xr5, t0, 0
addi.d t0, a7, -4
ldptr.w a7, sp, 0 /* a7 = i_height */
slli.d t1, a4, 1
blt zero, t0, .L_WIDTH8
.L_LOOP4:
vld vr0, a3, 0
vldx vr1, a3, a4
vldx vr2, a3, t1
xvpermi.q xr0, xr1, 0x02
xvpermi.q xr1, xr2, 0x02
xvshuf.b xr0, xr0, xr0, xr5
xvshuf.b xr1, xr1, xr1, xr5
xvdp2.h.bu xr2, xr0, xr7
xvdp2.h.bu xr3, xr1, xr6
xvadd.h xr0, xr2, xr3
xvssrlrni.bu.h xr0, xr0, 6
xvstelm.w xr0, a0, 0, 0
xvstelm.w xr0, a1, 0, 1
add.d a0, a0, a2
add.d a1, a1, a2
xvstelm.w xr0, a0, 0, 4
xvstelm.w xr0, a1, 0, 5
add.d a0, a0, a2
add.d a1, a1, a2
add.d a3, a3, t1
addi.d a7, a7, -2
blt zero, a7, .L_LOOP4
b .ENDFUNC
.L_WIDTH8:
xvld xr0, a3, 0
xvpermi.d xr0, xr0, 0x94
xvshuf.b xr0, xr0, xr0, xr5
.L_LOOP8:
xvldx xr3, a3, a4
xvpermi.d xr3, xr3, 0x94
xvshuf.b xr3, xr3, xr3, xr5
xvdp2.h.bu xr1, xr0, xr7
xvdp2.h.bu xr2, xr3, xr6
xvdp2.h.bu xr8, xr3, xr7
xvldx xr0, a3, t1
xvpermi.d xr0, xr0, 0x94
xvshuf.b xr0, xr0, xr0, xr5
xvdp2.h.bu xr4, xr0, xr6
xvadd.h xr1, xr1, xr2
xvadd.h xr3, xr8, xr4
xvssrlrni.bu.h xr3, xr1, 6
xvpermi.q xr4, xr3, 0x01
xvpackev.w xr8, xr4, xr3
xvpackod.w xr9, xr4, xr3
vstelm.d vr8, a0, 0, 0
vstelm.d vr9, a1, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vstelm.d vr8, a0, 0, 1
vstelm.d vr9, a1, 0, 1
addi.d a7, a7, -2
add.d a0, a0, a2
add.d a1, a1, a2
add.d a3, a3, t1
blt zero, a7, .L_LOOP8
.ENDFUNC:
endfunc_x264
.macro PIXEL_AVG_START
slli.d t0, a3, 1
add.w t1, t0, a3
slli.d t2, a3, 2
slli.d t3, a5, 1
add.w t4, t3, a5
slli.d t5, a5, 2
slli.d t6, a1, 1
add.w t7, t6, a1
slli.d t8, a1, 2
.endm
.macro BIWEIGHT_AVG_START
addi.d t0, zero, 64
sub.d t0, t0, a6
xvreplgr2vr.b xr0, a6
xvreplgr2vr.b xr1, t0
xvpackev.b xr8, xr1, xr0
xvxor.v xr9, xr9, xr9
xvaddi.hu xr9, xr9, 6
.endm
.macro BIWEIGHT_AVG_CORE a, b
xvpermi.d \a, \a, 0x50
xvpermi.d \b, \b, 0x50
xvilvl.b \a, \b, \a
xvmulwev.h.bu.b \b, \a, xr8
xvmaddwod.h.bu.b \b, \a, xr8
xvssrarn.bu.h \b, \b, xr9
xvpermi.d \b, \b, 0x08
.endm
.macro PIXEL_AVG_START_W8
slli.d t0, a3, 1
add.w t1, t0, a3
slli.d t3, a5, 1
add.w t4, t3, a5
.endm
function_x264 pixel_avg_weight_w4_lasx
addi.d t0, zero, 64
sub.d t0, t0, a6
vreplgr2vr.b vr0, a6
vreplgr2vr.b vr1, t0
vpackev.b vr8, vr1, vr0
.LOOP_HEIGHT_W4_1:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fld.s f2, a4, 0
fldx.s f3, a4, a5
vilvl.w vr0, vr1, vr0
vilvl.w vr2, vr3, vr2
vilvl.b vr0, vr2, vr0
vmulwev.h.bu.b vr1, vr0, vr8
vmaddwod.h.bu.b vr1, vr0, vr8
vssrarni.bu.h vr1, vr1, 6
fst.s f1, a0, 0
add.d a0, a0, a1
vstelm.w vr1, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a4, a5, a4, 1
addi.w a7, a7, -2
bnez a7, .LOOP_HEIGHT_W4_1
endfunc_x264
function_x264 pixel_avg_w4_lasx
.LOOP_HEIGHT_W4:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fld.s f4, a4, 0
fldx.s f5, a4, a5
vilvl.w vr0, vr1, vr0
vilvl.w vr4, vr5, vr4
vavgr.bu vr0, vr0, vr4
fst.s f0, a0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a4, a5, a4, 1
addi.w a7, a7, -2
bnez a7, .LOOP_HEIGHT_W4
endfunc_x264
function_x264 pixel_avg_weight_w8_lasx
addi.d t0, zero, 64
sub.d t0, t0, a6
xvreplgr2vr.b xr0, a6
xvreplgr2vr.b xr1, t0
xvpackev.b xr8, xr1, xr0
PIXEL_AVG_START_W8
.LOOP_HEIGHT_W8_1:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fldx.d f2, a2, t0
fldx.d f3, a2, t1
fld.d f4, a4, 0
fldx.d f5, a4, a5
fldx.d f6, a4, t3
fldx.d f7, a4, t4
vilvl.b vr0, vr4, vr0
vilvl.b vr1, vr5, vr1
vilvl.b vr2, vr6, vr2
vilvl.b vr3, vr7, vr3
xvpermi.q xr1, xr0, 0x20
xvpermi.q xr3, xr2, 0x20
xvmulwev.h.bu.b xr2, xr1, xr8
xvmaddwod.h.bu.b xr2, xr1, xr8
xvmulwev.h.bu.b xr4, xr3, xr8
xvmaddwod.h.bu.b xr4, xr3, xr8
xvssrarni.bu.h xr4, xr2, 6
fst.d f4, a0, 0
add.d a0, a0, a1
xvstelm.d xr4, a0, 0, 2
add.d a0, a0, a1
xvstelm.d xr4, a0, 0, 1
add.d a0, a0, a1
xvstelm.d xr4, a0, 0, 3
add.d a0, a0, a1
alsl.d a2, a3, a2, 2
alsl.d a4, a5, a4, 2
addi.w a7, a7, -4
bnez a7, .LOOP_HEIGHT_W8_1
endfunc_x264
function_x264 pixel_avg_w8_lasx
PIXEL_AVG_START_W8
.LOOP_HEIGHT_W8:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fldx.d f2, a2, t0
fldx.d f3, a2, t1
fld.d f4, a4, 0
fldx.d f5, a4, a5
fldx.d f6, a4, t3
fldx.d f7, a4, t4
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr6, vr7, vr6
vavgr.bu vr0, vr0, vr4
vavgr.bu vr2, vr2, vr6
fst.d f0, a0, 0
add.d a0, a0, a1
vstelm.d vr0, a0, 0, 1
fstx.d f2, a0, a1
alsl.d a0, a1, a0, 1
vstelm.d vr2, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 2
alsl.d a4, a5, a4, 2
addi.w a7, a7, -4
bnez a7, .LOOP_HEIGHT_W8
endfunc_x264
function_x264 pixel_avg_weight_w16_lasx
BIWEIGHT_AVG_START
PIXEL_AVG_START
.L_HEIGHT_LOOP_T:
LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7
BIWEIGHT_AVG_CORE xr0, xr4
BIWEIGHT_AVG_CORE xr1, xr5
vst vr4, a0, 0
vstx vr5, a0, a1
BIWEIGHT_AVG_CORE xr2, xr6
BIWEIGHT_AVG_CORE xr3, xr7
vstx vr6, a0, t6
vstx vr7, a0, t7
add.d a2, a2, t2
add.d a4, a4, t5
add.d a0, a0, t8
addi.d a7, a7, -4
bnez a7, .L_HEIGHT_LOOP_T
endfunc_x264
function_x264 pixel_avg_w16_lasx
PIXEL_AVG_START
.L_HEIGHT_LOOP:
vld vr0, a2, 0
vldx vr1, a2, a3
vldx vr2, a2, t0
vldx vr3, a2, t1
vld vr4, a4, 0
vldx vr5, a4, a5
vldx vr6, a4, t3
vldx vr7, a4, t4
vavgr.bu vr0, vr0, vr4
vavgr.bu vr1, vr1, vr5
vavgr.bu vr2, vr2, vr6
vavgr.bu vr3, vr3, vr7
vst vr0, a0, 0
vstx vr1, a0, a1
vstx vr2, a0, t6
vstx vr3, a0, t7
add.d a0, a0, t8
add.d a2, a2, t2
add.d a4, a4, t5
vld vr0, a2, 0
vldx vr1, a2, a3
vldx vr2, a2, t0
vldx vr3, a2, t1
vld vr4, a4, 0
vldx vr5, a4, a5
vldx vr6, a4, t3
vldx vr7, a4, t4
vavgr.bu vr0, vr0, vr4
vavgr.bu vr1, vr1, vr5
vavgr.bu vr2, vr2, vr6
vavgr.bu vr3, vr3, vr7
vst vr0, a0, 0
vstx vr1, a0, a1
vstx vr2, a0, t6
vstx vr3, a0, t7
add.d a2, a2, t2
add.d a4, a4, t5
add.d a0, a0, t8
addi.d a7, a7, -8
bnez a7, .L_HEIGHT_LOOP
endfunc_x264
.macro FILT_PACK_LASX s1, s2, s3
xvmulwev.w.h xr16, \s1, \s3
xvmulwev.w.h xr17, \s2, \s3
xvsrarni.h.w xr17, xr16, 15
xvmaxi.h xr17, xr17, 0
xvsat.hu xr17, xr17, 7
xvmulwod.w.h xr18, \s1, \s3
xvmulwod.w.h xr19, \s2, \s3
xvsrarni.h.w xr19, xr18, 15
xvmaxi.h xr19, xr19, 0
xvsat.hu xr19, xr19, 7
xvpackev.b \s1, xr19, xr17
.endm
/* s3: temp, s4: UNUSED, s5: imm */
.macro DO_FILT_V_LASX s1, s2, s3, s4, s5
alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */
alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */
xvld xr1, a3, 0
xvldx xr2, a3, a2
xvld \s3, t2, 0
xvld xr3, a1, 0
xvldx \s1, a1, a2
xvld \s2, t1, 0
xvilvh.b xr16, xr2, xr1
xvilvl.b xr17, xr2, xr1
xvilvh.b xr18, \s2, \s1
xvilvl.b xr19, \s2, \s1
xvilvh.b xr20, \s3, xr3
xvilvl.b xr21, \s3, xr3
xvdp2.h.bu.b xr1, xr17, xr12
xvdp2.h.bu.b xr4, xr16, xr12
xvdp2.h.bu.b \s1, xr19, xr0
xvdp2.h.bu.b xr2, xr18, xr0
xvdp2.h.bu.b xr3, xr21, xr14
xvdp2.h.bu.b \s2, xr20, xr14
xvadd.h xr1, xr1, \s1
xvadd.h xr4, xr4, xr2
xvadd.h xr1, xr1, xr3
xvadd.h xr4, xr4, \s2
xmov \s1, xr1
xmov \s2, xr1
addi.d a3, a3, 32
addi.d a1, a1, 32
xvpermi.q \s1, xr4, 0x2
xvpermi.q \s2, xr4, 0x13
FILT_PACK_LASX xr1, xr4, xr15
addi.d t1, a4, \s5
xvstx xr1, t0, t1
.endm
.macro FILT_H s1, s2, s3
xvsub.h \s1, \s1, \s2
xvsrai.h \s1, \s1, 2
xvsub.h \s1, \s1, \s2
xvadd.h \s1, \s1, \s3
xvsrai.h \s1, \s1, 2
xvadd.h \s1, \s1, \s3
.endm
.macro FILT_C s1, s2, s3
xmov xr3, \s1
xvpermi.q xr3, \s2, 0x03
xvshuf.b xr1, \s2, xr3, xr23
xvshuf.b xr2, \s2, xr3, xr24
xmov \s1, \s2
xvpermi.q \s1, \s3, 0x03
xvshuf.b xr3, \s1, \s2, xr29
xvshuf.b xr4, \s1, \s2, xr27
xvadd.h xr3, xr2, xr3
xmov xr2, \s1
xmov \s1, \s3
xvshuf.b \s3, xr2, \s2, xr30
xvadd.h xr4, xr4, \s2
xvadd.h \s3, \s3, xr1
FILT_H \s3, xr3, xr4
.endm
.macro DO_FILT_C_LASX s1, s2, s3, s4
FILT_C \s1, \s2, \s3
FILT_C \s2, \s1, \s4
FILT_PACK_LASX \s3, \s4, xr15
xvpermi.d \s3, \s3, 0xd8
xvstx \s3, a5, a4
.endm
.macro DO_FILT_H_LASX s1, s2, s3
xmov xr3, \s1
xvpermi.q xr3, \s2, 0x03
xvshuf.b xr1, \s2, xr3, xr24
xvshuf.b xr2, \s2, xr3, xr25
xmov xr3, \s2
xvpermi.q xr3, \s3, 0x03
xvshuf.b xr4, xr3, \s2, xr26
xvshuf.b xr5, xr3, \s2, xr27
xvshuf.b xr6, xr3, \s2, xr28
xmov \s1, \s2
xvdp2.h.bu.b xr16, xr1, xr12
xvdp2.h.bu.b xr17, xr2, xr12
xvdp2.h.bu.b xr18, \s2, xr14
xvdp2.h.bu.b xr19, xr4, xr14
xvdp2.h.bu.b xr20, xr5, xr0
xvdp2.h.bu.b xr21, xr6, xr0
xvadd.h xr1, xr16, xr18
xvadd.h xr2, xr17, xr19
xvadd.h xr1, xr1, xr20
xvadd.h xr2, xr2, xr21
FILT_PACK_LASX xr1, xr2, xr15
xvshuf.b xr1, xr1, xr1, xr22
xvstx xr1, a0, a4
xmov \s2, \s3
.endm
/*
* void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
* uint8_t *src, intptr_t stride, int width, int height )
*/
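/*
 * Not part of the build: a rough C sketch of the three half-pel planes this
 * routine produces, assuming 8-bit pixels and the H.264 6-tap filter
 * (1, -5, 20, 20, -5, 1); clip() clamps to [0, 255]. The center plane applies
 * the 6-tap filter horizontally to the un-rounded vertical intermediates and
 * rounds by >> 10 overall (hence the pw_1024 halving trick in the loop below).
 *
 *   #define TAP6( p, d ) ( (p)[-2*(d)] - 5*(p)[-(d)] + 20*(p)[0] \
 *                        + 20*(p)[(d)] - 5*(p)[2*(d)] + (p)[3*(d)] )
 *   for( int x = 0; x < width; x++ )
 *   {
 *       int v   = TAP6( &src[x], stride );               // vertical intermediate
 *       dsth[x] = clip( (TAP6( &src[x], 1 ) + 16) >> 5 );
 *       dstv[x] = clip( (v + 16) >> 5 );
 *       // dstc[x] = clip( (TAP6 over the row of v intermediates + 512) >> 10 )
 *   }
 */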
function_x264 hpel_filter_lasx
addi.d sp, sp, -56
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
move a7, a3
addi.d a5, a5, -32
move t0, a1
andi a7, a7, 31
sub.d a3, a3, a7
add.d a0, a0, a5
add.d t0, t0, a5
add.d a7, a7, a5
add.d a5, a5, a2
move a2, a4
sub.d a7, zero, a7
add.d a1, a3, a2
sub.d a3, a3, a2
sub.d a3, a3, a2
move a4, a7
la.local t1, filt_mul51
xvld xr0, t1, 0
la.local t2, filt_mul15
xvld xr12, t2, 0
la.local t3, filt_mul20
xvld xr14, t3, 0
la.local t4, pw_1024
xvld xr15, t4, 0
la.local t1, hpel_shuf
xvld xr22, t1, 0
la.local t2, shuf_12
xvld xr23, t2, 0
la.local t3, shuf_1
xvld xr26, t3, 0
xvaddi.bu xr24, xr23, 2 /* shuf_14 */
xvaddi.bu xr25, xr23, 3 /* shuf_15 */
xvaddi.bu xr27, xr26, 1 /* shuf_2 */
xvaddi.bu xr28, xr26, 2 /* shuf_3 */
xvaddi.bu xr29, xr26, 3 /* shuf_4 */
xvaddi.bu xr30, xr26, 5 /* shuf_6 */
xvxor.v xr9, xr9, xr9
xvxor.v xr10, xr10, xr10
.LOOPY:
DO_FILT_V_LASX xr8, xr7, xr13, xr12, 0
.LOOPX:
DO_FILT_V_LASX xr6, xr5, xr11, xr12, 32
.LASTX:
xvsrli.h xr15, xr15, 1
DO_FILT_C_LASX xr9, xr8, xr7, xr6
xvadd.h xr15, xr15, xr15
xmov xr7, xr5
DO_FILT_H_LASX xr10, xr13, xr11
addi.d a4, a4, 32
blt a4, zero, .LOOPX
addi.d t1, a4, -32
blt t1, zero, .LASTX
// set up registers for the next row (y)
sub.d a4, a4, a7
sub.d a4, a4, a2
sub.d a1, a1, a4
sub.d a3, a3, a4
add.d a0, a0, a2
add.d t0, t0, a2
add.d a5, a5, a2
move a4, a7
addi.d a6, a6, -1
blt zero, a6, .LOOPY
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
addi.d sp, sp, 56
endfunc_x264
/*
* void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
* pixel *src2, intptr_t src2_stride, int weight);
*/
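/*
 * Not part of the build: what the dispatch below computes per pixel, assuming
 * the usual x264 bipred weighting with the second weight equal to 64 - weight;
 * clip() clamps to [0, 255].
 *
 *   if( weight == 32 )   // plain rounding average
 *       dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
 *   else                 // weighted average
 *       dst[x] = clip( ( src1[x]*weight + src2[x]*(64 - weight) + 32 ) >> 6 );
 */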
.macro PIXEL_AVG w, h
function_x264 pixel_avg_\w\()x\h\()_lasx
addi.d t0, a6, -32
addi.d a7, zero, \h
bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lasx
b x264_8_pixel_avg_w\w\()_lasx
endfunc_x264
.endm
PIXEL_AVG 16, 8
PIXEL_AVG 8, 16
PIXEL_AVG 8, 8
PIXEL_AVG 8, 4
PIXEL_AVG 4, 16
PIXEL_AVG 4, 8
PIXEL_AVG 4, 4
PIXEL_AVG 4, 2
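/*
 * Not part of the build: the per-pixel weighting applied by the mc_weight_*
 * routines below. The operand offsets (denom at a4+32, scale at a4+36,
 * offset at a4+40) follow the register comments in the code; clip() clamps
 * to [0, 255]. A rough sketch:
 *
 *   // *_noden variants (denom == 0):
 *   dst[x] = clip( src[x]*scale + offset );
 *   // full variants:
 *   dst[x] = clip( ( src[x]*scale + (offset << denom) + (1 << (denom - 1)) ) >> denom );
 */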
function_x264 mc_weight_w20_noden_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.b xr0, a4, 36 // scale
.LOOP_WEIGHTW20_NODEN:
xvld xr3, a2, 0
xvldx xr4, a2, a3
xvmulwev.h.bu.b xr7, xr3, xr0
xvmulwev.h.bu.b xr8, xr4, xr0
xvmulwod.h.bu.b xr3, xr3, xr0
xvmulwod.h.bu.b xr4, xr4, xr0
xvadd.h xr7, xr7, xr1
xvadd.h xr8, xr8, xr1
xvadd.h xr3, xr3, xr1
xvadd.h xr4, xr4, xr1
xvssrarni.bu.h xr8, xr7, 0
xvssrarni.bu.h xr4, xr3, 0
xvilvl.b xr3, xr4, xr8
xvilvh.b xr4, xr4, xr8
vst vr3, a0, 0
xvstelm.w xr3, a0, 16, 4
add.d a0, a0, a1
vst vr4, a0, 0
xvstelm.w xr4, a0, 16, 4
alsl.d a2, a3, a2, 1
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW20_NODEN
endfunc_x264
function_x264 mc_weight_w16_noden_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.h xr0, a4, 36 // scale
.LOOP_WEIGHTW16_NODEN:
vld vr3, a2, 0
vldx vr4, a2, a3
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr4, xr4
xvmul.h xr3, xr3, xr0
xvmul.h xr4, xr4, xr0
xvadd.h xr3, xr3, xr1
xvadd.h xr4, xr4, xr1
xvssrarni.bu.h xr4, xr3, 0
xvpermi.d xr3, xr4, 8
xvpermi.d xr4, xr4, 13
vst vr3, a0, 0
vstx vr4, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a0, a1, a0, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW16_NODEN
endfunc_x264
function_x264 mc_weight_w8_noden_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.h xr0, a4, 36 // scale
.LOOP_WEIGHTW8_NODEN:
fld.d f3, a2, 0
fldx.d f4, a2, a3
vilvl.d vr3, vr4, vr3
vext2xv.hu.bu xr3, xr3
xvmul.h xr3, xr3, xr0
xvadd.h xr3, xr3, xr1
xvssrarni.bu.h xr3, xr3, 0
xvstelm.d xr3, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr3, a0, 0, 2
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW8_NODEN
endfunc_x264
function_x264 mc_weight_w4_noden_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.h xr0, a4, 36 // scale
.LOOP_WEIGHTW4_NODEN:
fld.s f3, a2, 0
fldx.s f4, a2, a3
vilvl.w vr3, vr4, vr3
vext2xv.hu.bu xr3, xr3
xvmul.h xr3, xr3, xr0
xvadd.h xr3, xr3, xr1
xvssrarni.bu.h xr3, xr3, 0
xvstelm.w xr3, a0, 0, 0
add.d a0, a0, a1
xvstelm.w xr3, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW4_NODEN
endfunc_x264
function_x264 mc_weight_w20_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.b xr0, a4, 36 // scale
xvldrepl.h xr2, a4, 32 // denom
xvsll.h xr1, xr1, xr2
.LOOP_WEIGHTW20:
xvld xr3, a2, 0
xvldx xr4, a2, a3
xvmulwev.h.bu.b xr7, xr3, xr0
xvmulwev.h.bu.b xr8, xr4, xr0
xvmulwod.h.bu.b xr3, xr3, xr0
xvmulwod.h.bu.b xr4, xr4, xr0
xvsadd.h xr7, xr7, xr1
xvsadd.h xr8, xr8, xr1
xvsadd.h xr3, xr3, xr1
xvsadd.h xr4, xr4, xr1
xvssrarn.bu.h xr7, xr7, xr2
xvssrarn.bu.h xr8, xr8, xr2
xvssrarn.bu.h xr3, xr3, xr2
xvssrarn.bu.h xr4, xr4, xr2
xvilvl.b xr3, xr3, xr7
xvilvl.b xr4, xr4, xr8
vst vr3, a0, 0
xvstelm.w xr3, a0, 16, 4
add.d a0, a0, a1
vst vr4, a0, 0
xvstelm.w xr4, a0, 16, 4
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW20
endfunc_x264
function_x264 mc_weight_w16_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.h xr0, a4, 36 // scale
xvldrepl.h xr2, a4, 32 // denom
xvsll.h xr1, xr1, xr2
.LOOP_WEIGHTW16:
vld vr3, a2, 0
vldx vr4, a2, a3
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr4, xr4
xvmul.h xr3, xr3, xr0
xvmul.h xr4, xr4, xr0
xvsadd.h xr3, xr3, xr1
xvsadd.h xr4, xr4, xr1
xvssrarn.bu.h xr3, xr3, xr2
xvssrarn.bu.h xr4, xr4, xr2
xvpermi.d xr3, xr3, 8
xvpermi.d xr4, xr4, 8
vst vr3, a0, 0
vstx vr4, a0, a1
alsl.d a0, a1, a0, 1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW16
endfunc_x264
function_x264 mc_weight_w8_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.h xr0, a4, 36 // scale
xvldrepl.h xr2, a4, 32 // denom
xvsll.h xr1, xr1, xr2
.LOOP_WEIGHTW8:
fld.d f3, a2, 0
fldx.d f4, a2, a3
vilvl.d vr3, vr4, vr3
vext2xv.hu.bu xr3, xr3
xvmul.h xr3, xr3, xr0
xvsadd.h xr3, xr3, xr1
xvssrarn.bu.h xr3, xr3, xr2
xvstelm.d xr3, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr3, a0, 0, 2
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW8
endfunc_x264
function_x264 mc_weight_w4_lasx
xvldrepl.h xr1, a4, 40 // offset
xvldrepl.h xr0, a4, 36 // scale
xvldrepl.h xr2, a4, 32 // denom
xvsll.h xr1, xr1, xr2
.LOOP_WEIGHTW4:
fld.s f3, a2, 0
fldx.s f4, a2, a3
vilvl.w vr3, vr4, vr3
vext2xv.hu.bu xr3, xr3
xvmul.h xr3, xr3, xr0
xvsadd.h xr3, xr3, xr1
xvssrarn.bu.h xr3, xr3, xr2
xvstelm.w xr3, a0, 0, 0
add.d a0, a0, a1
xvstelm.w xr3, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHTW4
endfunc_x264
/*
* void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
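/*
 * Not part of the build: pixel_avg2 is a plain rounding average of two
 * equally-strided sources, dst[x] = ( src1[x] + src2[x] + 1 ) >> 1, which is
 * exactly what vavgr.bu computes lane-wise below.
 */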
function_x264 pixel_avg2_w4_lasx
.avg2w4_loop_2:
addi.d a5, a5, -2
fld.s f0, a2, 0
fld.s f1, a4, 0
fldx.s f2, a2, a3
fldx.s f3, a4, a3
alsl.d a2, a3, a2, 1
alsl.d a4, a3, a4, 1
vavgr.bu vr0, vr0, vr1
vavgr.bu vr1, vr2, vr3
fst.s f0, a0, 0
fstx.s f1, a0, a1
alsl.d a0, a1, a0, 1
blt zero, a5, .avg2w4_loop_2
endfunc_x264
/*
* void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
function_x264 pixel_avg2_w8_lasx
.avg2w8_loop_2:
addi.d a5, a5, -2
fld.d f0, a2, 0
fld.d f1, a4, 0
fldx.d f2, a2, a3
fldx.d f3, a4, a3
alsl.d a2, a3, a2, 1
alsl.d a4, a3, a4, 1
vavgr.bu vr0, vr0, vr1
vavgr.bu vr1, vr2, vr3
fst.d f0, a0, 0
fstx.d f1, a0, a1
alsl.d a0, a1, a0, 1
blt zero, a5, .avg2w8_loop_2
endfunc_x264
/*
* void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
function_x264 pixel_avg2_w16_lasx
.avg2w16_loop_2:
addi.d a5, a5, -2
vld vr0, a2, 0
vldx vr1, a2, a3
vld vr2, a4, 0
vldx vr3, a4, a3
alsl.d a2, a3, a2, 1
alsl.d a4, a3, a4, 1
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vst vr0, a0, 0
vstx vr1, a0, a1
alsl.d a0, a1, a0, 1
blt zero, a5, .avg2w16_loop_2
endfunc_x264
/*
* void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
function_x264 pixel_avg2_w20_lasx
.avg2w20_loop_2:
addi.d a5, a5, -2
xvld xr0, a2, 0
xvldx xr1, a2, a3
xvld xr2, a4, 0
xvldx xr3, a4, a3
alsl.d a2, a3, a2, 1
alsl.d a4, a3, a4, 1
xvavgr.bu xr0, xr0, xr2
xvavgr.bu xr1, xr1, xr3
vst vr0, a0, 0
xvstelm.w xr0, a0, 16, 4
add.d a0, a0, a1
vst vr1, a0, 0
xvstelm.w xr1, a0, 16, 4
add.d a0, a0, a1
blt zero, a5, .avg2w20_loop_2
endfunc_x264
/*
 * void mc_copy_w16( uint8_t *p_dst, int32_t i_dst_stride,
* uint8_t *p_src, int32_t i_src_stride,
* int32_t i_height )
*/
function_x264 mc_copy_w16_lasx
slli.d t0, a3, 1
add.d t1, t0, a3
slli.d t2, a1, 1
add.d t3, t2, a1
.LOOP_COPYW16:
vld vr1, a2, 0
vldx vr2, a2, a3
vldx vr3, a2, t0
vldx vr4, a2, t1
vst vr1, a0, 0
vstx vr2, a0, a1
vstx vr3, a0, t2
vstx vr4, a0, t3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
addi.w a4, a4, -4
blt zero, a4, .LOOP_COPYW16
endfunc_x264
/*
* void mc_copy_w8( uint8_t *p_dst, intptr_t i_dst_stride,
* uint8_t *p_src, intptr_t i_src_stride,
* int32_t i_height )
*/
function_x264 mc_copy_w8_lasx
slli.d t0, a3, 1
add.d t1, t0, a3
slli.d t2, a1, 1
add.d t3, t2, a1
.LOOP_COPYW8:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fldx.d f2, a2, t0
fldx.d f3, a2, t1
fst.d f0, a0, 0
fstx.d f1, a0, a1
fstx.d f2, a0, t2
fstx.d f3, a0, t3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
addi.w a4, a4, -4
blt zero, a4, .LOOP_COPYW8
endfunc_x264
/*
* void mc_copy_w4( uint8_t *p_dst, intptr_t i_dst_stride,
* uint8_t *p_src, intptr_t i_src_stride,
* int32_t i_height )
*/
function_x264 mc_copy_w4_lasx
slli.d t0, a3, 1
add.d t1, t0, a3
slli.d t2, a1, 1
add.d t3, t2, a1
.LOOP_COPYW4:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fldx.s f2, a2, t0
fldx.s f3, a2, t1
fst.s f0, a0, 0
fstx.s f1, a0, a1
fstx.s f2, a0, t2
fstx.s f3, a0, t3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
addi.w a4, a4, -4
blt zero, a4, .LOOP_COPYW4
endfunc_x264
/*
* void memzero_aligned( void *p_dst, size_t n )
*/
function_x264 memzero_aligned_lasx
xvxor.v xr1, xr1, xr1
.memzero_loop:
addi.d a1, a1, -128
.rept 4
xvst xr1, a0, 0
addi.d a0, a0, 32
.endr
blt zero, a1, .memzero_loop
endfunc_x264
/*
* void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth,
* pixel *dstv, pixel *dstc, intptr_t src_stride,
* intptr_t dst_stride, int width, int height )
*/
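/*
 * Not part of the build: a C sketch of the 2x2 downsampling pattern the
 * vavgr.bu / vhaddw.hu.bu / vssrarni.bu.h sequences below implement,
 * assuming 8-bit pixels:
 *
 *   #define FILTER( a, b, c, d ) ( ((((a)+(b)+1)>>1) + (((c)+(d)+1)>>1) + 1) >> 1 )
 *   pixel *src1 = src0 + src_stride, *src2 = src1 + src_stride;
 *   for( int x = 0; x < width; x++ )
 *   {
 *       dst0[x] = FILTER( src0[2*x],   src1[2*x],   src0[2*x+1], src1[2*x+1] );
 *       dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
 *       dstv[x] = FILTER( src1[2*x],   src2[2*x],   src1[2*x+1], src2[2*x+1] );
 *       dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
 *   }
 */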
function_x264 frame_init_lowres_core_lasx
andi t1, a7, 15
sub.w t0, a7, t1
slli.d t2, a5, 1
ldptr.w a7, sp, 0 // use a7 as height variable
.height_loop:
add.d t4, zero, t0
addi.d t3, a0, 0
addi.d t5, a1, 0
addi.d t6, a2, 0
addi.d t7, a3, 0
addi.d t8, a4, 0
.width16_loop:
xvld xr0, t3, 0
xvldx xr1, t3, a5
xvldx xr2, t3, t2
xvavgr.bu xr3, xr0, xr1
xvavgr.bu xr4, xr1, xr2
xvhaddw.hu.bu xr5, xr3, xr3
xvhaddw.hu.bu xr6, xr4, xr4
xvssrarni.bu.h xr6, xr5, 1
xvpermi.d xr7, xr6, 0xd8
vst vr7, t5, 0
xvpermi.q xr7, xr7, 0x11
vst vr7, t7, 0
addi.d t3, t3, 1
xvld xr0, t3, 0
xvldx xr1, t3, a5
xvldx xr2, t3, t2
xvavgr.bu xr3, xr0, xr1
xvavgr.bu xr4, xr1, xr2
xvhaddw.hu.bu xr5, xr3, xr3
xvhaddw.hu.bu xr6, xr4, xr4
xvssrarni.bu.h xr6, xr5, 1
xvpermi.d xr7, xr6, 0xd8
vst vr7, t6, 0
xvpermi.q xr7, xr7, 0x11
vst vr7, t8, 0
addi.d t3, t3, 31
addi.d t5, t5, 16
addi.d t6, t6, 16
addi.d t7, t7, 16
addi.d t8, t8, 16
addi.w t4, t4, -16
blt zero, t4, .width16_loop
beqz t1, .width16_end
vld vr0, t3, 0
vldx vr1, t3, a5
vldx vr2, t3, t2
vavgr.bu vr3, vr0, vr1
vavgr.bu vr4, vr1, vr2
vhaddw.hu.bu vr5, vr3, vr3
vhaddw.hu.bu vr6, vr4, vr4
vssrarni.bu.h vr6, vr5, 1
fst.d f6, t5, 0
vstelm.d vr6, t7, 0, 1
addi.d t3, t3, 1
vld vr0, t3, 0
vldx vr1, t3, a5
vldx vr2, t3, t2
vavgr.bu vr3, vr0, vr1
vavgr.bu vr4, vr1, vr2
vhaddw.hu.bu vr5, vr3, vr3
vhaddw.hu.bu vr6, vr4, vr4
vssrarni.bu.h vr6, vr5, 1
fst.d f6, t6, 0
vstelm.d vr6, t8, 0, 1
.width16_end:
add.d a0, a0, t2
add.d a1, a1, a6
add.d a2, a2, a6
add.d a3, a3, a6
add.d a4, a4, a6
addi.w a7, a7, -1
blt zero, a7, .height_loop
endfunc_x264
/*
* void mc_chroma(uint8_t *p_dst_u, uint8_t *p_dst_v,
* intptr_t i_dst_stride,
* uint8_t *p_src, intptr_t i_src_stride,
* int32_t m_vx, int32_t m_vy,
* int32_t i_width, int32_t i_height)
*/
function_x264 mc_chroma_lsx
MC_CHROMA_START
andi a5, a5, 0x07 /* m_vx & 0x07 */
andi a6, a6, 0x07 /* m_vy & 0x07 */
li.d t8, 8
sub.d t1, t8, a5 // 8-d8x
sub.d t2, t8, a6 // 8-d8y
mul.d t3, t1, t2 // CA
mul.d t4, a5, t2 // CB
mul.d t5, t1, a6 // CC
mul.d t6, a5, a6 // CD
vreplgr2vr.b vr0, t3
vreplgr2vr.b vr1, t4
vreplgr2vr.b vr2, t5
vreplgr2vr.b vr3, t6
add.d t0, a3, a4
ldptr.w t1, sp, 0 /* i_height */
move t3, t0
addi.d t4, zero, 1
addi.d t5, zero, 3
addi.d t6, zero, 7
bge t6, a7, .ENDLOOP_W8
.LOOP_W8:
vld vr4, a3, 0
vld vr5, t0, 0
vld vr6, a3, 2
vld vr7, t0, 2
vmulwev.h.bu vr8, vr4, vr0
vmulwod.h.bu vr9, vr4, vr0
vmulwev.h.bu vr10, vr5, vr2
vmulwod.h.bu vr11, vr5, vr2
vmaddwev.h.bu vr8, vr6, vr1
vmaddwod.h.bu vr9, vr6, vr1
vmaddwev.h.bu vr10, vr7, vr3
vmaddwod.h.bu vr11, vr7, vr3
vadd.h vr12, vr8, vr10
vadd.h vr13, vr9, vr11
vssrarni.bu.h vr13, vr12, 6
vstelm.d vr13, a0, 0, 0
vstelm.d vr13, a1, 0, 1
add.d a0, a0, a2
add.d a1, a1, a2
addi.d t1, t1, -1
move a3, t3
add.d t3, t3, a4
move t0, t3
blt zero, t1, .LOOP_W8
b .ENDLOOP_W2
.ENDLOOP_W8:
bge t5, a7, .ENDLOOP_W4
.LOOP_W4:
vld vr4, a3, 0
vld vr5, t0, 0
vld vr6, a3, 2
vld vr7, t0, 2
vmulwev.h.bu vr8, vr4, vr0
vmulwod.h.bu vr9, vr4, vr0
vmulwev.h.bu vr10, vr5, vr2
vmulwod.h.bu vr11, vr5, vr2
vmaddwev.h.bu vr8, vr6, vr1
vmaddwod.h.bu vr9, vr6, vr1
vmaddwev.h.bu vr10, vr7, vr3
vmaddwod.h.bu vr11, vr7, vr3
vadd.h vr12, vr8, vr10
vadd.h vr13, vr9, vr11
vssrarni.bu.h vr13, vr12, 6
vstelm.w vr13, a0, 0, 0
vstelm.w vr13, a1, 0, 2
add.d a0, a0, a2
add.d a1, a1, a2
move a3, t3
add.d t3, t3, a4
move t0, t3
addi.d t1, t1, -1
blt zero, t1, .LOOP_W4
b .ENDLOOP_W2
.ENDLOOP_W4:
bge t4, a7, .ENDLOOP_W2
.LOOP_W2:
vld vr4, a3, 0
vld vr5, t0, 0
vld vr6, a3, 2
vld vr7, t0, 2
vmulwev.h.bu vr8, vr4, vr0
vmulwod.h.bu vr9, vr4, vr0
vmulwev.h.bu vr10, vr5, vr2
vmulwod.h.bu vr11, vr5, vr2
vmaddwev.h.bu vr8, vr6, vr1
vmaddwod.h.bu vr9, vr6, vr1
vmaddwev.h.bu vr10, vr7, vr3
vmaddwod.h.bu vr11, vr7, vr3
vadd.h vr12, vr8, vr10
vadd.h vr13, vr9, vr11
vssrarni.bu.h vr13, vr12, 6
vstelm.h vr13, a0, 0, 0
vstelm.h vr13, a1, 0, 4
add.d a0, a0, a2
add.d a1, a1, a2
move a3, t3
add.d t3, t3, a4
move t0, t3
addi.d t1, t1, -1
blt zero, t1, .LOOP_W2
.ENDLOOP_W2:
endfunc_x264
function_x264 pixel_avg_weight_w4_lsx
addi.d t0, zero, 64
sub.d t0, t0, a6
vreplgr2vr.b vr0, a6
vreplgr2vr.b vr1, t0
vpackev.b vr8, vr1, vr0
.LOOP_AVG_WEIGHT_W4:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fld.s f2, a4, 0
fldx.s f3, a4, a5
vilvl.w vr0, vr1, vr0
vilvl.w vr2, vr3, vr2
vilvl.b vr0, vr2, vr0
vmulwev.h.bu.b vr1, vr0, vr8
vmaddwod.h.bu.b vr1, vr0, vr8
vssrarni.bu.h vr1, vr1, 6
fst.s f1, a0, 0
add.d a0, a0, a1
vstelm.w vr1, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a4, a5, a4, 1
addi.w a7, a7, -2
bnez a7, .LOOP_AVG_WEIGHT_W4
endfunc_x264
function_x264 pixel_avg_w4_lsx
.LOOP_AVG_W4:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fld.s f4, a4, 0
fldx.s f5, a4, a5
vilvl.w vr0, vr1, vr0
vilvl.w vr4, vr5, vr4
vavgr.bu vr0, vr0, vr4
fst.s f0, a0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a4, a5, a4, 1
addi.w a7, a7, -2
bnez a7, .LOOP_AVG_W4
endfunc_x264
function_x264 pixel_avg_weight_w8_lsx
addi.d t0, zero, 64
sub.d t0, t0, a6
slli.d t5, a1, 1
add.d t6, a1, t5
add.d t7, a1, t6
vreplgr2vr.b vr0, a6
vreplgr2vr.b vr1, t0
vpackev.b vr8, vr1, vr0
PIXEL_AVG_START_W8
.LOOP_AVG_HEIGHT_W8:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fldx.d f2, a2, t0
fldx.d f3, a2, t1
fld.d f4, a4, 0
fldx.d f5, a4, a5
fldx.d f6, a4, t3
fldx.d f7, a4, t4
vilvl.b vr0, vr4, vr0
vilvl.b vr1, vr5, vr1
vilvl.b vr2, vr6, vr2
vilvl.b vr3, vr7, vr3
vmulwev.h.bu.b vr4, vr0, vr8
vmulwev.h.bu.b vr5, vr1, vr8
vmulwev.h.bu.b vr6, vr2, vr8
vmulwev.h.bu.b vr7, vr3, vr8
vmaddwod.h.bu.b vr4, vr0, vr8
vmaddwod.h.bu.b vr5, vr1, vr8
vmaddwod.h.bu.b vr6, vr2, vr8
vmaddwod.h.bu.b vr7, vr3, vr8
vssrarni.bu.h vr4, vr4, 6
vssrarni.bu.h vr5, vr5, 6
vssrarni.bu.h vr6, vr6, 6
vssrarni.bu.h vr7, vr7, 6
fst.d f4, a0, 0
fstx.d f5, a0, a1
fstx.d f6, a0, t5
fstx.d f7, a0, t6
add.d a0, a0, t7
alsl.d a2, a3, a2, 2
alsl.d a4, a5, a4, 2
addi.w a7, a7, -4
bnez a7, .LOOP_AVG_HEIGHT_W8
endfunc_x264
function_x264 pixel_avg_w8_lsx
PIXEL_AVG_START_W8
.LOOP_AVG_W8:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fldx.d f2, a2, t0
fldx.d f3, a2, t1
fld.d f4, a4, 0
fldx.d f5, a4, a5
fldx.d f6, a4, t3
fldx.d f7, a4, t4
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr6, vr7, vr6
vavgr.bu vr0, vr0, vr4
vavgr.bu vr2, vr2, vr6
fst.d f0, a0, 0
add.d a0, a0, a1
vstelm.d vr0, a0, 0, 1
fstx.d f2, a0, a1
alsl.d a0, a1, a0, 1
vstelm.d vr2, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 2
alsl.d a4, a5, a4, 2
addi.w a7, a7, -4
bnez a7, .LOOP_AVG_W8
endfunc_x264
function_x264 pixel_avg_weight_w16_lsx
addi.d t0, zero, 64
sub.d t0, t0, a6
vreplgr2vr.b vr8, a6
vreplgr2vr.b vr9, t0
PIXEL_AVG_START
.LOOP_AVG_HEIGHT_W16:
LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7
vmulwev.h.bu.b vr10, vr0, vr8
vmulwev.h.bu.b vr11, vr1, vr8
vmulwev.h.bu.b vr12, vr2, vr8
vmulwev.h.bu.b vr13, vr3, vr8
vmulwod.h.bu.b vr14, vr0, vr8
vmulwod.h.bu.b vr15, vr1, vr8
vmulwod.h.bu.b vr16, vr2, vr8
vmulwod.h.bu.b vr17, vr3, vr8
vmaddwev.h.bu.b vr10, vr4, vr9
vmaddwev.h.bu.b vr11, vr5, vr9
vmaddwev.h.bu.b vr12, vr6, vr9
vmaddwev.h.bu.b vr13, vr7, vr9
vmaddwod.h.bu.b vr14, vr4, vr9
vmaddwod.h.bu.b vr15, vr5, vr9
vmaddwod.h.bu.b vr16, vr6, vr9
vmaddwod.h.bu.b vr17, vr7, vr9
vssrarni.bu.h vr11, vr10, 6
vssrarni.bu.h vr13, vr12, 6
vssrarni.bu.h vr15, vr14, 6
vssrarni.bu.h vr17, vr16, 6
vilvl.b vr10, vr15, vr11
vilvh.b vr11, vr15, vr11
vilvl.b vr12, vr17, vr13
vilvh.b vr13, vr17, vr13
vst vr10, a0, 0
vstx vr11, a0, a1
vstx vr12, a0, t6
vstx vr13, a0, t7
add.d a2, a2, t2
add.d a4, a4, t5
add.d a0, a0, t8
addi.d a7, a7, -4
bnez a7, .LOOP_AVG_HEIGHT_W16
endfunc_x264
function_x264 pixel_avg_w16_lsx
PIXEL_AVG_START
.LOOP_AVG_W16:
vld vr0, a2, 0
vldx vr1, a2, a3
vldx vr2, a2, t0
vldx vr3, a2, t1
vld vr4, a4, 0
vldx vr5, a4, a5
vldx vr6, a4, t3
vldx vr7, a4, t4
vavgr.bu vr0, vr0, vr4
vavgr.bu vr1, vr1, vr5
vavgr.bu vr2, vr2, vr6
vavgr.bu vr3, vr3, vr7
vst vr0, a0, 0
vstx vr1, a0, a1
vstx vr2, a0, t6
vstx vr3, a0, t7
add.d a0, a0, t8
add.d a2, a2, t2
add.d a4, a4, t5
vld vr0, a2, 0
vldx vr1, a2, a3
vldx vr2, a2, t0
vldx vr3, a2, t1
vld vr4, a4, 0
vldx vr5, a4, a5
vldx vr6, a4, t3
vldx vr7, a4, t4
vavgr.bu vr0, vr0, vr4
vavgr.bu vr1, vr1, vr5
vavgr.bu vr2, vr2, vr6
vavgr.bu vr3, vr3, vr7
vst vr0, a0, 0
vstx vr1, a0, a1
vstx vr2, a0, t6
vstx vr3, a0, t7
add.d a2, a2, t2
add.d a4, a4, t5
add.d a0, a0, t8
addi.d a7, a7, -8
bnez a7, .LOOP_AVG_W16
endfunc_x264
/*
* void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
* pixel *src2, intptr_t src2_stride, int weight);
*/
.macro PIXEL_AVG_LSX w, h
function_x264 pixel_avg_\w\()x\h\()_lsx
addi.d t0, a6, -32
addi.d a7, zero, \h
bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lsx
b x264_8_pixel_avg_w\w\()_lsx
endfunc_x264
.endm
PIXEL_AVG_LSX 16, 16
PIXEL_AVG_LSX 16, 8
PIXEL_AVG_LSX 8, 16
PIXEL_AVG_LSX 8, 8
PIXEL_AVG_LSX 8, 4
PIXEL_AVG_LSX 4, 16
PIXEL_AVG_LSX 4, 8
PIXEL_AVG_LSX 4, 4
PIXEL_AVG_LSX 4, 2
function_x264 mc_weight_w20_noden_lsx
vldrepl.b vr0, a4, 36 // scale
vldrepl.h vr1, a4, 40 // offset
.LOOP_WEIGHT_W20_NODEN:
vld vr3, a2, 0
vld vr4, a2, 16
add.d a2, a2, a3
vld vr5, a2, 0
vld vr6, a2, 16
vilvl.w vr4, vr6, vr4
vmulwev.h.bu.b vr7, vr3, vr0
vmulwod.h.bu.b vr8, vr3, vr0
vmulwev.h.bu.b vr9, vr4, vr0
vmulwod.h.bu.b vr10, vr4, vr0
vmulwev.h.bu.b vr11, vr5, vr0
vmulwod.h.bu.b vr12, vr5, vr0
vadd.h vr7, vr7, vr1
vadd.h vr8, vr8, vr1
vadd.h vr9, vr9, vr1
vadd.h vr10, vr10, vr1
vadd.h vr11, vr11, vr1
vadd.h vr12, vr12, vr1
vssrani.bu.h vr11, vr7, 0
vssrani.bu.h vr12, vr8, 0
vssrani.bu.h vr9, vr9, 0
vssrani.bu.h vr10, vr10, 0
vilvl.b vr7, vr12, vr11
vilvl.b vr9, vr10, vr9
vilvh.b vr11, vr12, vr11
vst vr7, a0, 0
vstelm.w vr9, a0, 16, 0
add.d a0, a0, a1
vst vr11, a0, 0
vstelm.w vr9, a0, 16, 1
add.d a0, a0, a1
add.d a2, a2, a3
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W20_NODEN
endfunc_x264
function_x264 mc_weight_w16_noden_lsx
vldrepl.b vr0, a4, 36 // scale
vldrepl.h vr1, a4, 40 // offset
.LOOP_WEIGHT_W16_NODEN:
vld vr3, a2, 0
vldx vr4, a2, a3
vmulwev.h.bu.b vr5, vr3, vr0
vmulwod.h.bu.b vr6, vr3, vr0
vmulwev.h.bu.b vr7, vr4, vr0
vmulwod.h.bu.b vr8, vr4, vr0
vadd.h vr5, vr5, vr1
vadd.h vr6, vr6, vr1
vadd.h vr7, vr7, vr1
vadd.h vr8, vr8, vr1
vssrani.bu.h vr7, vr5, 0
vssrani.bu.h vr8, vr6, 0
vilvl.b vr5, vr8, vr7
vilvh.b vr7, vr8, vr7
vst vr5, a0, 0
vstx vr7, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a0, a1, a0, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W16_NODEN
endfunc_x264
function_x264 mc_weight_w8_noden_lsx
vldrepl.b vr0, a4, 36 // scale
vldrepl.h vr1, a4, 40 // offset
.LOOP_WEIGHT_W8_NODEN:
fld.d f3, a2, 0
fldx.d f4, a2, a3
vilvl.d vr3, vr4, vr3
vmulwev.h.bu.b vr5, vr3, vr0
vmulwod.h.bu.b vr6, vr3, vr0
vadd.h vr5, vr5, vr1
vadd.h vr6, vr6, vr1
vssrani.bu.h vr5, vr5, 0
vssrani.bu.h vr6, vr6, 0
vilvl.b vr7, vr6, vr5
vstelm.d vr7, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr7, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W8_NODEN
endfunc_x264
function_x264 mc_weight_w4_noden_lsx
vldrepl.h vr0, a4, 36 // scale
vldrepl.h vr1, a4, 40 // offset
.LOOP_WEIGHT_W4_NODEN:
fld.s f3, a2, 0
fldx.s f4, a2, a3
vilvl.w vr3, vr4, vr3
vsllwil.hu.bu vr3, vr3, 0
vmul.h vr3, vr3, vr0
vadd.h vr3, vr3, vr1
vssrani.bu.h vr3, vr3, 0
vstelm.w vr3, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr3, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W4_NODEN
endfunc_x264
function_x264 mc_weight_w20_lsx
vldrepl.h vr1, a4, 40 // offset
vldrepl.b vr0, a4, 36 // scale
vldrepl.h vr2, a4, 32 // denom
vsll.h vr1, vr1, vr2
.LOOP_WEIGHT_W20:
vld vr3, a2, 0
vld vr4, a2, 16
add.d a2, a2, a3
vld vr5, a2, 0
vld vr6, a2, 16
vilvl.w vr4, vr6, vr4
vmulwev.h.bu.b vr7, vr3, vr0
vmulwod.h.bu.b vr8, vr3, vr0
vmulwev.h.bu.b vr9, vr4, vr0
vmulwod.h.bu.b vr10, vr4, vr0
vmulwev.h.bu.b vr11, vr5, vr0
vmulwod.h.bu.b vr12, vr5, vr0
vsadd.h vr7, vr7, vr1
vsadd.h vr8, vr8, vr1
vsadd.h vr9, vr9, vr1
vsadd.h vr10, vr10, vr1
vsadd.h vr11, vr11, vr1
vsadd.h vr12, vr12, vr1
vssrarn.bu.h vr7, vr7, vr2
vssrarn.bu.h vr8, vr8, vr2
vssrarn.bu.h vr9, vr9, vr2
vssrarn.bu.h vr10, vr10, vr2
vssrarn.bu.h vr11, vr11, vr2
vssrarn.bu.h vr12, vr12, vr2
vilvl.b vr7, vr8, vr7
vilvl.b vr9, vr10, vr9
vilvl.b vr11, vr12, vr11
vst vr7, a0, 0
vstelm.w vr9, a0, 16, 0
add.d a0, a0, a1
vst vr11, a0, 0
vstelm.w vr9, a0, 16, 1
add.d a0, a0, a1
add.d a2, a2, a3
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W20
endfunc_x264
function_x264 mc_weight_w16_lsx
vldrepl.h vr1, a4, 40 // offset
vldrepl.b vr0, a4, 36 // scale
vldrepl.h vr2, a4, 32 // denom
vsll.h vr1, vr1, vr2
.LOOP_WEIGHT_W16:
vld vr3, a2, 0
vldx vr4, a2, a3
vmulwev.h.bu.b vr5, vr3, vr0
vmulwod.h.bu.b vr6, vr3, vr0
vmulwev.h.bu.b vr7, vr4, vr0
vmulwod.h.bu.b vr8, vr4, vr0
vsadd.h vr5, vr5, vr1
vsadd.h vr6, vr6, vr1
vsadd.h vr7, vr7, vr1
vsadd.h vr8, vr8, vr1
vssrarn.bu.h vr5, vr5, vr2
vssrarn.bu.h vr6, vr6, vr2
vssrarn.bu.h vr7, vr7, vr2
vssrarn.bu.h vr8, vr8, vr2
vilvl.b vr5, vr6, vr5
vilvl.b vr7, vr8, vr7
vst vr5, a0, 0
vstx vr7, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a0, a1, a0, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W16
endfunc_x264
function_x264 mc_weight_w8_lsx
vldrepl.h vr1, a4, 40 // offset
vldrepl.b vr0, a4, 36 // scale
vldrepl.h vr2, a4, 32 // denom
vsll.h vr1, vr1, vr2
.LOOP_WEIGHT_W8:
fld.d f3, a2, 0
fldx.d f4, a2, a3
vilvl.d vr3, vr4, vr3
vmulwev.h.bu.b vr5, vr3, vr0
vmulwod.h.bu.b vr6, vr3, vr0
vsadd.h vr5, vr5, vr1
vsadd.h vr6, vr6, vr1
vssrarn.bu.h vr5, vr5, vr2
vssrarn.bu.h vr6, vr6, vr2
vilvl.b vr7, vr6, vr5
vstelm.d vr7, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr7, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W8
endfunc_x264
function_x264 mc_weight_w4_lsx
vldrepl.h vr1, a4, 40 // offset
vldrepl.h vr0, a4, 36 // scale
vldrepl.h vr2, a4, 32 // denom
vsll.h vr1, vr1, vr2
.LOOP_WEIGHT_W4:
fld.s f3, a2, 0
fldx.s f4, a2, a3
vilvl.w vr3, vr4, vr3
vsllwil.hu.bu vr3, vr3, 0
vmul.h vr3, vr3, vr0
vsadd.h vr3, vr3, vr1
vssrarn.bu.h vr3, vr3, vr2
vstelm.w vr3, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr3, a0, 0, 1
add.d a0, a0, a1
alsl.d a2, a3, a2, 1
addi.w a5, a5, -2
blt zero, a5, .LOOP_WEIGHT_W4
endfunc_x264
/*
* void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
function_x264 pixel_avg2_w4_lsx
.LOOP_AVG2_W4:
addi.d a5, a5, -2
fld.s f0, a2, 0
fld.s f1, a4, 0
fldx.s f2, a2, a3
fldx.s f3, a4, a3
alsl.d a2, a3, a2, 1
alsl.d a4, a3, a4, 1
vavgr.bu vr0, vr0, vr1
vavgr.bu vr1, vr2, vr3
fst.s f0, a0, 0
fstx.s f1, a0, a1
alsl.d a0, a1, a0, 1
blt zero, a5, .LOOP_AVG2_W4
endfunc_x264
/*
* void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
function_x264 pixel_avg2_w8_lsx
.LOOP_AVG2_W8:
addi.d a5, a5, -2
fld.d f0, a2, 0
fld.d f1, a4, 0
fldx.d f2, a2, a3
fldx.d f3, a4, a3
alsl.d a2, a3, a2, 1
alsl.d a4, a3, a4, 1
vavgr.bu vr0, vr0, vr1
vavgr.bu vr1, vr2, vr3
fst.d f0, a0, 0
fstx.d f1, a0, a1
alsl.d a0, a1, a0, 1
blt zero, a5, .LOOP_AVG2_W8
endfunc_x264
/*
* void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
function_x264 pixel_avg2_w16_lsx
.LOOP_AVG2_W16:
addi.d a5, a5, -2
vld vr0, a2, 0
vldx vr1, a2, a3
vld vr2, a4, 0
vldx vr3, a4, a3
alsl.d a2, a3, a2, 1
alsl.d a4, a3, a4, 1
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vst vr0, a0, 0
vstx vr1, a0, a1
alsl.d a0, a1, a0, 1
blt zero, a5, .LOOP_AVG2_W16
endfunc_x264
/*
* void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1,
* intptr_t i_src_stride, uint8_t *src2, int i_height)
*/
function_x264 pixel_avg2_w20_lsx
.LOOP_AVG2_W20:
addi.d a5, a5, -2
vld vr0, a2, 0
vld vr1, a2, 16
vld vr2, a4, 0
vld vr3, a4, 16
add.d a2, a2, a3
add.d a4, a4, a3
vld vr4, a2, 0
vld vr5, a2, 16
vld vr6, a4, 0
vld vr7, a4, 16
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr4, vr4, vr6
vavgr.bu vr5, vr5, vr7
vst vr0, a0, 0
vstelm.w vr1, a0, 16, 0
add.d a0, a0, a1
vst vr4, a0, 0
vstelm.w vr5, a0, 16, 0
add.d a2, a2, a3
add.d a4, a4, a3
add.d a0, a0, a1
blt zero, a5, .LOOP_AVG2_W20
endfunc_x264
/*
 * void mc_copy_w16( uint8_t *p_dst, int32_t i_dst_stride,
* uint8_t *p_src, int32_t i_src_stride,
* int32_t i_height )
*/
function_x264 mc_copy_w16_lsx
slli.d t0, a3, 1
add.d t1, t0, a3
slli.d t2, a1, 1
add.d t3, t2, a1
.LOOP_COPY_W16:
vld vr1, a2, 0
vldx vr2, a2, a3
vldx vr3, a2, t0
vldx vr4, a2, t1
vst vr1, a0, 0
vstx vr2, a0, a1
vstx vr3, a0, t2
vstx vr4, a0, t3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
addi.w a4, a4, -4
blt zero, a4, .LOOP_COPY_W16
endfunc_x264
/*
* void mc_copy_w8(uint8_t *p_dst, intptr_t i_dst_stride,
* uint8_t *p_src, intptr_t i_src_stride,
* int32_t i_height)
*/
function_x264 mc_copy_w8_lsx
slli.d t0, a3, 1
add.d t1, t0, a3
slli.d t2, a1, 1
add.d t3, t2, a1
.LOOP_COPY_W8:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fldx.d f2, a2, t0
fldx.d f3, a2, t1
fst.d f0, a0, 0
fstx.d f1, a0, a1
fstx.d f2, a0, t2
fstx.d f3, a0, t3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
addi.w a4, a4, -4
blt zero, a4, .LOOP_COPY_W8
endfunc_x264
/*
* void mc_copy_w4(uint8_t *p_dst, intptr_t i_dst_stride,
* uint8_t *p_src, intptr_t i_src_stride,
* int32_t i_height)
*/
function_x264 mc_copy_w4_lsx
slli.d t0, a3, 1
add.d t1, t0, a3
slli.d t2, a1, 1
add.d t3, t2, a1
.LOOP_COPY_W4:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fldx.s f2, a2, t0
fldx.s f3, a2, t1
fst.s f0, a0, 0
fstx.s f1, a0, a1
fstx.s f2, a0, t2
fstx.s f3, a0, t3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
addi.w a4, a4, -4
blt zero, a4, .LOOP_COPY_W4
endfunc_x264
/*
* void store_interleave_chroma(uint8_t *p_dst, intptr_t i_dst_stride,
* uint8_t *p_src0, uint8_t *p_src1,
* int32_t i_height)
*/
function_x264 store_interleave_chroma_lsx
.loop_interleave_chroma:
fld.d f0, a2, 0
fld.d f1, a3, 0
addi.d a2, a2, FDEC_STRIDE
addi.d a3, a3, FDEC_STRIDE
vilvl.b vr0, vr1, vr0
vst vr0, a0, 0
add.d a0, a0, a1
addi.w a4, a4, -1
blt zero, a4, .loop_interleave_chroma
endfunc_x264
/*
* void load_deinterleave_chroma_fenc(pixel *dst, pixel *src,
* intptr_t i_src, int height)
*/
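/*
 * Not part of the build: a sketch of the deinterleave below. Each packed UV
 * source row is split into a U row at dst and a V row at dst + FENC_STRIDE/2,
 * with dst advancing by FENC_STRIDE per source row:
 *
 *   for( int y = 0; y < height; y++, dst += FENC_STRIDE, src += i_src )
 *       for( int x = 0; x < FENC_STRIDE/2; x++ )
 *       {
 *           dst[x]                 = src[2*x];      // U
 *           dst[x + FENC_STRIDE/2] = src[2*x+1];    // V
 *       }
 */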
function_x264 load_deinterleave_chroma_fenc_lsx
addi.d t0, a0, FENC_STRIDE/2
andi t1, a3, 1
sub.w t2, a3, t1
.loop_deinterleave_fenc:
vld vr0, a1, 0
vldx vr1, a1, a2
vpickev.b vr2, vr1, vr0
vpickod.b vr3, vr1, vr0
fst.d f2, a0, 0
fst.d f3, t0, 0
vstelm.d vr2, a0, FENC_STRIDE, 1
vstelm.d vr3, t0, FENC_STRIDE, 1
addi.d a0, a0, FENC_STRIDE * 2
addi.d t0, t0, FENC_STRIDE * 2
alsl.d a1, a2, a1, 1
addi.w t2, t2, -2
blt zero, t2, .loop_deinterleave_fenc
beqz t1, .loop_deinterleave_fenc_end
vld vr0, a1, 0
vpickev.b vr1, vr0, vr0
vpickod.b vr2, vr0, vr0
fst.d f1, a0, 0
fst.d f2, t0, 0
.loop_deinterleave_fenc_end:
endfunc_x264
/*
* void load_deinterleave_chroma_fdec(pixel *dst, pixel *src,
* intptr_t i_src, int height)
*/
function_x264 load_deinterleave_chroma_fdec_lsx
addi.d t0, a0, FDEC_STRIDE/2
andi t1, a3, 1
sub.w t2, a3, t1
.loop_deinterleave_fdec:
vld vr0, a1, 0
vldx vr1, a1, a2
vpickev.b vr2, vr1, vr0
vpickod.b vr3, vr1, vr0
fst.d f2, a0, 0
fst.d f3, t0, 0
vstelm.d vr2, a0, FDEC_STRIDE, 1
vstelm.d vr3, t0, FDEC_STRIDE, 1
addi.d a0, a0, FDEC_STRIDE * 2
addi.d t0, t0, FDEC_STRIDE * 2
alsl.d a1, a2, a1, 1
addi.w t2, t2, -2
blt zero, t2, .loop_deinterleave_fdec
beqz t1, .loop_deinterleave_fdec_end
vld vr0, a1, 0
vpickev.b vr1, vr0, vr0
vpickod.b vr2, vr0, vr0
fst.d f1, a0, 0
fst.d f2, t0, 0
.loop_deinterleave_fdec_end:
endfunc_x264
/*
 * void x264_plane_copy_interleave(pixel *dst, intptr_t i_dst,
* pixel *srcu, intptr_t i_srcu,
* pixel *srcv, intptr_t i_srcv, int w, int h)
*/
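/*
 * Not part of the build: a sketch of the interleave below, assuming w is a
 * multiple of 16 (the core loop works in 16-pixel chunks; the _core suffix
 * suggests the caller handles any tail):
 *
 *   for( int y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
 *       for( int x = 0; x < w; x++ )
 *       {
 *           dst[2*x]   = srcu[x];
 *           dst[2*x+1] = srcv[x];
 *       }
 */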
function_x264 plane_copy_interleave_core_lsx
.loop_h:
add.d t0, a0, zero
add.d t2, a2, zero
add.d t4, a4, zero
add.d t6, a6, zero
.loop_copy_interleavew16:
vld vr0, t2, 0
vld vr1, t4, 0
vilvl.b vr2, vr1, vr0
vilvh.b vr3, vr1, vr0
vst vr2, t0, 0
vst vr3, t0, 16
addi.d t2, t2, 16
addi.d t4, t4, 16
addi.d t0, t0, 32
addi.w t6, t6, -16
blt zero, t6, .loop_copy_interleavew16
add.d a2, a2, a3
add.d a4, a4, a5
add.d a0, a0, a1
addi.w a7, a7, -1
blt zero, a7, .loop_h
endfunc_x264
/*
* void x264_plane_copy_deinterleave(pixel *dsta, intptr_t i_dsta,
* pixel *dstb, intptr_t i_dstb,
* pixel *src, intptr_t i_src, int w, int h)
*/
function_x264 plane_copy_deinterleave_lsx
.LOOP_PLANE_COPY_H:
add.d t0, a0, zero
add.d t2, a2, zero
add.d t4, a4, zero
add.d t6, a6, zero
.LOOP_PLANE_COPY_W16:
vld vr0, t4, 0
vld vr1, t4, 16
vpickev.b vr2, vr1, vr0
vpickod.b vr3, vr1, vr0
vst vr2, t0, 0
vst vr3, t2, 0
addi.d t4, t4, 32
addi.d t0, t0, 16
addi.d t2, t2, 16
addi.w t6, t6, -16
blt zero, t6, .LOOP_PLANE_COPY_W16
add.d a2, a2, a3
add.d a4, a4, a5
add.d a0, a0, a1
addi.w a7, a7, -1
blt zero, a7, .LOOP_PLANE_COPY_H
endfunc_x264
function_x264 plane_copy_deinterleave_lasx
.LOOP_PLANE_COPY_H_LASX:
add.d t0, a0, zero
add.d t2, a2, zero
add.d t4, a4, zero
add.d t6, a6, zero
.LOOP_PLANE_COPY_W32_LASX:
xvld xr0, t4, 0
xvld xr1, t4, 32
xvpickev.b xr2, xr1, xr0
xvpickod.b xr3, xr1, xr0
xvpermi.d xr2, xr2, 0xd8
xvpermi.d xr3, xr3, 0xd8
xvst xr2, t0, 0
xvst xr3, t2, 0
addi.d t4, t4, 64
addi.d t0, t0, 32
addi.d t2, t2, 32
addi.w t6, t6, -32
blt zero, t6, .LOOP_PLANE_COPY_W32_LASX
add.d a2, a2, a3
add.d a4, a4, a5
add.d a0, a0, a1
addi.w a7, a7, -1
blt zero, a7, .LOOP_PLANE_COPY_H_LASX
endfunc_x264
/*
* void prefetch_ref(uint8_t *pix, intptr_t stride, int32_t parity)
*/
function_x264 prefetch_ref_lsx
addi.d a2, a2, -1
addi.d a0, a0, 64
and a2, a2, a1
alsl.d t1, a2, a0, 3
alsl.d a2, a1, a1, 1
preld 0, t1, 0
add.d t2, t1, a1
preld 0, t2, 0
add.d t2, t2, a1
preld 0, t2, 0
add.d t1, t1, a2
preld 0, t1, 0
alsl.d a0, a1, t2, 1
preld 0, a0, 0
add.d t1, a0, a1
preld 0, t1, 0
add.d t1, t1, a1
preld 0, t1, 0
add.d a0, a0, a2
preld 0, a0, 0
endfunc_x264
/*
* void prefetch_fenc_422(uint8_t *pix_y, intptr_t stride_y,
* uint8_t *pix_uv, intptr_t stride_uv,
* int32_t mb_x)
*/
function_x264 prefetch_fenc_422_lsx
andi t0, a4, 3
mul.d t0, t0, a1
andi a4, a4, 6
mul.d t1, a4, a3
addi.d a0, a0, 64
addi.d a2, a2, 64
alsl.d a0, t0, a0, 2
preld 0, a0, 0
add.d t2, a0, a1
preld 0, t2, 0
add.d a0, t2, a1
preld 0, a0, 0
add.d a0, a0, a1
preld 0, a0, 0
alsl.d a2, t1, a2, 2
preld 0, a2, 0
add.d t3, a2, a3
preld 0, t3, 0
add.d a2, t3, a3
preld 0, a2, 0
add.d a2, a2, a3
preld 0, a2, 0
endfunc_x264
/*
* void prefetch_fenc_420(uint8_t *pix_y, intptr_t stride_y,
* uint8_t *pix_uv, intptr_t stride_uv,
* int32_t mb_x)
*/
function_x264 prefetch_fenc_420_lsx
andi t0, a4, 3
mul.d t0, t0, a1
andi a4, a4, 6
mul.d t1, a4, a3
addi.d a0, a0, 64
addi.d a2, a2, 64
alsl.d a0, t0, a0, 2
preld 0, a0, 0
add.d t2, a0, a1
preld 0, t2, 0
add.d a0, t2, a1
preld 0, a0, 0
add.d a0, a0, a1
preld 0, a0, 0
alsl.d a2, t1, a2, 2
preld 0, a2, 0
add.d a2, a2, a3
preld 0, a2, 0
endfunc_x264
/*
* void *memcpy_aligned(void *dst, const void *src, size_t n)
*/
function_x264 memcpy_aligned_lsx
andi t0, a2, 16
beqz t0, 2f
addi.d a2, a2, -16
vld vr0, a1, 0
vst vr0, a0, 0
addi.d a1, a1, 16
addi.d a0, a0, 16
2:
andi t0, a2, 32
beqz t0, 3f
addi.d a2, a2, -32
vld vr0, a1, 0
vld vr1, a1, 16
vst vr0, a0, 0
vst vr1, a0, 16
addi.d a1, a1, 32
addi.d a0, a0, 32
3:
beqz a2, 5f
4:
addi.d a2, a2, -64
vld vr0, a1, 48
vld vr1, a1, 32
vld vr2, a1, 16
vld vr3, a1, 0
vst vr0, a0, 48
vst vr1, a0, 32
vst vr2, a0, 16
vst vr3, a0, 0
addi.d a1, a1, 64
addi.d a0, a0, 64
blt zero, a2, 4b
5:
endfunc_x264
/*
* void memzero_aligned(void *p_dst, size_t n)
*/
function_x264 memzero_aligned_lsx
vxor.v vr1, vr1, vr1
.loop_memzero:
addi.d a1, a1, -128
vst vr1, a0, 0
vst vr1, a0, 16
vst vr1, a0, 32
vst vr1, a0, 48
vst vr1, a0, 64
vst vr1, a0, 80
vst vr1, a0, 96
vst vr1, a0, 112
addi.d a0, a0, 128
blt zero, a1, .loop_memzero
endfunc_x264
.macro FILT_H_LSX s1, s2, s3
vsub.h \s1, \s1, \s2
vsrai.h \s1, \s1, 2
vsub.h \s1, \s1, \s2
vadd.h \s1, \s1, \s3
vsrai.h \s1, \s1, 2
vadd.h \s1, \s1, \s3
.endm
//s1: s1.0, s2: s2.0, s3: s3.0, s4: s1.1 s5: s2.1 s6: s3.1
.macro FILT_C_LSX s1, s2, s3, s4, s5, s6
vaddi.bu vr17, vr23, 2 //vr24
vaddi.bu vr19, vr26, 1 //vr27
vaddi.bu vr18, vr26, 3 //vr29
vshuf.b vr1, \s2, \s4, vr23
vshuf.b vr2, \s2, \s4, vr17
vshuf.b vr3, \s5, \s2, vr18
vshuf.b vr4, \s5, \s2, vr19
vadd.h vr3, vr2, vr3
vshuf.b vr16, \s5, \s2, vr23
vshuf.b vr17, \s5, \s2, vr17
vshuf.b vr18, \s3, \s5, vr18
vshuf.b vr19, \s3, \s5, vr19
vadd.h vr18, vr17, vr18
vmov vr2, \s5
vmov \s1, \s3
vmov vr20, \s3
vmov \s4, \s6
vaddi.bu vr17, vr26, 5 //vr30
vshuf.b \s3, vr2, \s2, vr17
vshuf.b \s6, vr20, \s5, vr17
vadd.h vr4, vr4, \s2
vadd.h \s3, \s3, vr1
vadd.h vr19, vr19, \s5
vadd.h \s6, \s6, vr16
FILT_H_LSX \s3, vr3, vr4
FILT_H_LSX \s6, vr18, vr19
.endm
.macro FILT_PACK_LSX s1, s2, s3
vmulwev.w.h vr16, \s1, \s3
vmulwev.w.h vr17, \s2, \s3
vsrarni.h.w vr17, vr16, 15
vmaxi.h vr17, vr17, 0
vsat.hu vr17, vr17, 7
vmulwod.w.h vr18, \s1, \s3
vmulwod.w.h vr19, \s2, \s3
vsrarni.h.w vr19, vr18, 15
vmaxi.h vr19, vr19, 0
vsat.hu vr19, vr19, 7
vpackev.b \s1, vr19, vr17
.endm
//s1: s1.0, s2: s2.0, s3: s3.0, s4: s4.0
//s5: s1.1, s6: s2.1, s7: s3.1, s8: s4.1
.macro DO_FILT_C_LSX s1, s2, s3, s4, s5, s6, s7, s8
FILT_C_LSX \s1, \s2, \s3, \s5, \s6, \s7
FILT_C_LSX \s2, \s1, \s4, \s6, \s5, \s8
FILT_PACK_LSX \s3, \s4, vr15
FILT_PACK_LSX \s7, \s8, vr15
vilvl.d vr16, \s7, \s3
vilvh.d vr17, \s7, \s3
addi.d t3, a5, 16
vstx vr16, a5, a4
vstx vr17, t3, a4
.endm
.macro DO_FILT_H_LSX s1, s2, s3, s4, s5, s6
vaddi.bu vr16, vr23, 2 //vr24
vaddi.bu vr17, vr23, 3 //vr25
vaddi.bu vr18, vr26, 1 //vr27
vaddi.bu vr19, vr26, 2 //vr28
vld vr3, t5, 0
vshuf.b vr1, \s2, \s4, vr16
vshuf.b vr2, \s2, \s4, vr17
vshuf.b vr4, \s5, \s2, vr26
vshuf.b vr5, \s5, \s2, vr18
vshuf.b vr6, \s5, \s2, vr19
vdp2.h.bu.b vr16, vr1, vr12
vdp2.h.bu.b vr17, vr2, vr12
vdp2.h.bu.b vr18, \s2, vr14
vdp2.h.bu.b vr19, vr4, vr14
vdp2.h.bu.b vr20, vr5, vr0
vdp2.h.bu.b vr21, vr6, vr0
vadd.h vr1, vr16, vr18
vadd.h vr2, vr17, vr19
vadd.h vr1, vr1, vr20
vadd.h vr2, vr2, vr21
FILT_PACK_LSX vr1, vr2, vr15
vshuf.b vr1, vr1, vr1, vr3
vstx vr1, a0, a4
vaddi.bu vr16, vr23, 2 //vr24
vaddi.bu vr17, vr23, 3 //vr25
vaddi.bu vr18, vr26, 1 //vr27
vaddi.bu vr19, vr26, 2 //vr28
vshuf.b vr1, \s5, \s2, vr16
vshuf.b vr2, \s5, \s2, vr17
vshuf.b vr4, \s3, \s5, vr26
vshuf.b vr5, \s3, \s5, vr18
vshuf.b vr6, \s3, \s5, vr19
vdp2.h.bu.b vr16, vr1, vr12
vdp2.h.bu.b vr17, vr2, vr12
vdp2.h.bu.b vr18, \s5, vr14
vdp2.h.bu.b vr19, vr4, vr14
vdp2.h.bu.b vr20, vr5, vr0
vdp2.h.bu.b vr21, vr6, vr0
vadd.h vr1, vr16, vr18
vadd.h vr2, vr17, vr19
vadd.h vr1, vr1, vr20
vadd.h vr2, vr2, vr21
FILT_PACK_LSX vr1, vr2, vr15
vshuf.b vr1, vr1, vr1, vr3
addi.d a0, a0, 16
vstx vr1, a0, a4
addi.d a0, a0, -16
vmov \s1, \s2
vmov \s2, \s3
vmov \s4, \s5
vmov \s5, \s6
.endm
/* s3: temp, s4: UNUSED, s5: imm */
.macro DO_FILT_V0_LSX s1, s2, s3, s4, s5
alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */
alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */
vld vr1, a3, 0
vldx vr2, a3, a2
vld \s3, t2, 0
vld vr3, a1, 0
vldx \s1, a1, a2
vld \s2, t1, 0
vilvh.b vr16, vr2, vr1
vilvl.b vr17, vr2, vr1
vilvh.b vr18, \s2, \s1
vilvl.b vr19, \s2, \s1
vilvh.b vr20, \s3, vr3
vilvl.b vr21, \s3, vr3
vdp2.h.bu.b vr1, vr17, vr12
vdp2.h.bu.b vr4, vr16, vr12
vdp2.h.bu.b \s1, vr19, vr0
vdp2.h.bu.b vr2, vr18, vr0
vdp2.h.bu.b vr3, vr21, vr14
vdp2.h.bu.b \s2, vr20, vr14
vadd.h vr1, vr1, \s1
vadd.h vr4, vr4, vr2
vadd.h vr1, vr1, vr3
vadd.h vr4, vr4, \s2
vmov \s1, vr1
vmov \s2, vr4
addi.d a3, a3, 16
addi.d a1, a1, 16
FILT_PACK_LSX vr1, vr4, vr15
addi.d t3, a4, \s5
vstx vr1, t0, t3
.endm
.macro DO_FILT_V1_LSX s1, s2, s3, s4, s5
vld vr1, a3, 0
vldx vr2, a3, a2
vld \s3, t2, 16
vld vr3, a1, 0
vldx \s1, a1, a2
vld \s2, t1, 16
vilvh.b vr16, vr2, vr1
vilvl.b vr17, vr2, vr1
vilvh.b vr18, \s2, \s1
vilvl.b vr19, \s2, \s1
vilvh.b vr20, \s3, vr3
vilvl.b vr21, \s3, vr3
vdp2.h.bu.b vr1, vr17, vr12
vdp2.h.bu.b vr4, vr16, vr12
vdp2.h.bu.b \s1, vr19, vr0
vdp2.h.bu.b vr2, vr18, vr0
vdp2.h.bu.b vr3, vr21, vr14
vdp2.h.bu.b \s2, vr20, vr14
vadd.h vr1, vr1, \s1
vadd.h vr4, vr4, vr2
vadd.h vr1, vr1, vr3
vadd.h vr4, vr4, \s2
vmov \s1, vr1
vmov \s2, vr4
addi.d a3, a3, 16
addi.d a1, a1, 16
FILT_PACK_LSX vr1, vr4, vr15
addi.d t3, a4, \s5
addi.d t3, t3, 16
vstx vr1, t0, t3
.endm
/*
* void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
* uint8_t *src, intptr_t stride, int width, int height )
*/
function_x264 hpel_filter_lsx
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
move a7, a3
addi.d a5, a5, -32
move t0, a1
andi a7, a7, 31
sub.d a3, a3, a7
add.d a0, a0, a5
add.d t0, t0, a5
add.d a7, a7, a5
add.d a5, a5, a2
move a2, a4
sub.d a7, zero, a7
add.d a1, a3, a2
sub.d a3, a3, a2
sub.d a3, a3, a2
move a4, a7
la.local t1, filt_mul51
vld vr0, t1, 0
la.local t2, filt_mul15
vld vr12, t2, 0
la.local t3, filt_mul20
vld vr14, t3, 0
la.local t4, pw_1024
vld vr15, t4, 0
la.local t5, hpel_shuf
la.local t2, shuf_12
vld vr23, t2, 0
la.local t3, shuf_1
vld vr26, t3, 0
vxor.v vr9, vr9, vr9
vxor.v vr10, vr10, vr10
vxor.v vr11, vr11, vr11
vxor.v vr13, vr13, vr13
.LOOPY_LSX:
DO_FILT_V0_LSX vr24, vr25, vr31, vr12, 0
DO_FILT_V1_LSX vr8, vr7, vr22, vr12, 0
.LOOPX_LSX:
DO_FILT_V0_LSX vr27, vr28, vr29, vr12, 32
DO_FILT_V1_LSX vr6, vr5, vr30, vr12, 32
.LSTX:
vsrli.h vr15, vr15, 1
DO_FILT_C_LSX vr9, vr24, vr8, vr27, vr10, vr25, vr7, vr28
vadd.h vr15, vr15, vr15
vmov vr8, vr6
vmov vr7, vr5
DO_FILT_H_LSX vr11, vr31, vr29, vr13, vr22, vr30
addi.d a4, a4, 32
blt a4, zero, .LOOPX_LSX
addi.d t1, a4, -32
blt t1, zero, .LSTX
// set up registers for the next row (y)
sub.d a4, a4, a7
sub.d a4, a4, a2
sub.d a1, a1, a4
sub.d a3, a3, a4
add.d a0, a0, a2
add.d t0, t0, a2
add.d a5, a5, a2
move a4, a7
addi.d a6, a6, -1
blt zero, a6, .LOOPY_LSX
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264
/*
* void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth,
* pixel *dstv, pixel *dstc, intptr_t src_stride,
* intptr_t dst_stride, int width, int height)
*/
function_x264 frame_init_lowres_core_lsx
addi.d t0, zero, 15
addi.d t1, zero, 7
addi.d t2, zero, 3
addi.d t3, zero, 1
ld.d t4, sp, 0
addi.d sp, sp, -16
st.d s0, sp, 0
st.d s1, sp, 8
slli.d s0, a5, 1
.LOOPH:
bge zero, t4, .ENDLOOPH
addi.d t4, t4, -1
add.d t5, a0, a5
add.d t7, t5, a5
move t6, a7
.LOOPW16:
bge t0, t6, .LOOPW8
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1
vld vr6, a0, 16
vld vr7, t5, 16
vld vr8, t7, 16
vld vr9, a0, 17
vld vr10, t5, 17
vld vr11, t7, 17
// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vavgr.bu vr16, vr6, vr7
vavgr.bu vr17, vr7, vr8
vavgr.bu vr18, vr9, vr10
vavgr.bu vr19, vr10, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vhaddw.hu.bu vr17, vr17, vr17
vhaddw.hu.bu vr18, vr18, vr18
vhaddw.hu.bu vr19, vr19, vr19
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vssrarni.bu.h vr17, vr16, 1
vssrarni.bu.h vr19, vr18, 1
vilvl.d vr12, vr17, vr13
vilvl.d vr14, vr19, vr15
vilvh.d vr13, vr17, vr13
vilvh.d vr15, vr19, vr15
vst vr12, a1, 0
vst vr14, a2, 0
vst vr13, a3, 0
vst vr15, a4, 0
addi.d a1, a1, 16
addi.d a2, a2, 16
addi.d a3, a3, 16
addi.d a4, a4, 16
addi.d a0, a0, 32
addi.d t5, t5, 32
addi.d t7, t7, 32
addi.d t6, t6, -16
b .LOOPW16
.LOOPW8:
bge t1, t6, .LOOPW4
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1
// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.d vr13, a1, 0, 0
vstelm.d vr15, a2, 0, 0
vstelm.d vr13, a3, 0, 1
vstelm.d vr15, a4, 0, 1
addi.d a1, a1, 8
addi.d a2, a2, 8
addi.d a3, a3, 8
addi.d a4, a4, 8
addi.d a0, a0, 16
addi.d t5, t5, 16
addi.d t7, t7, 16
addi.d t6, t6, -8
b .LOOPW8
.LOOPW4:
bge t2, t6, .LOOPW2
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1
// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.w vr13, a1, 0, 0
vstelm.w vr15, a2, 0, 0
vstelm.w vr13, a3, 0, 2
vstelm.w vr15, a4, 0, 2
addi.d a1, a1, 4
addi.d a2, a2, 4
addi.d a3, a3, 4
addi.d a4, a4, 4
addi.d a0, a0, 8
addi.d t5, t5, 8
addi.d t7, t7, 8
addi.d t6, t6, -4
b .LOOPW4
.LOOPW2:
bge t3, t6, .LOOPW1
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1
// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.h vr13, a1, 0, 0
vstelm.h vr15, a2, 0, 0
vstelm.h vr13, a3, 0, 4
vstelm.h vr15, a4, 0, 4
addi.d a1, a1, 2
addi.d a2, a2, 2
addi.d a3, a3, 2
addi.d a4, a4, 2
addi.d a0, a0, 4
addi.d t5, t5, 4
addi.d t7, t7, 4
addi.d t6, t6, -2
b .LOOPW2
.LOOPW1:
bge zero, t6, .ENDLOOPW1
vld vr0, a0, 0
vld vr1, t5, 0
vld vr2, t7, 0
vld vr3, a0, 1
vld vr4, t5, 1
vld vr5, t7, 1
// Calculate dst0, dsth, dstv and dstc
vavgr.bu vr12, vr0, vr1
vavgr.bu vr13, vr1, vr2
vavgr.bu vr14, vr3, vr4
vavgr.bu vr15, vr4, vr5
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vssrarni.bu.h vr13, vr12, 1
vssrarni.bu.h vr15, vr14, 1
vstelm.b vr13, a1, 0, 0
vstelm.b vr15, a2, 0, 0
vstelm.b vr13, a3, 0, 8
vstelm.b vr15, a4, 0, 8
.ENDLOOPW1:
sub.d s1, a7, t6
sub.d a0, a0, s1
sub.d a0, a0, s1
add.d a0, a0, s0
sub.d a1, a1, s1
add.d a1, a1, a6
sub.d a2, a2, s1
add.d a2, a2, a6
sub.d a3, a3, s1
add.d a3, a3, a6
sub.d a4, a4, s1
add.d a4, a4, a6
b .LOOPH
.ENDLOOPH:
ld.d s0, sp, 0
ld.d s1, sp, 8
addi.d sp, sp, 16
endfunc_x264
#endif /* !HIGH_BIT_DEPTH */