/*****************************************************************************
* quant-a.S: LoongArch quantization and level-run
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "loongson_asm.S"
#include "loongson_util.S"
const last64_shuf
.int 0, 4, 1, 5, 2, 6, 3, 7
endconst
/*
* int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
*/
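/*
 * A scalar sketch of the per-coefficient operation that the QUANT_ONE_*
 * macros below implement (hedged reference, assuming the 16-bit dctcoef
 * and udctcoef types of an 8-bit-depth build):
 *
 *   #include <stdint.h>
 *   #include <stdlib.h>
 *
 *   static int quant_block( int16_t dct[16], const uint16_t mf[16],
 *                           const uint16_t bias[16] )
 *   {
 *       int nz = 0;
 *       for( int i = 0; i < 16; i++ )
 *       {
 *           int s = dct[i];
 *           int level = (abs( s ) + bias[i]) * mf[i] >> 16;  // vadda + vmuh
 *           dct[i] = s > 0 ? level : s < 0 ? -level : 0;     // vsigncov
 *           nz |= dct[i];
 *       }
 *       return nz != 0;
 *   }
 *
 * quant_4x4x4() runs this on four consecutive 4x4 blocks and returns a
 * 4-bit mask with bit n set when block n contains a nonzero level.
 */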
.macro QUANT_ONE_LASX s1, s2, s3, s4
xvld xr1, \s1, 0 /* Load dctcoef */
xvadda.h \s4, xr1, \s3 /* |dct| + bias */
xvmuh.hu \s4, \s4, \s2 /* ((|dct| + bias) * mf) >> 16 */
xvsigncov.h \s4, xr1, \s4 /* Restore the sign of dct */
xvst \s4, \s1, 0 /* Store quantized coefficients */
.endm
function_x264 quant_4x4x4_lasx
xvld xr2, a1, 0
xvld xr3, a2, 0
QUANT_ONE_LASX a0, xr2, xr3, xr4
addi.d a0, a0, 32
QUANT_ONE_LASX a0, xr2, xr3, xr0
xvssrlni.h.w xr0, xr4, 0
addi.d a0, a0, 32
QUANT_ONE_LASX a0, xr2, xr3, xr4
addi.d a0, a0, 32
QUANT_ONE_LASX a0, xr2, xr3, xr5
xvssrlni.h.w xr5, xr4, 0
xvssrlni.h.w xr5, xr0, 0
xvseqi.w xr5, xr5, 0
xvmskltz.w xr5, xr5
xvpickve2gr.w t0, xr5, 0
xvpickve2gr.w t1, xr5, 4
alsl.d t0, t1, t0, 4
and t0, t0, t1
xori a0, t0, 0xf
endfunc_x264
.macro QUANT_ONE_LSX tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
vld vr0, \tmp1, 0
vld vr1, \tmp1, 16
vadda.h \tmp6, vr0, \tmp4
vadda.h \tmp7, vr1, \tmp5
vmuh.hu \tmp6, \tmp6, \tmp2
vmuh.hu \tmp7, \tmp7, \tmp3
vsigncov.h \tmp6, vr0, \tmp6
vsigncov.h \tmp7, vr1, \tmp7
vst \tmp6, \tmp1, 0
vst \tmp7, \tmp1, 16
.endm
function_x264 quant_4x4x4_lsx
vld vr2, a1, 0
vld vr3, a1, 16
vld vr4, a2, 0
vld vr5, a2, 16
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr6, vr7
addi.d a0, a0, 32
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr8, vr9
vssrlni.h.w vr8, vr6, 0
vssrlni.h.w vr9, vr7, 0
addi.d a0, a0, 32
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr10, vr11
addi.d a0, a0, 32
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr12, vr13
vssrlni.h.w vr12, vr10, 0
vssrlni.h.w vr13, vr11, 0
vssrlni.h.w vr12, vr8, 0
vssrlni.h.w vr13, vr9, 0
vseqi.w vr12, vr12, 0
vseqi.w vr13, vr13, 0
vmskltz.w vr12, vr12
vmskltz.w vr13, vr13
vpickve2gr.w t0, vr12, 0
vpickve2gr.w t1, vr13, 0
alsl.d t0, t1, t0, 4
and t0, t0, t1
xori a0, t0, 0xf
endfunc_x264
function_x264 quant_4x4_lsx
vld vr2, a1, 0
vld vr3, a1, 16
vld vr4, a2, 0
vld vr5, a2, 16
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr10, vr11
vor.v vr22, vr10, vr11
vpickve2gr.d t0, vr22, 0
vpickve2gr.d t1, vr22, 1
or t2, t0, t1
addi.w t3, zero, 1
maskeqz a0, t3, t2
endfunc_x264
function_x264 quant_8x8_lsx
vld vr2, a1, 0
vld vr3, a1, 16
vld vr4, a2, 0
vld vr5, a2, 16
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr12, vr13
addi.d a0, a0, 32
vld vr2, a1, 32
vld vr3, a1, 48
vld vr4, a2, 32
vld vr5, a2, 48
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr14, vr15
addi.d a0, a0, 32
vld vr2, a1, 64
vld vr3, a1, 80
vld vr4, a2, 64
vld vr5, a2, 80
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr22, vr23
addi.d a0, a0, 32
vld vr2, a1, 96
vld vr3, a1, 112
vld vr4, a2, 96
vld vr5, a2, 112
QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr7, vr8
vor.v vr12, vr12, vr14
vor.v vr13, vr13, vr15
vor.v vr22, vr22, vr7
vor.v vr23, vr23, vr8
vor.v vr12, vr12, vr22
vor.v vr13, vr13, vr23
vor.v vr11, vr12, vr13
vpickve2gr.d t0, vr11, 0
vpickve2gr.d t1, vr11, 1
or t2, t0, t1
addi.w t3, zero, 1
maskeqz a0, t3, t2
endfunc_x264
function_x264 quant_4x4_dc_lsx
vld vr0, a0, 0
vld vr1, a0, 16
vreplgr2vr.w vr2, a1
vreplgr2vr.w vr3, a2
vslei.h vr4, vr0, 0
vslei.h vr5, vr1, 0
vexth.w.h vr7, vr0
vsllwil.w.h vr6, vr0, 0
vexth.w.h vr9, vr1
vsllwil.w.h vr8, vr1, 0
vadda.w vr6, vr3, vr6
vadda.w vr7, vr3, vr7
vadda.w vr8, vr3, vr8
vadda.w vr9, vr3, vr9
vmul.w vr6, vr6, vr2
vmul.w vr7, vr7, vr2
vmul.w vr8, vr8, vr2
vmul.w vr9, vr9, vr2
vsrani.h.w vr8, vr6, 16
vsrani.h.w vr9, vr7, 16
vpermi.w vr10, vr9, 0x0E
vpermi.w vr9, vr8, 0x44
vpermi.w vr10, vr8, 0x4E
vneg.h vr11, vr9
vneg.h vr12, vr10
vbitsel.v vr13, vr9, vr11, vr4
vbitsel.v vr14, vr10, vr12, vr5
vst vr13, a0, 0
vst vr14, a0, 16
vor.v vr15, vr11, vr12
vpickve2gr.d t0, vr15, 0
vpickve2gr.d t1, vr15, 1
or t2, t0, t1
addi.w t3, zero, 1
maskeqz a0, t3, t2
endfunc_x264
/*
* int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
*/
function_x264 quant_2x2_dc_lsx
fld.d f0, a0, 0
vreplgr2vr.w vr1, a1
vreplgr2vr.w vr2, a2
vslei.h vr3, vr0, 0
vsllwil.w.h vr4, vr0, 0
vadda.w vr4, vr4, vr2
vmul.w vr4, vr4, vr1
vsrani.h.w vr4, vr4, 16
vneg.h vr8, vr4
vbitsel.v vr9, vr4, vr8, vr3
vstelm.d vr9, a0, 0, 0
vpickve2gr.w t0, vr9, 0
vpickve2gr.w t1, vr9, 1
or t2, t0, t1
addi.w t3, zero, 1
maskeqz a0, t3, t2
endfunc_x264
/*
* int coeff_last64_c(dctcoef *l)
*/
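/*
 * Scalar sketch of the search these functions vectorize: scan backwards for
 * the index of the last nonzero coefficient (here -1 if the block is all
 * zero).
 *
 *   static int coeff_last( const int16_t *l, int i_count )
 *   {
 *       int i_last = i_count - 1;
 *       while( i_last >= 0 && l[i_last] == 0 )
 *           i_last--;
 *       return i_last;
 *   }
 *
 * The vector versions compress each coefficient to a per-byte nonzero flag
 * and use clz on the packed flags to locate that index without a loop.
 */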
function_x264 coeff_last64_lasx
addi.w t0, zero, 63
xvxor.v xr20, xr0, xr0
xvld xr0, a0, 0
xvld xr1, a0, 32
xvld xr2, a0, 64
xvld xr3, a0, 96
xvldi xr4, 1
la.local t1, last64_shuf
xvld xr7, t1, 0
xvldi xr9, 0x408
xvldi xr10, 0x401
xvssrlni.bu.h xr1, xr0, 0
xvssrlni.bu.h xr3, xr2, 0
xvsle.bu xr5, xr4, xr1
xvsle.bu xr6, xr4, xr3
xvssrlni.bu.h xr6, xr5, 4
xvperm.w xr6, xr6, xr7
xvclz.w xr7, xr6
xvssrlni.hu.w xr7, xr7, 2
xvpermi.d xr8, xr7, 0xd8
xvsub.h xr9, xr9, xr8
xvsll.h xr10, xr10, xr9
xvssrlni.bu.h xr10, xr10, 1
xvclz.d xr11, xr10
xvpickve2gr.w t3, xr11, 0
sub.w a0, t0, t3
endfunc_x264
function_x264 coeff_last64_lsx
addi.w t0, zero, 63
vxor.v vr20, vr0, vr0
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
vld vr4, a0, 64
vld vr5, a0, 80
vld vr6, a0, 96
vld vr7, a0, 112
vldi vr8, 1
vldi vr9, 0x408
vldi vr10, 0x401
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vssrlni.bu.h vr2, vr2, 0
vssrlni.bu.h vr3, vr3, 0
vssrlni.bu.h vr4, vr4, 0
vssrlni.bu.h vr5, vr5, 0
vssrlni.bu.h vr6, vr6, 0
vssrlni.bu.h vr7, vr7, 0
vpermi.w vr2, vr0, 0x44
vpermi.w vr3, vr1, 0x44
vpermi.w vr6, vr4, 0x44
vpermi.w vr7, vr5, 0x44
vsle.bu vr2, vr8, vr2
vsle.bu vr3, vr8, vr3
vsle.bu vr6, vr8, vr6
vsle.bu vr7, vr8, vr7
vssrlni.bu.h vr2, vr2, 4
vssrlni.bu.h vr3, vr3, 4
vssrlni.bu.h vr6, vr6, 4
vssrlni.bu.h vr7, vr7, 4
vpermi.w vr6, vr2, 0x44
vpermi.w vr7, vr3, 0x44
vpermi.w vr11, vr7, 0x0E
vpermi.w vr7, vr6, 0x44
vpermi.w vr7, vr7, 0xD8
vpermi.w vr11, vr6, 0x4E
vpermi.w vr11, vr11, 0xD8
vclz.w vr7, vr7
vclz.w vr11, vr11
vssrlni.hu.w vr7, vr7, 2
vssrlni.hu.w vr11, vr11, 2
vpermi.w vr12, vr11, 0x0E
vpermi.w vr11, vr7, 0x44
vpermi.w vr12, vr7, 0x4E
vsub.h vr11, vr9, vr11
vsub.h vr12, vr9, vr12
vsll.h vr13, vr10, vr11
vsll.h vr14, vr10, vr12
vssrlni.bu.h vr13, vr13, 1
vssrlni.bu.h vr14, vr14, 1
vclz.d vr15, vr14
vpickve2gr.w t1, vr15, 0
sub.w a0, t0, t1
endfunc_x264
/*
* int coeff_last16_c(dctcoef *l)
*/
function_x264 coeff_last16_lasx
addi.w t0, zero, 15
xvld xr0, a0, 0
xvldi xr2, 1
xvssrlni.bu.h xr0, xr0, 0
xvpermi.d xr1, xr0, 0xd8
xvsle.bu xr3, xr2, xr1
xvssrlni.bu.h xr3, xr3, 4
xvclz.d xr4, xr3
xvpickve2gr.w t1, xr4, 0
srai.w t1, t1, 2
sub.w a0, t0, t1
endfunc_x264
function_x264 coeff_last16_lsx
addi.w t0, zero, 15
vld vr0, a0, 0
vld vr1, a0, 16
vldi vr2, 1
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vpermi.w vr1, vr0, 0x44
vsle.bu vr3, vr2, vr1
vssrlni.bu.h vr3, vr3, 4
vclz.d vr4, vr3
vpickve2gr.w t1, vr4, 0
srai.w t1, t1, 2
sub.w a0, t0, t1
endfunc_x264
/*
* int coeff_last15_c(dctcoef *l)
*/
function_x264 coeff_last15_lasx
addi.w t0, zero, 15
vld vr0, a0, 0
vld vr1, a0, 16
xvldi xr3, 1
vinsgr2vr.h vr1, zero, 7
xvpermi.q xr1, xr0, 0x20
xvssrlni.bu.h xr1, xr1, 0
xvpermi.d xr2, xr1, 0xd8
xvsle.bu xr4, xr3, xr2
xvssrlni.bu.h xr4, xr4, 4
xvclz.d xr5, xr4
xvpickve2gr.w t1, xr5, 0
srai.w t1, t1, 2
sub.w a0, t0, t1
endfunc_x264
function_x264 coeff_last15_lsx
addi.w t0, zero, 15
vld vr0, a0, 0
vld vr1, a0, 16
vldi vr2, 1
vinsgr2vr.h vr1, zero, 7
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vpermi.w vr1, vr0, 0x44
vsle.bu vr3, vr2, vr1
vssrlni.bu.h vr3, vr3, 4
vclz.d vr4, vr3
vpickve2gr.w t1, vr4, 0
srai.w t1, t1, 2
sub.w a0, t0, t1
endfunc_x264
/*
* int coeff_last8_c(dctcoef *l)
*/
function_x264 coeff_last8_lsx
addi.w t0, zero, 7
vld vr0, a0, 0
vclz.d vr1, vr0
vpickve2gr.w t1, vr1, 0
vpickve2gr.w t2, vr1, 2
li.d t3, 64
bne t2, t3, .LAST8_LOW_LSX
addi.d t4, t1, 0
addi.d t0, t0, -4
b .LAST8_END_LSX
.LAST8_LOW_LSX:
addi.d t4, t2, 0
.LAST8_END_LSX:
srai.w t4, t4, 4
sub.w a0, t0, t4
endfunc_x264
/*
* int coeff_last4_c(dctcoef *l)
*/
function_x264 coeff_last4_lsx
addi.w t0, zero, 3
vld vr0, a0, 0
vclz.d vr1, vr0
vpickve2gr.w t1, vr1, 0
srai.w t1, t1, 4
sub.w a0, t0, t1
endfunc_x264
// (dct[i] * dequant_mf[i]) << (i_qbits)
.macro DCT_MF a0, a1, in0, out0, out1
vld vr1, \a0, 0
xvld xr2, \a1, 0
vext2xv.w.h xr5, xr1
xvmul.w xr5, xr5, xr2
xvsll.w \out0, xr5, \in0
vld vr1, \a0, 16
xvld xr2, \a1, 32
vext2xv.w.h xr5, xr1
xvmul.w xr5, xr5, xr2
xvsll.w \out1, xr5, \in0
.endm
// (dct[i] * dequant_mf[i] + f) >> (-i_qbits)
.macro DCT_MF_F a0, a1, in0, out0, out1
vld vr1, \a0, 0
xvld xr2, \a1, 0
vext2xv.w.h xr5, xr1
xvmul.w xr5, xr5, xr2
xvsrar.w \out0, xr5, \in0
vld vr1, \a0, 16
xvld xr2, \a1, 32
vext2xv.w.h xr5, xr1
xvmul.w xr5, xr5, xr2
xvsrar.w \out1, xr5, \in0
.endm
/*
* void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
*/
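/*
 * Hedged scalar sketch of the dequant computation, matching the two formula
 * comments above the DCT_MF/DCT_MF_F macros (i_qbits = i_qp/6 - 4 for 4x4
 * blocks, i_qp/6 - 6 for 8x8 blocks):
 *
 *   static void dequant_4x4_ref( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 *   {
 *       const int i_mf    = i_qp % 6;
 *       const int i_qbits = i_qp / 6 - 4;
 *       if( i_qbits >= 0 )
 *           for( int i = 0; i < 16; i++ )
 *               dct[i] = dct[i] * dequant_mf[i_mf][i] << i_qbits;
 *       else
 *       {
 *           const int f = 1 << (-i_qbits - 1);  // rounding term, done by vsrar
 *           for( int i = 0; i < 16; i++ )
 *               dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits);
 *       }
 *   }
 */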
function_x264 dequant_4x4_lasx
addi.w t1, zero, 6
addi.w t2, zero, 4
div.w t0, a2, t1
sub.w t0, t0, t2 // i_qp/6 - 4
mod.w t1, a2, t1 // i_qp%6
slli.w t1, t1, 6
add.d a1, a1, t1
blt t0, zero, .DQ4x4_DEQUANT_SHR
// i_qbits >= 0
xvreplgr2vr.w xr0, t0
DCT_MF a0, a1, xr0, xr6, xr7
b .DQ4x4_END
.DQ4x4_DEQUANT_SHR:
sub.w t4, zero, t0
xvreplgr2vr.w xr4, t4
DCT_MF_F a0, a1, xr4, xr6, xr7
.DQ4x4_END:
xvpickev.h xr8, xr7, xr6
xvpermi.d xr8, xr8, 0xd8
xvst xr8, a0, 0
endfunc_x264
.macro DCT_MF_LSX tmp0, tmp1, in0, out0, out1, out2, out3
vld vr0, \tmp0, 0
vld vr1, \tmp1, 0
vld vr2, \tmp1, 16
vexth.w.h vr4, vr0
vsllwil.w.h vr3, vr0, 0
vmul.w vr3, vr3, vr1
vmul.w vr4, vr4, vr2
vsll.w \out0, vr3, \in0
vsll.w \out1, vr4, \in0
vld vr0, \tmp0, 16
vld vr1, \tmp1, 32
vld vr2, \tmp1, 48
vsllwil.w.h vr3, vr0, 0
vpermi.w vr4, vr0, 0x0E
vsllwil.w.h vr4, vr4, 0
vmul.w vr3, vr3, vr1
vmul.w vr4, vr4, vr2
vsll.w \out2, vr3, \in0
vsll.w \out3, vr4, \in0
.endm
.macro DCT_MF_F_LSX tmp0, tmp1, in0, out0, out1, out2, out3
vld vr0, \tmp0, 0
vld vr1, \tmp1, 0
vld vr2, \tmp1, 16
vexth.w.h vr4, vr0
vsllwil.w.h vr3, vr0, 0
vmul.w vr3, vr3, vr1
vmul.w vr4, vr4, vr2
vsrar.w \out0, vr3, \in0
vsrar.w \out1, vr4, \in0
vld vr0, \tmp0, 16
vld vr1, \tmp1, 32
vld vr2, \tmp1, 48
vexth.w.h vr4, vr0
vsllwil.w.h vr3, vr0, 0
vmul.w vr3, vr3, vr1
vmul.w vr4, vr4, vr2
vsrar.w \out2, vr3, \in0
vsrar.w \out3, vr4, \in0
.endm
function_x264 dequant_4x4_lsx
addi.w t1, zero, 6
addi.w t2, zero, 4
div.w t0, a2, t1
sub.w t0, t0, t2
mod.w t1, a2, t1
slli.w t1, t1, 6
add.d a1, a1, t1
blt t0, zero, .DQ4x4_DEQUANT_SHR_LSX
vreplgr2vr.w vr6, t0
DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10
b .DQ4x4_END_LSX
.DQ4x4_DEQUANT_SHR_LSX:
sub.w t4, zero, t0
vreplgr2vr.w vr6, t4
DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10
.DQ4x4_END_LSX:
vpickev.h vr11, vr9, vr7
vpickev.h vr12, vr10, vr8
vpermi.w vr13, vr12, 0x0E
vpermi.w vr12, vr11, 0x44
vpermi.w vr13, vr11, 0x4E
vst vr12, a0, 0
vst vr13, a0, 16
endfunc_x264
/*
* void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
*/
function_x264 dequant_8x8_lasx
addi.w t1, zero, 6
div.w t0, a2, t1
sub.w t0, t0, t1
mod.w t1, a2, t1 // i_qp%6
slli.w t1, t1, 8
add.d a1, a1, t1
blt t0, zero, .DQ8x8_DEQUANT_SHR
// i_qbits >= 0
xvreplgr2vr.w xr0, t0
DCT_MF a0, a1, xr0, xr6, xr7
xvpickev.h xr8, xr7, xr6
xvpermi.d xr8, xr8, 0xd8
xvst xr8, a0, 0
.rept 3
addi.d a0, a0, 32
addi.d a1, a1, 64
DCT_MF a0, a1, xr0, xr6, xr7
xvpickev.h xr8, xr7, xr6
xvpermi.d xr8, xr8, 0xd8
xvst xr8, a0, 0
.endr
b .DQ8x8_END
// i_qbits < 0
.DQ8x8_DEQUANT_SHR:
sub.w t4, zero, t0
xvreplgr2vr.w xr4, t4
DCT_MF_F a0, a1, xr4, xr6, xr7
xvpickev.h xr8, xr7, xr6
xvpermi.d xr8, xr8, 0xd8
xvst xr8, a0, 0
.rept 3
addi.d a0, a0, 32
addi.d a1, a1, 64
DCT_MF_F a0, a1, xr4, xr6, xr7
xvpickev.h xr8, xr7, xr6
xvpermi.d xr8, xr8, 0xd8
xvst xr8, a0, 0
.endr
.DQ8x8_END:
endfunc_x264
function_x264 dequant_8x8_lsx
addi.w t1, zero, 6
div.w t0, a2, t1
sub.w t0, t0, t1
mod.w t1, a2, t1
slli.w t1, t1, 8
add.d a1, a1, t1
blt t0, zero, .DQ8x8_DEQUANT_SHR_LSX
vreplgr2vr.w vr6, t0
DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10
vpickev.h vr11, vr9, vr7
vpickev.h vr12, vr10, vr8
vpermi.w vr13, vr12, 0x0E
vpermi.w vr12, vr11, 0x44
vpermi.w vr13, vr11, 0x4E
vst vr12, a0, 0
vst vr13, a0, 16
.rept 3
addi.d a0, a0, 32
addi.d a1, a1, 64
DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10
vpickev.h vr11, vr9, vr7
vpickev.h vr12, vr10, vr8
vpermi.w vr13, vr12, 0x0E
vpermi.w vr12, vr11, 0x44
vpermi.w vr13, vr11, 0x4E
vst vr12, a0, 0
vst vr13, a0, 16
.endr
b .DQ8x8_END_LSX
.DQ8x8_DEQUANT_SHR_LSX:
sub.w t4, zero, t0
vreplgr2vr.w vr6, t4
DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10
vpickev.h vr11, vr9, vr7
vpickev.h vr12, vr10, vr8
vpermi.w vr13, vr12, 0x0E
vpermi.w vr12, vr11, 0x44
vpermi.w vr13, vr11, 0x4E
vst vr12, a0, 0
vst vr13, a0, 16
.rept 3
addi.d a0, a0, 32
addi.d a1, a1, 64
DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10
vpickev.h vr11, vr9, vr7
vpickev.h vr12, vr10, vr8
vpermi.w vr13, vr12, 0x0E
vpermi.w vr12, vr11, 0x44
vpermi.w vr13, vr11, 0x4E
vst vr12, a0, 0
vst vr13, a0, 16
.endr
.DQ8x8_END_LSX:
endfunc_x264
/*
* void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
*/
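/*
 * dequant_4x4_dc scales every coefficient by the single DC entry of the
 * dequant table; a sketch of what the two branches below compute
 * (i_qbits = i_qp/6 - 6 here, since the DC transform carries two extra bits):
 *
 *   const int i_qbits = i_qp / 6 - 6;
 *   const int dmf     = dequant_mf[i_qp % 6][0];
 *   if( i_qbits >= 0 )
 *       for( int i = 0; i < 16; i++ )
 *           dct[i] = dct[i] * (dmf << i_qbits);  // multiplier pre-shifted
 *   else
 *       for( int i = 0; i < 16; i++ )
 *           dct[i] = (dct[i] * dmf + (1 << (-i_qbits - 1))) >> (-i_qbits);
 */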
function_x264 dequant_4x4_dc_lasx
addi.w t0, zero, 6
div.w t1, a2, t0
sub.w t1, t1, t0
blt t1, zero, .DQ4x4DC_LT_ZERO
// i_qbits >= 0
mod.w t2, a2, t0
slli.w t2, t2, 6
ldx.w t0, a1, t2
sll.w t0, t0, t1
vld vr1, a0, 0
vld vr10, a0, 16
xvreplgr2vr.w xr2, t0
vext2xv.w.h xr3, xr1
xvmul.w xr6, xr3, xr2
vext2xv.w.h xr3, xr10
xvmul.w xr7, xr3, xr2
b .DQ4x4DC_END
// i_qbits < 0
.DQ4x4DC_LT_ZERO:
mod.w t2, a2, t0
slli.w t2, t2, 6
ldx.w t0, a1, t2
sub.w t3, zero, t1
vld vr1, a0, 0
vld vr10, a0, 16
xvreplgr2vr.w xr2, t0
xvreplgr2vr.w xr4, t3
vext2xv.w.h xr5, xr1
xvmul.w xr5, xr5, xr2
xvsrar.w xr6, xr5, xr4
vext2xv.w.h xr5, xr10
xvmul.w xr5, xr5, xr2
xvsrar.w xr7, xr5, xr4
.DQ4x4DC_END:
xvpickev.h xr8, xr7, xr6
xvpermi.d xr8, xr8, 0xd8
xvst xr8, a0, 0
endfunc_x264
function_x264 dequant_4x4_dc_lsx
addi.w t0, zero, 6
div.w t1, a2, t0
sub.w t1, t1, t0
blt t1, zero, .DQ4x4DC_LT_ZERO_LSX
mod.w t2, a2, t0
slli.w t2, t2, 6
ldx.w t0, a1, t2
sll.w t0, t0, t1
vld vr1, a0, 0
vld vr2, a0, 16
vreplgr2vr.w vr3, t0
vexth.w.h vr6, vr1
vsllwil.w.h vr5, vr1, 0
vmul.w vr5, vr5, vr3
vmul.w vr6, vr6, vr3
vexth.w.h vr8, vr2
vsllwil.w.h vr7, vr2, 0
vmul.w vr7, vr7, vr3
vmul.w vr8, vr8, vr3
b .DQ4x4DC_END_LSX
.DQ4x4DC_LT_ZERO_LSX:
mod.w t2, a2, t0
slli.w t2, t2, 6
ldx.w t0, a1, t2
sub.w t3, zero, t1
vld vr1, a0, 0
vld vr2, a0, 16
vreplgr2vr.w vr3, t0
vreplgr2vr.w vr4, t3
vexth.w.h vr6, vr1
vsllwil.w.h vr5, vr1, 0
vexth.w.h vr8, vr2
vsllwil.w.h vr7, vr2, 0
vmul.w vr5, vr5, vr3
vmul.w vr6, vr6, vr3
vmul.w vr7, vr7, vr3
vmul.w vr8, vr8, vr3
vsrar.w vr5, vr5, vr4
vsrar.w vr6, vr6, vr4
vsrar.w vr7, vr7, vr4
vsrar.w vr8, vr8, vr4
.DQ4x4DC_END_LSX:
vpickev.h vr9, vr7, vr5
vpickev.h vr10, vr8, vr6
vpermi.w vr11, vr10, 0x0E
vpermi.w vr10, vr9, 0x44
vpermi.w vr11, vr9, 0x4E
vst vr10, a0, 0
vst vr11, a0, 16
endfunc_x264
/*
* int decimate_score15( dctcoef *dct )
*/
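/*
 * Scalar sketch of the decimate score (a hedged rendering of the loop coded
 * below against x264_decimate_table4/x264_decimate_table8): walk backwards
 * from the last nonzero coefficient, bail out with 9 as soon as any
 * |level| > 1, otherwise accumulate the table entry for each zero run.
 *
 *   static int decimate_score( const int16_t *dct, int i_max )
 *   {
 *       const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8
 *                                               : x264_decimate_table4;
 *       int idx = i_max - 1, score = 0;
 *       while( idx >= 0 && dct[idx] == 0 )       // find the last nonzero level
 *           idx--;
 *       while( idx >= 0 )
 *       {
 *           if( (unsigned)(dct[idx--] + 1) > 2 ) // |level| > 1
 *               return 9;
 *           int run = 0;                         // zero run below this level
 *           while( idx >= 0 && dct[idx] == 0 )
 *           {
 *               idx--;
 *               run++;
 *           }
 *           score += ds_table[run];
 *       }
 *       return score;
 *   }
 *
 * decimate_score15 applies this to dct+1 with 15 coefficients (hence the
 * "addi.d t4, a0, 2" below), decimate_score16 to dct with 16, and
 * decimate_score64 to dct with 64.
 */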
function_x264 decimate_score15_lsx
addi.w t0, zero, 15
la.local t3, x264_decimate_table4
addi.d t4, a0, 2
vld vr0, t4, 0
vld vr1, t4, 16
vldi vr3, 1
vinsgr2vr.h vr1, zero, 7
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vpermi.w vr2, vr1, 0x0E
vpermi.w vr1, vr0, 0x44
vpermi.w vr2, vr0, 0x4E
vsle.bu vr4, vr3, vr1
vsle.bu vr5, vr3, vr2
vssrlni.bu.h vr4, vr4, 4
vssrlni.bu.h vr5, vr5, 4
vclz.d vr4, vr4
vclz.d vr5, vr5
vpickve2gr.w t1, vr4, 0
srai.w t1, t1, 2
sub.w t2, t0, t1
addi.w t0, zero, 2
move a0, zero
slli.d t2, t2, 1
.LOOP_SCORE_15_LSX:
blt t2, zero, .END_SCORE_15_LSX
ldx.h t5, t4, t2
addi.d t6, t5, 1
bltu t0, t6, .RET_SCORE_15_1_LSX
addi.d t2, t2, -2
move t5, zero
.WHILE_SCORE_15_LSX:
blt t2, zero, .END_WHILE_15_LSX
ldx.h t1, t4, t2
bnez t1, .END_WHILE_15_LSX
addi.d t2, t2, -2
addi.d t5, t5, 1
b .WHILE_SCORE_15_LSX
.END_WHILE_15_LSX:
ldx.b t1, t3, t5
add.d a0, a0, t1
b .LOOP_SCORE_15_LSX
.RET_SCORE_15_1_LSX:
addi.d a0, zero, 9
jirl $r0, $r1, 0x0
.END_SCORE_15_LSX:
endfunc_x264
/*
* int decimate_score16( dctcoef *dct )
*/
function_x264 decimate_score16_lsx
addi.w t0, zero, 15
la.local t3, x264_decimate_table4
vld vr0, a0, 0
vld vr1, a0, 16
vldi vr2, 1
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vpermi.w vr3, vr1, 0x0E
vpermi.w vr1, vr0, 0x44
vpermi.w vr3, vr0, 0x4E
vsle.bu vr4, vr2, vr1
vsle.bu vr5, vr2, vr3
vssrlni.bu.h vr4, vr4, 4
vssrlni.bu.h vr5, vr5, 4
vclz.d vr4, vr4
vclz.d vr5, vr5
vpickve2gr.w t1, vr4, 0
srai.w t1, t1, 2
sub.w t2, t0, t1
move t4, a0
addi.d t0, zero, 2
move a0, zero
slli.d t2, t2, 1
.LOOP_SCORE_16_LSX:
blt t2, zero, .END_SCORE_16_LSX
ldx.h t5, t4, t2
addi.d t6, t5, 1
bltu t0, t6, .RET_SCORE_16_1_LSX
addi.d t2, t2, -2
move t5, zero
.WHILE_SCORE_16_LSX:
blt t2, zero, .END_WHILE_16_LSX
ldx.h t1, t4, t2
bnez t1, .END_WHILE_16_LSX
addi.d t2, t2, -2
addi.d t5, t5, 1
b .WHILE_SCORE_16_LSX
.END_WHILE_16_LSX:
ldx.b t1, t3, t5
add.d a0, a0, t1
b .LOOP_SCORE_16_LSX
.RET_SCORE_16_1_LSX:
addi.d a0, zero, 9
jirl $r0, $r1, 0x0
.END_SCORE_16_LSX:
endfunc_x264
/*
* int decimate_score64( dctcoef *dct )
*/
function_x264 decimate_score64_lsx
addi.w t0, zero, 63
la.local t3, x264_decimate_table8
vxor.v vr20, vr0, vr0
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
vld vr4, a0, 64
vld vr5, a0, 80
vld vr6, a0, 96
vld vr7, a0, 112
vldi vr8, 1
vldi vr9, 0x408
vldi vr10, 0x401
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vssrlni.bu.h vr2, vr2, 0
vssrlni.bu.h vr3, vr3, 0
vssrlni.bu.h vr4, vr4, 0
vssrlni.bu.h vr5, vr5, 0
vssrlni.bu.h vr6, vr6, 0
vssrlni.bu.h vr7, vr7, 0
vpermi.w vr2, vr0, 0x44
vpermi.w vr3, vr1, 0x44
vpermi.w vr6, vr4, 0x44
vpermi.w vr7, vr5, 0x44
vsle.bu vr2, vr8, vr2
vsle.bu vr3, vr8, vr3
vsle.bu vr6, vr8, vr6
vsle.bu vr7, vr8, vr7
vssrlni.bu.h vr2, vr2, 4
vssrlni.bu.h vr3, vr3, 4
vssrlni.bu.h vr6, vr6, 4
vssrlni.bu.h vr7, vr7, 4
vpermi.w vr6, vr2, 0x44
vpermi.w vr7, vr3, 0x44
vpermi.w vr11, vr7, 0x0E
vpermi.w vr7, vr6, 0x44
vpermi.w vr7, vr7, 0xD8
vpermi.w vr11, vr6, 0x4E
vpermi.w vr11, vr11, 0xD8
vclz.w vr7, vr7
vclz.w vr11, vr11
vssrlni.hu.w vr7, vr7, 2
vssrlni.hu.w vr11, vr11, 2
vpermi.w vr12, vr11, 0x0E
vpermi.w vr11, vr7, 0x44
vpermi.w vr12, vr7, 0x4E
vsub.h vr11, vr9, vr11
vsub.h vr12, vr9, vr12
vsll.h vr13, vr10, vr11
vsll.h vr14, vr10, vr12
vssrlni.bu.h vr13, vr13, 1
vssrlni.bu.h vr14, vr14, 1
vclz.d vr15, vr14
vpickve2gr.w t1, vr15, 0
sub.w t2, t0, t1
move t4, a0
addi.d t0, zero, 2
slli.d t2, t2, 1
move a0, zero
.LOOP_SCORE_64_LSX:
blt t2, zero, .END_SCORE_64_LSX
ldx.h t5, t4, t2
addi.d t6, t5, 1
bltu t0, t6, .RET_SCORE_64_1_LSX
addi.d t2, t2, -2
move t5, zero
.WHILE_SCORE_64_LSX:
blt t2, zero, .END_WHILE_64_LSX
ldx.h t1, t4, t2
bnez t1, .END_WHILE_64_LSX
addi.d t2, t2, -2
addi.d t5, t5, 1
b .WHILE_SCORE_64_LSX
.END_WHILE_64_LSX:
ldx.b t1, t3, t5
add.d a0, a0, t1
b .LOOP_SCORE_64_LSX
.RET_SCORE_64_1_LSX:
addi.d a0, zero, 9
jirl $r0, $r1, 0x0
.END_SCORE_64_LSX:
endfunc_x264
/*
* int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel )
*/
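/*
 * Scalar sketch of coeff_level_run, assuming x264's x264_run_level_t layout
 * (int last; int mask; 16-byte-aligned dctcoef level[]), which is what the
 * address arithmetic below relies on: last is stored at offset 0, mask at
 * offset 4, and levels go to the aligned level[] array. The caller
 * guarantees at least one nonzero coefficient.
 *
 *   static int coeff_level_run( const int16_t *dct, x264_run_level_t *runlevel,
 *                               int i_count )
 *   {
 *       int i = i_count - 1;
 *       while( dct[i] == 0 )                     // index of the last nonzero level
 *           i--;
 *       runlevel->last = i;
 *       int total = 0, mask = 0;
 *       do
 *       {
 *           runlevel->level[total++] = dct[i];   // store the level
 *           mask |= 1 << i;                      // record its position
 *           while( --i >= 0 && dct[i] == 0 );    // skip the zero run below it
 *       } while( i >= 0 );
 *       runlevel->mask = mask;
 *       return total;                            // number of nonzero levels
 *   }
 */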
function_x264 coeff_level_run16_lasx
addi.w t0, zero, 15
xvld xr0, a0, 0
xvldi xr2, 1
xvssrlni.bu.h xr0, xr0, 0
xvpermi.d xr1, xr0, 0xd8
xvsle.bu xr3, xr2, xr1
xvsrlni.b.h xr3, xr3, 4
xvpickve2gr.du t8, xr3, 0
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
st.w t0, a1, 0x00 // Store runlevel->last
addi.d t3, a1, 23
nor t2, zero, zero
addi.d t2, t2, -15
and t3, t3, t2 // runlevel->level
xor t4, t4, t4 // mask
xor t5, t5, t5 // total: number of non-zero elements
addi.w t6, zero, 1 // const 1
.LOOP_COEFF_LEVEL_RUN16_LASX:
slli.w t7, t0, 1
ldx.h t2, a0, t7
st.h t2, t3, 0
addi.d t3, t3, 2
addi.w t5, t5, 1
sll.w t2, t6, t0
or t4, t4, t2
bge zero, t4, .END_COEFF_LEVEL_RUN16_LASX
addi.w t0, t0, -1
slli.w t1, t1, 2
addi.w t1, t1, 4
sll.d t8, t8, t1
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LASX
.END_COEFF_LEVEL_RUN16_LASX:
st.w t4, a1, 4
move a0, t5
endfunc_x264
function_x264 coeff_level_run15_lasx
addi.w t0, zero, 15
vld vr0, a0, 0
vld vr1, a0, 16
xvldi xr3, 1
vinsgr2vr.h vr1, zero, 7
xvpermi.q xr1, xr0, 0x20
xvssrlni.bu.h xr1, xr1, 0
xvpermi.d xr2, xr1, 0xd8
xvsle.bu xr4, xr3, xr2
xvsrlni.b.h xr4, xr4, 4
xvpickve2gr.du t8, xr4, 0
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
st.w t0, a1, 0x00 // Store runlevel->last
addi.d t3, a1, 23
nor t2, zero, zero
addi.d t2, t2, -15
and t3, t3, t2 // runlevel->level
xor t4, t4, t4 // mask
xor t5, t5, t5 // total: number of non-zero elements
addi.w t6, zero, 1 // const 1
.LOOP_COEFF_LEVEL_RUN15_LASX:
slli.w t7, t0, 1
ldx.h t2, a0, t7
st.h t2, t3, 0
addi.d t3, t3, 2
addi.w t5, t5, 1
sll.w t2, t6, t0
or t4, t4, t2
bge zero, t4, .END_COEFF_LEVEL_RUN15_LASX
addi.w t0, t0, -1
slli.w t1, t1, 2
addi.w t1, t1, 4
sll.d t8, t8, t1
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LASX
.END_COEFF_LEVEL_RUN15_LASX:
st.w t4, a1, 4
move a0, t5
endfunc_x264
function_x264 coeff_level_run16_lsx
addi.w t0, zero, 15
vld vr0, a0, 0
vld vr1, a0, 16
vldi vr2, 1
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vpermi.w vr1, vr0, 0x44
vsle.bu vr3, vr2, vr1
vsrlni.b.h vr3, vr3, 4
vpickve2gr.du t8, vr3, 0
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
st.w t0, a1, 0x00 // Store runlevel->last
addi.d t3, a1, 23
nor t2, zero, zero
addi.d t2, t2, -15
and t3, t3, t2 // runlevel->level
xor t4, t4, t4 // mask
xor t5, t5, t5 // total: number of non-zero elements
addi.w t6, zero, 1 // const 1
.LOOP_COEFF_LEVEL_RUN16_LSX:
slli.w t7, t0, 1
ldx.h t2, a0, t7
st.h t2, t3, 0
addi.d t3, t3, 2
addi.w t5, t5, 1
sll.w t2, t6, t0
or t4, t4, t2
bge zero, t4, .END_COEFF_LEVEL_RUN16_LSX
addi.w t0, t0, -1
slli.w t1, t1, 2
addi.w t1, t1, 4
sll.d t8, t8, t1
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LSX
.END_COEFF_LEVEL_RUN16_LSX:
st.w t4, a1, 4
move a0, t5
endfunc_x264
function_x264 coeff_level_run15_lsx
addi.w t0, zero, 15
vld vr0, a0, 0
vld vr1, a0, 16
vldi vr2, 1
vinsgr2vr.h vr1, zero, 7
vssrlni.bu.h vr0, vr0, 0
vssrlni.bu.h vr1, vr1, 0
vpermi.w vr1, vr0, 0x44
vsle.bu vr3, vr2, vr1
vsrlni.b.h vr3, vr3, 4
vpickve2gr.du t8, vr3, 0
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
st.w t0, a1, 0x00 // Store runlevel->last
addi.d t3, a1, 23
nor t2, zero, zero
addi.d t2, t2, -15
and t3, t3, t2 // runlevel->level
xor t4, t4, t4 // mask
xor t5, t5, t5 // total: number of non-zero elements
addi.w t6, zero, 1 // const 1
.LOOP_COEFF_LEVEL_RUN15_LSX:
slli.w t7, t0, 1
ldx.h t2, a0, t7
st.h t2, t3, 0
addi.d t3, t3, 2
addi.w t5, t5, 1
sll.w t2, t6, t0
or t4, t4, t2
bge zero, t4, .END_COEFF_LEVEL_RUN15_LSX
addi.w t0, t0, -1
slli.w t1, t1, 2
addi.w t1, t1, 4
sll.d t8, t8, t1
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LSX
.END_COEFF_LEVEL_RUN15_LSX:
st.w t4, a1, 4
move a0, t5
endfunc_x264
function_x264 coeff_level_run8_lsx
addi.w t0, zero, 15
vld vr0, a0, 0
vxor.v vr1, vr1, vr1
vldi vr2, 1
vssrlni.bu.h vr0, vr0, 0
vpermi.w vr1, vr0, 0x44
vsle.bu vr3, vr2, vr1
vsrlni.b.h vr3, vr3, 4
vpickve2gr.du t8, vr3, 0
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
st.w t0, a1, 0x00 // Store runlevel->last
addi.d t3, a1, 23
nor t2, zero, zero
addi.d t2, t2, -15
and t3, t3, t2 // runlevel->level
xor t4, t4, t4 // mask
xor t5, t5, t5 // total: number of non-zero elements
addi.w t6, zero, 1 // const 1
.LOOP_COEFF_LEVEL_RUN8_LSX:
slli.w t7, t0, 1
ldx.h t2, a0, t7
st.h t2, t3, 0
addi.d t3, t3, 2
addi.w t5, t5, 1
sll.w t2, t6, t0
or t4, t4, t2
bge zero, t4, .END_COEFF_LEVEL_RUN8_LSX
addi.w t0, t0, -1
slli.w t1, t1, 2
addi.w t1, t1, 4
sll.d t8, t8, t1
clz.d t1, t8
srai.w t1, t1, 2
sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit
bge t0, zero, .LOOP_COEFF_LEVEL_RUN8_LSX
.END_COEFF_LEVEL_RUN8_LSX:
st.w t4, a1, 4
move a0, t5
endfunc_x264