/*****************************************************************************
 * pixel-a.S: LoongArch pixel metrics
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Hecai Yuan <yuanhecai@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "loongson_asm.S"
#include "loongson_util.S"
#if !HIGH_BIT_DEPTH

const hmul_8p
.byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1
.byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1
endconst

const mask_ac4b
.short 0, -1, 0, -1, -1, -1, -1, -1
.short 0, -1, 0, -1, -1, -1, -1, -1
endconst

const mask_ac8
.short 0, -1, -1, -1, -1, -1, -1, -1
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
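
/* Note on the constants above: hmul_8p is the per-lane multiplier used with
 * xvdp2 so that each widening dot product yields interleaved pair sums
 * (+1,+1) and pair differences (+1,-1), and the mask_ac* patterns appear to
 * clear the DC terms (per 4x4 sub-block for mask_ac4b, per 8x8 block for
 * mask_ac8) before the AC sums are accumulated. */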

.macro LOAD_INC_8x4W n1, n2, n3, n4, n5
    vld            $vr\n1, a0, 0
    vldx           $vr\n2, a0, a1
    vldx           $vr\n3, a0, t0
    vldx           $vr\n4, a0, t1
    xvpermi.d      xr18, $xr\n1, 0x05
    xvpermi.d      xr19, $xr\n2, 0x05
    xvpermi.d      xr20, $xr\n3, 0x05
    xvpermi.d      xr21, $xr\n4, 0x05
    add.d          a0, a0, t2
    xvdp2.h.bu.b   $xr\n1, xr18, $xr\n5
    xvdp2.h.bu.b   $xr\n2, xr19, $xr\n5
    xvdp2.h.bu.b   $xr\n3, xr20, $xr\n5
    xvdp2.h.bu.b   $xr\n4, xr21, $xr\n5
.endm
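
/* LOAD_INC_8x4W above loads four 16-pixel rows, splats each 8-byte half of a
 * row across a 128-bit lane (xvpermi.d 0x05), and applies xvdp2.h.bu.b with
 * the hmul_8p pattern, so the first horizontal butterfly of the Hadamard
 * transform comes for free with the byte-to-halfword widening. */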

.macro SUMSUB_BADC a, b, c, d
    xvadd.h        \a, \a, \b
    xvadd.h        \c, \c, \d
    xvadd.h        \b, \b, \b
    xvadd.h        \d, \d, \d
    xvsub.h        \b, \b, \a
    xvsub.h        \d, \d, \c
.endm

.macro HADAMARD4_V a, b, c, d
    SUMSUB_BADC    \a, \b, \c, \d
    SUMSUB_BADC    \a, \c, \b, \d
.endm
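
/* SUMSUB_BADC performs two in-place butterflies, {a,b} <- {a+b, b-a} and
 * {c,d} <- {c+d, d-c}; doubling b and d first avoids needing a scratch
 * register. Applying it twice with swapped middle operands, as HADAMARD4_V
 * does, yields a 4-point Hadamard transform across the four registers. */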

.macro HADAMARD_1 a, b, tmp
    xmov           \tmp, \a
    xvpackod.h     \a, \b, \a
    xvpackev.h     \b, \b, \tmp
    xvadd.h        \tmp, \a, \b
    xvsub.h        \b, \b, \a
    xmov           \a, \tmp
.endm

.macro HADAMARD_2 a, b, c
    xvpickod.w     \c, \b, \a
    xvpickev.w     \a, \b, \a
    xvadda.h       \a, \a, xr17
    xvadda.h       \c, \c, xr17
    xvmax.h        \a, \a, \c
.endm
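
/* HADAMARD_1 butterflies adjacent halfword pairs via pack-even/pack-odd.
 * HADAMARD_2 relies on |x| + |y| == max(|x+y|, |x-y|): xr17 is kept zero, so
 * xvadda against it is a plain absolute value, and the final butterfly plus
 * absolute-sum collapse into one abs+max step. */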

.macro HADAMARD_AC_WXH_LASX w, h
function_x264 pixel_hadamard_ac_\w\()x\h\()_lasx
    add.d          t0, a1, a1
    add.d          t1, a1, t0
    add.d          t2, t1, a1
    xvxor.v        xr17, xr17, xr17
    move           t4, ra
    bl             x264_8_hadamard_ac_16x8_lasx
.if \h == 16
    xmov           xr11, xr9
    xmov           xr10, xr8
    bl             x264_8_hadamard_ac_16x8_lasx
    xvadd.h        xr9, xr9, xr11
    xvadd.h        xr8, xr8, xr10
.endif
    move           ra, t4
    xvhaddw.wu.hu  xr8, xr8, xr8
    xvhaddw.du.wu  xr8, xr8, xr8
    xvhaddw.qu.du  xr8, xr8, xr8
    xvpickve2gr.wu t0, xr8, 0
    xvpickve2gr.wu t1, xr8, 4
    add.d          t0, t0, t1
    xvhaddw.wu.hu  xr9, xr9, xr9
    xvhaddw.du.wu  xr9, xr9, xr9
    xvhaddw.qu.du  xr9, xr9, xr9
    xvpickve2gr.wu t1, xr9, 0
    xvpickve2gr.wu t2, xr9, 4
    add.d          t1, t1, t2
    srli.d         t0, t0, 2
    srli.d         t1, t1, 1
    slli.d         t0, t0, 32
    add.d          a0, t0, t1
endfunc_x264
.endm
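
/* The WxH wrapper above calls the 16x8 helper once or twice via bl, saving
 * ra in t4 around the calls since the helper is reached by branch-and-link.
 * The helper leaves its two running sums in xr8/xr9; the wrapper reduces
 * them, scales (>>2 and >>1), and packs them into the high and low halves of
 * the uint64_t return value. */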

function_x264 hadamard_ac_16x8_lasx
    /* Load the hmul_8p multiplier */
    la.local       t3, hmul_8p
    xvld           xr8, t3, 0
    LOAD_INC_8x4W  0, 1, 2, 3, 8
    HADAMARD4_V    xr0, xr1, xr2, xr3
    LOAD_INC_8x4W  4, 5, 6, 7, 8
    HADAMARD4_V    xr4, xr5, xr6, xr7
    HADAMARD_1     xr0, xr1, xr8
    HADAMARD_1     xr2, xr3, xr8
    xmov           xr18, xr1
    HADAMARD_1     xr4, xr5, xr8
    HADAMARD_1     xr6, xr7, xr8
    xmov           xr19, xr2
    xmov           xr20, xr3
    xvadda.h       xr1, xr0, xr4
    xvsub.h        xr21, xr4, xr0
    xvadd.h        xr0, xr4, xr0
    la.local       t3, mask_ac4b
    xvld           xr8, t3, 0
    xvand.v        xr1, xr1, xr8
    xvadda.h       xr1, xr1, xr5
    xvadda.h       xr1, xr1, xr18
    xvadda.h       xr1, xr1, xr19
    xvadda.h       xr1, xr1, xr20
    xvadda.h       xr1, xr1, xr6
    xvadda.h       xr9, xr1, xr7

    xvadd.h        xr3, xr7, xr20
    xvsub.h        xr7, xr7, xr20
    xvadd.h        xr2, xr6, xr19
    xvsub.h        xr6, xr6, xr19
    xvadd.h        xr1, xr5, xr18
    xvsub.h        xr5, xr5, xr18

    HADAMARD_2     xr3, xr7, xr18
    HADAMARD_2     xr2, xr6, xr19
    HADAMARD_2     xr1, xr5, xr20

    xvpickod.w     xr5, xr21, xr0
    xvpickev.w     xr0, xr21, xr0
    xmov           xr4, xr5
    xvadd.h        xr5, xr0, xr4
    xvsub.h        xr4, xr4, xr0

    xvadd.h        xr2, xr2, xr3
    xvadd.h        xr2, xr2, xr1
    xvadd.h        xr2, xr2, xr2

    la.local       t3, mask_ac8
    xvld           xr8, t3, 0
    xvand.v        xr0, xr5, xr8

    xvadda.h       xr2, xr2, xr4
    xvadda.h       xr8, xr2, xr0
endfunc_x264

HADAMARD_AC_WXH_LASX 16, 8
HADAMARD_AC_WXH_LASX 16, 16

/* uint64_t hadamard_ac_8x8_lasx(uint8_t *p_pix,
 *                               int32_t i_stride)
 */
function_x264 hadamard_ac_8x8_lasx
    /* Compute stride multiples */
    slli.d         t0, a1, 1
    add.d          t1, a1, t0
    slli.d         t2, a1, 2

    LSX_LOADX_4    a0, a1, t0, t1, vr0, vr1, vr2, vr3
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr4, vr5, vr6, vr7

    vilvl.d        vr8, vr1, vr0
    vilvl.d        vr9, vr3, vr2
    vilvl.d        vr10, vr5, vr4
    vilvl.d        vr11, vr7, vr6
    xvpermi.q      xr8, xr10, 0x02
    xvpermi.q      xr9, xr11, 0x02
    xvpickev.b     xr12, xr9, xr8
    xvpickod.b     xr13, xr9, xr8
    xvaddwev.h.bu  xr8, xr12, xr13
    xvaddwod.h.bu  xr9, xr12, xr13
    xvsubwev.h.bu  xr10, xr12, xr13
    xvsubwod.h.bu  xr11, xr12, xr13
    xvadd.h        xr12, xr8, xr9
    xvadd.h        xr13, xr10, xr11
    xvsub.h        xr14, xr8, xr9
    xvsub.h        xr15, xr10, xr11

    xvilvl.h       xr8, xr13, xr12
    xvilvh.h       xr9, xr13, xr12
    xvilvl.h       xr10, xr15, xr14
    xvilvh.h       xr11, xr15, xr14
    xvilvl.w       xr12, xr10, xr8
    xvilvh.w       xr13, xr10, xr8
    xvilvl.w       xr14, xr11, xr9
    xvilvh.w       xr15, xr11, xr9
    xvadd.h        xr8, xr12, xr13
    xvadd.h        xr9, xr14, xr15
    xvsub.h        xr10, xr12, xr13
    xvsub.h        xr11, xr14, xr15
    xvadd.h        xr12, xr8, xr9
    xvadd.h        xr13, xr10, xr11
    xvsub.h        xr14, xr8, xr9
    xvsub.h        xr15, xr10, xr11

    vpickve2gr.hu  t3, vr12, 0
    vpickve2gr.hu  t4, vr12, 4
    xvor.v         xr16, xr12, xr12
    xvpermi.q      xr16, xr16, 0x31
    vpickve2gr.hu  t5, vr16, 0
    vpickve2gr.hu  t6, vr16, 4
    add.d          t3, t3, t4
    add.d          t5, t5, t6
    add.d          t3, t3, t5

    xvadda.h       xr16, xr12, xr13
    xvadda.h       xr18, xr14, xr15
    xvadd.h        xr16, xr16, xr18
    xvpermi.d      xr17, xr16, 0x4e
    xvadd.h        xr18, xr16, xr17
    xvhaddw.wu.hu  xr18, xr18, xr18
    xvhaddw.du.wu  xr18, xr18, xr18
    xvhaddw.qu.du  xr18, xr18, xr18
    xvpickve2gr.wu t4, xr18, 0

    xvpackev.h     xr8, xr13, xr12
    xvpackev.h     xr9, xr15, xr14
    xvpackod.h     xr10, xr13, xr12
    xvpackod.h     xr11, xr15, xr14
    xvilvl.d       xr12, xr9, xr8
    xvilvh.d       xr13, xr9, xr8
    xvilvl.d       xr14, xr11, xr10
    xvilvh.d       xr15, xr11, xr10
    xvor.v         xr16, xr12, xr12
    xvor.v         xr17, xr13, xr13
    xvpermi.q      xr12, xr14, 0x02
    xvpermi.q      xr13, xr14, 0x12
    xvpermi.q      xr16, xr15, 0x03
    xvpermi.q      xr17, xr15, 0x13

    xvadd.h        xr8, xr12, xr13
    xvsub.h        xr9, xr12, xr13
    xvadd.h        xr10, xr16, xr17
    xvsub.h        xr11, xr16, xr17
    xvadd.h        xr12, xr8, xr10
    xvadd.h        xr13, xr9, xr11
    xvsub.h        xr14, xr8, xr10
    xvsub.h        xr15, xr9, xr11
    xvadda.h       xr16, xr12, xr13
    xvadda.h       xr17, xr14, xr15
    xvadd.h        xr18, xr16, xr17
    xvpermi.d      xr19, xr18, 0x4e
    xvadd.d        xr19, xr18, xr19
    xvhaddw.wu.hu  xr19, xr19, xr19
    xvhaddw.du.wu  xr19, xr19, xr19
    xvhaddw.qu.du  xr19, xr19, xr19
    xvpickve2gr.wu t5, xr19, 0

    sub.d          t4, t4, t3
    sub.d          t5, t5, t3
    slli.d         t5, t5, 32
    add.d          a0, t5, t4
endfunc_x264
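
/* All of the satd functions below follow the same plan: widen fenc-fdec
 * differences to halfwords (xvsubwev/xvsubwod), run a 4x4 Hadamard transform
 * with add/sub butterflies and pack/ilv transposes, then halve the absolute
 * sum. A scalar sketch of the 4x4 contract, for orientation only (not code
 * from this file):
 *
 *     static int satd_4x4_ref( const uint8_t *p1, intptr_t s1,
 *                              const uint8_t *p2, intptr_t s2 )
 *     {
 *         int t[4][4], sum = 0;
 *         for( int y = 0; y < 4; y++, p1 += s1, p2 += s2 )
 *         {
 *             int a0 = p1[0] - p2[0], a1 = p1[1] - p2[1];
 *             int a2 = p1[2] - p2[2], a3 = p1[3] - p2[3];
 *             t[y][0] = a0 + a1 + (a2 + a3); t[y][1] = a0 + a1 - (a2 + a3);
 *             t[y][2] = a0 - a1 + (a2 - a3); t[y][3] = a0 - a1 - (a2 - a3);
 *         }
 *         for( int x = 0; x < 4; x++ )
 *         {
 *             int a0 = t[0][x] + t[1][x] + (t[2][x] + t[3][x]);
 *             int a1 = t[0][x] + t[1][x] - (t[2][x] + t[3][x]);
 *             int a2 = t[0][x] - t[1][x] + (t[2][x] - t[3][x]);
 *             int a3 = t[0][x] - t[1][x] - (t[2][x] - t[3][x]);
 *             sum += abs(a0) + abs(a1) + abs(a2) + abs(a3);
 *         }
 *         return sum >> 1;  // SATD is half of the abs-sum of the transform
 *     }
 */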

/* int x264_pixel_satd_16x16_lasx(pixel *pix1, intptr_t i_pix1,
 *                                pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_satd_16x16_lasx
    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    slli.d         t4, a1, 2
    slli.d         t5, a3, 2
    add.d          t6, a1, t2
    add.d          t7, a3, t3

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t6, vr0, vr1, vr2, vr3
    add.d          a0, a0, t4
    LSX_LOADX_4    a0, a1, t2, t6, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t3, t7, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t7, vr12, vr13, vr14, vr15
    xvpermi.q      xr0, xr4, 0x02
    xvpermi.q      xr1, xr5, 0x02
    xvpermi.q      xr2, xr6, 0x02
    xvpermi.q      xr3, xr7, 0x02
    xvpermi.q      xr8, xr12, 0x02
    xvpermi.q      xr9, xr13, 0x02
    xvpermi.q      xr10, xr14, 0x02
    xvpermi.q      xr11, xr15, 0x02

    // HADAMARD4
    xvsubwev.h.bu  xr4, xr0, xr8
    xvsubwod.h.bu  xr5, xr0, xr8
    xvsubwev.h.bu  xr6, xr1, xr9
    xvsubwod.h.bu  xr7, xr1, xr9
    xvsubwev.h.bu  xr8, xr2, xr10
    xvsubwod.h.bu  xr9, xr2, xr10
    xvsubwev.h.bu  xr12, xr3, xr11
    xvsubwod.h.bu  xr13, xr3, xr11
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr8, xr9
    xvsub.h        xr5, xr8, xr9
    xvadd.h        xr6, xr12, xr13
    xvsub.h        xr7, xr12, xr13
    xvpackev.h     xr8, xr5, xr4
    xvpackod.h     xr9, xr5, xr4
    xvpackev.h     xr10, xr7, xr6
    xvpackod.h     xr11, xr7, xr6
    xvpackev.h     xr4, xr1, xr0
    xvpackod.h     xr5, xr1, xr0
    xvpackev.h     xr6, xr3, xr2
    xvpackod.h     xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr8, xr9
    xvsub.h        xr5, xr8, xr9
    xvadd.h        xr6, xr10, xr11
    xvsub.h        xr7, xr10, xr11
    xvilvl.h       xr8, xr1, xr0
    xvilvl.h       xr9, xr3, xr2
    xvilvl.h       xr10, xr5, xr4
    xvilvl.h       xr11, xr7, xr6
    xvilvh.h       xr0, xr1, xr0
    xvilvh.h       xr1, xr3, xr2
    xvilvh.h       xr2, xr5, xr4
    xvilvh.h       xr3, xr7, xr6
    xvadd.h        xr4, xr8, xr9
    xvadd.h        xr6, xr10, xr11
    xvsub.h        xr5, xr8, xr9
    xvsub.h        xr7, xr10, xr11
    xvadd.h        xr8, xr4, xr6
    xvadd.h        xr9, xr5, xr7
    xvsub.h        xr10, xr4, xr6
    xvsub.h        xr11, xr5, xr7
    xvadd.h        xr4, xr0, xr1
    xvadd.h        xr6, xr2, xr3
    xvsub.h        xr5, xr0, xr1
    xvsub.h        xr7, xr2, xr3
    xvadd.h        xr0, xr4, xr6
    xvadd.h        xr1, xr5, xr7
    xvsub.h        xr2, xr4, xr6
    xvsub.h        xr3, xr5, xr7
    xvadda.h       xr8, xr8, xr9
    xvadda.h       xr9, xr10, xr11
    xvadda.h       xr0, xr0, xr1
    xvadda.h       xr1, xr2, xr3
    xvadd.h        xr8, xr8, xr9
    xvadd.h        xr0, xr0, xr1
    xvadd.h        xr16, xr0, xr8

    add.d          a0, a0, t4
    add.d          a2, a2, t5
    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t6, vr0, vr1, vr2, vr3
    add.d          a0, a0, t4
    LSX_LOADX_4    a0, a1, t2, t6, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t3, t7, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t7, vr12, vr13, vr14, vr15
    xvpermi.q      xr0, xr4, 0x02
    xvpermi.q      xr1, xr5, 0x02
    xvpermi.q      xr2, xr6, 0x02
    xvpermi.q      xr3, xr7, 0x02
    xvpermi.q      xr8, xr12, 0x02
    xvpermi.q      xr9, xr13, 0x02
    xvpermi.q      xr10, xr14, 0x02
    xvpermi.q      xr11, xr15, 0x02

    // HADAMARD4
    xvsubwev.h.bu  xr4, xr0, xr8
    xvsubwod.h.bu  xr5, xr0, xr8
    xvsubwev.h.bu  xr6, xr1, xr9
    xvsubwod.h.bu  xr7, xr1, xr9
    xvsubwev.h.bu  xr8, xr2, xr10
    xvsubwod.h.bu  xr9, xr2, xr10
    xvsubwev.h.bu  xr12, xr3, xr11
    xvsubwod.h.bu  xr13, xr3, xr11
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr8, xr9
    xvsub.h        xr5, xr8, xr9
    xvadd.h        xr6, xr12, xr13
    xvsub.h        xr7, xr12, xr13
    xvpackev.h     xr8, xr5, xr4
    xvpackod.h     xr9, xr5, xr4
    xvpackev.h     xr10, xr7, xr6
    xvpackod.h     xr11, xr7, xr6
    xvpackev.h     xr4, xr1, xr0
    xvpackod.h     xr5, xr1, xr0
    xvpackev.h     xr6, xr3, xr2
    xvpackod.h     xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr8, xr9
    xvsub.h        xr5, xr8, xr9
    xvadd.h        xr6, xr10, xr11
    xvsub.h        xr7, xr10, xr11
    xvilvl.h       xr8, xr1, xr0
    xvilvl.h       xr9, xr3, xr2
    xvilvl.h       xr10, xr5, xr4
    xvilvl.h       xr11, xr7, xr6
    xvilvh.h       xr0, xr1, xr0
    xvilvh.h       xr1, xr3, xr2
    xvilvh.h       xr2, xr5, xr4
    xvilvh.h       xr3, xr7, xr6
    xvadd.h        xr4, xr8, xr9
    xvadd.h        xr6, xr10, xr11
    xvsub.h        xr5, xr8, xr9
    xvsub.h        xr7, xr10, xr11
    xvadd.h        xr8, xr4, xr6
    xvadd.h        xr9, xr5, xr7
    xvsub.h        xr10, xr4, xr6
    xvsub.h        xr11, xr5, xr7
    xvadd.h        xr4, xr0, xr1
    xvadd.h        xr6, xr2, xr3
    xvsub.h        xr5, xr0, xr1
    xvsub.h        xr7, xr2, xr3
    xvadd.h        xr0, xr4, xr6
    xvadd.h        xr1, xr5, xr7
    xvsub.h        xr2, xr4, xr6
    xvsub.h        xr3, xr5, xr7
    xvadda.h       xr8, xr8, xr9
    xvadda.h       xr9, xr10, xr11
    xvadda.h       xr0, xr0, xr1
    xvadda.h       xr1, xr2, xr3
    xvadd.h        xr8, xr8, xr9
    xvadd.h        xr0, xr0, xr1
    xvadd.h        xr0, xr0, xr8
    xvadd.h        xr0, xr0, xr16

    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.du.wu  xr0, xr0, xr0
    xvhaddw.qu.du  xr0, xr0, xr0
    xvpickve2gr.wu t0, xr0, 0
    xvpickve2gr.wu t1, xr0, 4
    add.w          t0, t0, t1
    srli.d         a0, t0, 1
endfunc_x264
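
/* The 16x16 routine above runs the 16x8 kernel twice, parking the first
 * half's column sums in xr16 and folding them in before the final
 * horizontal reduction. */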

/* int x264_pixel_satd_16x8_lasx(pixel *pix1, intptr_t i_pix1,
 *                               pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_satd_16x8_lasx
    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    slli.d         t4, t2, 1
    slli.d         t5, t3, 1
    add.d          t6, a1, t2
    add.d          t7, a3, t3

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t6, vr0, vr1, vr2, vr3
    add.d          a0, a0, t4
    LSX_LOADX_4    a0, a1, t2, t6, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t3, t7, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t7, vr12, vr13, vr14, vr15
    xvpermi.q      xr0, xr4, 0x02
    xvpermi.q      xr1, xr5, 0x02
    xvpermi.q      xr2, xr6, 0x02
    xvpermi.q      xr3, xr7, 0x02
    xvpermi.q      xr8, xr12, 0x02
    xvpermi.q      xr9, xr13, 0x02
    xvpermi.q      xr10, xr14, 0x02
    xvpermi.q      xr11, xr15, 0x02

    // HADAMARD4
    xvsubwev.h.bu  xr4, xr0, xr8
    xvsubwod.h.bu  xr5, xr0, xr8
    xvsubwev.h.bu  xr6, xr1, xr9
    xvsubwod.h.bu  xr7, xr1, xr9
    xvsubwev.h.bu  xr8, xr2, xr10
    xvsubwod.h.bu  xr9, xr2, xr10
    xvsubwev.h.bu  xr12, xr3, xr11
    xvsubwod.h.bu  xr13, xr3, xr11
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr8, xr9
    xvsub.h        xr5, xr8, xr9
    xvadd.h        xr6, xr12, xr13
    xvsub.h        xr7, xr12, xr13
    xvpackev.h     xr8, xr5, xr4
    xvpackod.h     xr9, xr5, xr4
    xvpackev.h     xr10, xr7, xr6
    xvpackod.h     xr11, xr7, xr6
    xvpackev.h     xr4, xr1, xr0
    xvpackod.h     xr5, xr1, xr0
    xvpackev.h     xr6, xr3, xr2
    xvpackod.h     xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr8, xr9
    xvsub.h        xr5, xr8, xr9
    xvadd.h        xr6, xr10, xr11
    xvsub.h        xr7, xr10, xr11
    xvilvl.h       xr8, xr1, xr0
    xvilvl.h       xr9, xr3, xr2
    xvilvl.h       xr10, xr5, xr4
    xvilvl.h       xr11, xr7, xr6
    xvilvh.h       xr0, xr1, xr0
    xvilvh.h       xr1, xr3, xr2
    xvilvh.h       xr2, xr5, xr4
    xvilvh.h       xr3, xr7, xr6
    xvadd.h        xr4, xr8, xr9
    xvadd.h        xr6, xr10, xr11
    xvsub.h        xr5, xr8, xr9
    xvsub.h        xr7, xr10, xr11
    xvadd.h        xr8, xr4, xr6
    xvadd.h        xr9, xr5, xr7
    xvsub.h        xr10, xr4, xr6
    xvsub.h        xr11, xr5, xr7
    xvadd.h        xr4, xr0, xr1
    xvadd.h        xr6, xr2, xr3
    xvsub.h        xr5, xr0, xr1
    xvsub.h        xr7, xr2, xr3
    xvadd.h        xr0, xr4, xr6
    xvadd.h        xr1, xr5, xr7
    xvsub.h        xr2, xr4, xr6
    xvsub.h        xr3, xr5, xr7
    xvadda.h       xr8, xr8, xr9
    xvadda.h       xr9, xr10, xr11
    xvadda.h       xr0, xr0, xr1
    xvadda.h       xr1, xr2, xr3
    xvadd.h        xr8, xr8, xr9
    xvadd.h        xr0, xr0, xr1
    xvadd.h        xr0, xr0, xr8

    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.du.wu  xr0, xr0, xr0
    xvhaddw.qu.du  xr0, xr0, xr0
    xvpickve2gr.wu t0, xr0, 0
    xvpickve2gr.wu t1, xr0, 4
    add.w          t0, t0, t1
    srli.d         a0, t0, 1
endfunc_x264
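
/* satd_16x8 is the single-pass body of the 16x16 routine: one 16x8 tile,
 * with no xr16 carry between passes. */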

/* int x264_pixel_satd_8x16_lasx(pixel *pix1, intptr_t i_pix1,
 *                               pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_satd_8x16_lasx
    slli.d         t2, a1, 1
    add.d          t3, a1, t2
    slli.d         t4, a1, 2
    slli.d         t5, a3, 1
    add.d          t6, a3, t5
    slli.d         t7, a3, 2

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t3, vr0, vr1, vr2, vr3
    add.d          a0, a0, t4
    LSX_LOADX_4    a0, a1, t2, t3, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t5, t6, vr8, vr9, vr10, vr11
    add.d          a2, a2, t7
    LSX_LOADX_4    a2, a3, t5, t6, vr12, vr13, vr14, vr15
    vilvl.d        vr0, vr1, vr0
    vilvl.d        vr1, vr3, vr2
    vilvl.d        vr2, vr5, vr4
    vilvl.d        vr3, vr7, vr6
    xvpermi.q      xr0, xr2, 0x02
    xvpermi.q      xr1, xr3, 0x02
    vilvl.d        vr2, vr9, vr8
    vilvl.d        vr3, vr11, vr10
    vilvl.d        vr4, vr13, vr12
    vilvl.d        vr5, vr15, vr14
    xvpermi.q      xr2, xr4, 0x02
    xvpermi.q      xr3, xr5, 0x02

    // HADAMARD4
    xvsubwev.h.bu  xr4, xr0, xr2
    xvsubwod.h.bu  xr5, xr0, xr2
    xvsubwev.h.bu  xr6, xr1, xr3
    xvsubwod.h.bu  xr7, xr1, xr3
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvpackev.h     xr4, xr1, xr0
    xvpackod.h     xr5, xr1, xr0
    xvpackev.h     xr6, xr3, xr2
    xvpackod.h     xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvilvl.h       xr4, xr1, xr0
    xvilvh.h       xr5, xr1, xr0
    xvilvl.h       xr6, xr3, xr2
    xvilvh.h       xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr1, xr4, xr5
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr0, xr2
    xvadd.h        xr5, xr1, xr3
    xvsub.h        xr6, xr0, xr2
    xvsub.h        xr7, xr1, xr3
    xvadda.h       xr0, xr4, xr5
    xvadda.h       xr1, xr6, xr7
    xvadd.h        xr16, xr0, xr1
    add.d          a0, a0, t4
    add.d          a2, a2, t7

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t3, vr0, vr1, vr2, vr3
    add.d          a0, a0, t4
    LSX_LOADX_4    a0, a1, t2, t3, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t5, t6, vr8, vr9, vr10, vr11
    add.d          a2, a2, t7
    LSX_LOADX_4    a2, a3, t5, t6, vr12, vr13, vr14, vr15
    vilvl.d        vr0, vr1, vr0
    vilvl.d        vr1, vr3, vr2
    vilvl.d        vr2, vr5, vr4
    vilvl.d        vr3, vr7, vr6
    xvpermi.q      xr0, xr2, 0x02
    xvpermi.q      xr1, xr3, 0x02
    vilvl.d        vr2, vr9, vr8
    vilvl.d        vr3, vr11, vr10
    vilvl.d        vr4, vr13, vr12
    vilvl.d        vr5, vr15, vr14
    xvpermi.q      xr2, xr4, 0x02
    xvpermi.q      xr3, xr5, 0x02

    // HADAMARD4
    xvsubwev.h.bu  xr4, xr0, xr2
    xvsubwod.h.bu  xr5, xr0, xr2
    xvsubwev.h.bu  xr6, xr1, xr3
    xvsubwod.h.bu  xr7, xr1, xr3
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvpackev.h     xr4, xr1, xr0
    xvpackod.h     xr5, xr1, xr0
    xvpackev.h     xr6, xr3, xr2
    xvpackod.h     xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvilvl.h       xr4, xr1, xr0
    xvilvh.h       xr5, xr1, xr0
    xvilvl.h       xr6, xr3, xr2
    xvilvh.h       xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr1, xr4, xr5
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr0, xr2
    xvadd.h        xr5, xr1, xr3
    xvsub.h        xr6, xr0, xr2
    xvsub.h        xr7, xr1, xr3
    xvadda.h       xr0, xr4, xr5
    xvadda.h       xr1, xr6, xr7
    xvadd.h        xr0, xr0, xr1
    xvadd.h        xr0, xr0, xr16
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.du.wu  xr0, xr0, xr0
    xvhaddw.qu.du  xr0, xr0, xr0
    xvpickve2gr.wu t0, xr0, 0
    xvpickve2gr.wu t1, xr0, 4
    add.w          t0, t0, t1
    srli.d         a0, t0, 1
endfunc_x264

/* int x264_pixel_satd_8x8_lasx(pixel *pix1, intptr_t i_pix1,
 *                              pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_satd_8x8_lasx
    slli.d         t2, a1, 1
    slli.d         t5, a3, 1
    add.d          t3, a1, t2
    add.d          t6, a3, t5
    slli.d         t4, t2, 1
    slli.d         t7, t5, 1
    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t3, vr0, vr1, vr2, vr3
    add.d          a0, a0, t4
    LSX_LOADX_4    a0, a1, t2, t3, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t5, t6, vr8, vr9, vr10, vr11
    add.d          a2, a2, t7
    LSX_LOADX_4    a2, a3, t5, t6, vr12, vr13, vr14, vr15

    vilvl.d        vr0, vr1, vr0
    vilvl.d        vr1, vr3, vr2
    vilvl.d        vr2, vr5, vr4
    vilvl.d        vr3, vr7, vr6
    xvpermi.q      xr0, xr2, 0x02
    xvpermi.q      xr1, xr3, 0x02
    vilvl.d        vr2, vr9, vr8
    vilvl.d        vr3, vr11, vr10
    vilvl.d        vr4, vr13, vr12
    vilvl.d        vr5, vr15, vr14
    xvpermi.q      xr2, xr4, 0x02
    xvpermi.q      xr3, xr5, 0x02

    // HADAMARD4
    xvsubwev.h.bu  xr4, xr0, xr2
    xvsubwod.h.bu  xr5, xr0, xr2
    xvsubwev.h.bu  xr6, xr1, xr3
    xvsubwod.h.bu  xr7, xr1, xr3
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvpackev.h     xr4, xr1, xr0
    xvpackod.h     xr5, xr1, xr0
    xvpackev.h     xr6, xr3, xr2
    xvpackod.h     xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvsub.h        xr1, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr3, xr6, xr7
    xvilvl.h       xr4, xr1, xr0
    xvilvh.h       xr5, xr1, xr0
    xvilvl.h       xr6, xr3, xr2
    xvilvh.h       xr7, xr3, xr2
    xvadd.h        xr0, xr4, xr5
    xvadd.h        xr2, xr6, xr7
    xvsub.h        xr1, xr4, xr5
    xvsub.h        xr3, xr6, xr7
    xvadd.h        xr4, xr0, xr2
    xvadd.h        xr5, xr1, xr3
    xvsub.h        xr6, xr0, xr2
    xvsub.h        xr7, xr1, xr3
    xvadda.h       xr0, xr4, xr5
    xvadda.h       xr1, xr6, xr7
    xvadd.h        xr0, xr0, xr1
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.du.wu  xr0, xr0, xr0
    xvhaddw.qu.du  xr0, xr0, xr0
    xvpickve2gr.wu t0, xr0, 0
    xvpickve2gr.wu t1, xr0, 4
    add.w          t0, t0, t1
    srli.d         a0, t0, 1
endfunc_x264

/* int x264_pixel_satd_8x4_lasx(pixel *pix1, intptr_t i_pix1,
 *                              pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_satd_8x4_lasx
    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    add.d          t4, a1, t2
    add.d          t5, a3, t3

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t4, vr1, vr2, vr3, vr4
    LSX_LOADX_4    a2, a3, t3, t5, vr5, vr6, vr7, vr8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr9, xr11, xr13
    xvsub.h        xr10, xr11, xr13
    xvpackev.d     xr11, xr10, xr9
    xvpackod.d     xr12, xr10, xr9
    xvadda.h       xr11, xr11, xr12
    xvhaddw.wu.hu  xr11, xr11, xr11
    xvhaddw.du.wu  xr11, xr11, xr11
    xvhaddw.qu.du  xr11, xr11, xr11
    xvpickve2gr.wu t4, xr11, 0
    xvpickve2gr.wu t5, xr11, 4
    add.d          t4, t4, t5
    srli.d         a0, t4, 1
endfunc_x264
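
/* For 8x4 the whole tile fits in one 256-bit register per picture: the two
 * four-row halves are merged with vilvl.d/xvpermi.q, and both passes of the
 * transform run entirely on pack-even/pack-odd shuffles. */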

/* int x264_pixel_satd_4x16_lasx(pixel *pix1, intptr_t i_pix1,
 *                               pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_satd_4x16_lasx
    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    add.d          t4, a1, t2
    add.d          t5, a3, t3
    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t4, vr1, vr2, vr3, vr4
    LSX_LOADX_4    a2, a3, t3, t5, vr5, vr6, vr7, vr8
    vilvl.w        vr1, vr2, vr1
    vilvl.w        vr3, vr4, vr3
    vilvl.d        vr9, vr3, vr1
    vilvl.w        vr5, vr6, vr5
    vilvl.w        vr7, vr8, vr7
    vilvl.d        vr10, vr7, vr5

    slli.d         t0, a1, 2
    slli.d         t1, a3, 2
    // Load data from pix1 and pix2
    add.d          a0, a0, t0
    LSX_LOADX_4    a0, a1, t2, t4, vr1, vr2, vr3, vr4
    add.d          a2, a2, t1
    LSX_LOADX_4    a2, a3, t3, t5, vr5, vr6, vr7, vr8
    vilvl.w        vr1, vr2, vr1
    vilvl.w        vr3, vr4, vr3
    vilvl.d        vr1, vr3, vr1
    vilvl.w        vr5, vr6, vr5
    vilvl.w        vr7, vr8, vr7
    vilvl.d        vr5, vr7, vr5
    xvpermi.q      xr1, xr9, 0x20
    xvpermi.q      xr5, xr10, 0x20

    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* b0 + b1 */
    xvsub.h        xr12, xr9, xr10    /* b0 - b1 */
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadda.h       xr9, xr9, xr10
    xvhaddw.wu.hu  xr9, xr9, xr9
    xvhaddw.du.wu  xr9, xr9, xr9
    xvhaddw.qu.du  xr9, xr9, xr9
    xvpickve2gr.wu t6, xr9, 0
    xvpickve2gr.wu t7, xr9, 4
    add.d          t7, t6, t7

    // Load data from pix1 and pix2
    add.d          a0, a0, t0
    LSX_LOADX_4    a0, a1, t2, t4, vr1, vr2, vr3, vr4
    add.d          a2, a2, t1
    LSX_LOADX_4    a2, a3, t3, t5, vr5, vr6, vr7, vr8
    vilvl.w        vr1, vr2, vr1
    vilvl.w        vr3, vr4, vr3
    vilvl.d        vr9, vr3, vr1
    vilvl.w        vr5, vr6, vr5
    vilvl.w        vr7, vr8, vr7
    vilvl.d        vr10, vr7, vr5

    // Load data from pix1 and pix2
    add.d          a0, a0, t0
    LSX_LOADX_4    a0, a1, t2, t4, vr1, vr2, vr3, vr4
    add.d          a2, a2, t1
    LSX_LOADX_4    a2, a3, t3, t5, vr5, vr6, vr7, vr8
    vilvl.w        vr1, vr2, vr1
    vilvl.w        vr3, vr4, vr3
    vilvl.d        vr1, vr3, vr1
    vilvl.w        vr5, vr6, vr5
    vilvl.w        vr7, vr8, vr7
    vilvl.d        vr5, vr7, vr5
    xvpermi.q      xr1, xr9, 0x20
    xvpermi.q      xr5, xr10, 0x20

    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* b0 + b1 */
    xvsub.h        xr12, xr9, xr10    /* b0 - b1 */
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadda.h       xr9, xr9, xr10
    xvhaddw.wu.hu  xr9, xr9, xr9
    xvhaddw.du.wu  xr9, xr9, xr9
    xvhaddw.qu.du  xr9, xr9, xr9
    xvpickve2gr.wu t6, xr9, 0
    xvpickve2gr.wu t5, xr9, 4
    add.d          t6, t5, t6
    add.d          t7, t6, t7
    srli.d         a0, t7, 1
endfunc_x264

/* int x264_pixel_satd_4x8_lasx(pixel *pix1, intptr_t i_pix1,
 *                              pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_satd_4x8_lasx
    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    add.d          t4, a1, t2
    add.d          t5, a3, t3
    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t4, vr1, vr2, vr3, vr4
    LSX_LOADX_4    a2, a3, t3, t5, vr5, vr6, vr7, vr8
    vilvl.w        vr1, vr2, vr1
    vilvl.w        vr3, vr4, vr3
    vilvl.d        vr9, vr3, vr1
    vilvl.w        vr5, vr6, vr5
    vilvl.w        vr7, vr8, vr7
    vilvl.d        vr10, vr7, vr5

    slli.d         t0, a1, 2
    slli.d         t1, a3, 2
    add.d          a0, a0, t0
    add.d          a2, a2, t1
    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t2, t4, vr1, vr2, vr3, vr4
    LSX_LOADX_4    a2, a3, t3, t5, vr5, vr6, vr7, vr8
    vilvl.w        vr1, vr2, vr1
    vilvl.w        vr3, vr4, vr3
    vilvl.d        vr1, vr3, vr1
    vilvl.w        vr5, vr6, vr5
    vilvl.w        vr7, vr8, vr7
    vilvl.d        vr5, vr7, vr5
    xvpermi.q      xr1, xr9, 0x20
    xvpermi.q      xr5, xr10, 0x20

    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* b0 + b1 */
    xvsub.h        xr12, xr9, xr10    /* b0 - b1 */
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadda.h       xr9, xr9, xr10
    xvhaddw.wu.hu  xr9, xr9, xr9
    xvhaddw.du.wu  xr9, xr9, xr9
    xvhaddw.qu.du  xr9, xr9, xr9
    xvpickve2gr.wu t6, xr9, 0
    xvpickve2gr.wu t7, xr9, 4
    add.d          t6, t6, t7
    srli.d         a0, t6, 1
endfunc_x264

/* int x264_pixel_satd_4x4_lsx(pixel *pix1, intptr_t i_pix1,
 *                             pixel *pix2, intptr_t i_pix2)
 */
.macro pixel_satd_4x4_lsx_core out
    vilvl.w        vr1, vr2, vr1
    vilvl.w        vr3, vr4, vr3
    vilvl.d        vr1, vr3, vr1
    vilvl.w        vr5, vr6, vr5
    vilvl.w        vr7, vr8, vr7
    vilvl.d        vr5, vr7, vr5

    vsubwev.h.bu   vr9, vr1, vr5
    vsubwod.h.bu   vr10, vr1, vr5
    vadd.h         vr11, vr9, vr10    /* a0 + a1 */
    vsub.h         vr12, vr9, vr10    /* a0 - a1 */
    vpackev.h      vr9, vr12, vr11
    vpackod.h      vr10, vr12, vr11
    vadd.h         vr11, vr9, vr10    /* b0 + b1 */
    vsub.h         vr12, vr9, vr10    /* b0 - b1 */
    vpackev.w      vr9, vr12, vr11
    vpackod.w      vr10, vr12, vr11
    vadd.h         vr11, vr9, vr10    /* HADAMARD4 */
    vsub.h         vr12, vr9, vr10
    vpackev.d      vr9, vr12, vr11
    vpackod.d      vr10, vr12, vr11
    vadd.h         vr11, vr9, vr10
    vsub.h         vr12, vr9, vr10
    vpackev.d      vr9, vr12, vr11
    vpackod.d      vr10, vr12, vr11
    vadda.h        \out, vr9, vr10
.endm

function_x264 pixel_satd_4x4_lsx
    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    add.d          t4, a1, t2
    add.d          t5, a3, t3

    // Load data from pix1 and pix2
    FLDS_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDS_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    pixel_satd_4x4_lsx_core vr13
    vhaddw.wu.hu   vr13, vr13, vr13
    vhaddw.du.wu   vr13, vr13, vr13
    vhaddw.qu.du   vr13, vr13, vr13
    vpickve2gr.wu  t5, vr13, 0
    srli.d         a0, t5, 1
endfunc_x264
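
/* The pixel_ssd_* functions below compute a plain sum of squared
 * differences. Scalar sketch of the contract (illustrative, not code from
 * this file):
 *
 *     int ssd = 0;
 *     for( int y = 0; y < h; y++ )
 *         for( int x = 0; x < w; x++ )
 *         {
 *             int d = pix1[y*stride_pix1 + x] - pix2[y*stride_pix2 + x];
 *             ssd += d * d;
 *         }
 *
 * The vector versions widen bytes to halfwords (vext2xv.hu.bu), square with
 * xvmul.h, and reduce with the xvhaddw widening horizontal adds. */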

/*
 * int pixel_ssd_16x16_lasx(const Pixel *pix1, intptr_t stride_pix1,
 *                          const Pixel *pix2, intptr_t stride_pix2)
 */
function_x264 pixel_ssd_16x16_lasx
    slli.d         t0, a1, 1
    add.d          t1, a1, t0
    add.d          t2, a1, t1
    slli.d         t3, a3, 1
    add.d          t4, a3, t3
    add.d          t5, a3, t4

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t0, t1, vr0, vr1, vr2, vr3
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t3, t4, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr12, vr13, vr14, vr15
    vext2xv.hu.bu  xr0, xr0
    vext2xv.hu.bu  xr1, xr1
    vext2xv.hu.bu  xr2, xr2
    vext2xv.hu.bu  xr3, xr3
    vext2xv.hu.bu  xr4, xr4
    vext2xv.hu.bu  xr5, xr5
    vext2xv.hu.bu  xr6, xr6
    vext2xv.hu.bu  xr7, xr7
    vext2xv.hu.bu  xr8, xr8
    vext2xv.hu.bu  xr9, xr9
    vext2xv.hu.bu  xr10, xr10
    vext2xv.hu.bu  xr11, xr11
    vext2xv.hu.bu  xr12, xr12
    vext2xv.hu.bu  xr13, xr13
    vext2xv.hu.bu  xr14, xr14
    vext2xv.hu.bu  xr15, xr15

    // Calculate the square of the difference
    xvsub.h        xr0, xr0, xr8
    xvsub.h        xr1, xr1, xr9
    xvsub.h        xr2, xr2, xr10
    xvsub.h        xr3, xr3, xr11
    xvsub.h        xr4, xr4, xr12
    xvsub.h        xr5, xr5, xr13
    xvsub.h        xr6, xr6, xr14
    xvsub.h        xr7, xr7, xr15
    xvmul.h        xr0, xr0, xr0
    xvmul.h        xr1, xr1, xr1
    xvmul.h        xr2, xr2, xr2
    xvmul.h        xr3, xr3, xr3
    xvmul.h        xr4, xr4, xr4
    xvmul.h        xr5, xr5, xr5
    xvmul.h        xr6, xr6, xr6
    xvmul.h        xr7, xr7, xr7
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.wu.hu  xr1, xr1, xr1
    xvhaddw.wu.hu  xr2, xr2, xr2
    xvhaddw.wu.hu  xr3, xr3, xr3
    xvhaddw.wu.hu  xr4, xr4, xr4
    xvhaddw.wu.hu  xr5, xr5, xr5
    xvhaddw.wu.hu  xr6, xr6, xr6
    xvhaddw.wu.hu  xr7, xr7, xr7
    xvadd.w        xr16, xr0, xr1
    xvadd.w        xr17, xr2, xr3
    xvadd.w        xr18, xr4, xr5
    xvadd.w        xr19, xr6, xr7
    xvadd.w        xr16, xr16, xr17
    xvadd.w        xr18, xr18, xr19
    xvadd.w        xr16, xr16, xr18

    // Load data from pix1 and pix2
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr0, vr1, vr2, vr3
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr4, vr5, vr6, vr7
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr12, vr13, vr14, vr15
    vext2xv.hu.bu  xr0, xr0
    vext2xv.hu.bu  xr1, xr1
    vext2xv.hu.bu  xr2, xr2
    vext2xv.hu.bu  xr3, xr3
    vext2xv.hu.bu  xr4, xr4
    vext2xv.hu.bu  xr5, xr5
    vext2xv.hu.bu  xr6, xr6
    vext2xv.hu.bu  xr7, xr7
    vext2xv.hu.bu  xr8, xr8
    vext2xv.hu.bu  xr9, xr9
    vext2xv.hu.bu  xr10, xr10
    vext2xv.hu.bu  xr11, xr11
    vext2xv.hu.bu  xr12, xr12
    vext2xv.hu.bu  xr13, xr13
    vext2xv.hu.bu  xr14, xr14
    vext2xv.hu.bu  xr15, xr15

    // Calculate the square of the difference
    xvsub.h        xr0, xr0, xr8
    xvsub.h        xr1, xr1, xr9
    xvsub.h        xr2, xr2, xr10
    xvsub.h        xr3, xr3, xr11
    xvsub.h        xr4, xr4, xr12
    xvsub.h        xr5, xr5, xr13
    xvsub.h        xr6, xr6, xr14
    xvsub.h        xr7, xr7, xr15
    xvmul.h        xr0, xr0, xr0
    xvmul.h        xr1, xr1, xr1
    xvmul.h        xr2, xr2, xr2
    xvmul.h        xr3, xr3, xr3
    xvmul.h        xr4, xr4, xr4
    xvmul.h        xr5, xr5, xr5
    xvmul.h        xr6, xr6, xr6
    xvmul.h        xr7, xr7, xr7
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.wu.hu  xr1, xr1, xr1
    xvhaddw.wu.hu  xr2, xr2, xr2
    xvhaddw.wu.hu  xr3, xr3, xr3
    xvhaddw.wu.hu  xr4, xr4, xr4
    xvhaddw.wu.hu  xr5, xr5, xr5
    xvhaddw.wu.hu  xr6, xr6, xr6
    xvhaddw.wu.hu  xr7, xr7, xr7
    xvadd.w        xr0, xr0, xr1
    xvadd.w        xr2, xr2, xr3
    xvadd.w        xr4, xr4, xr5
    xvadd.w        xr6, xr6, xr7
    xvadd.w        xr0, xr0, xr2
    xvadd.w        xr4, xr4, xr6
    xvadd.w        xr0, xr0, xr4
    xvadd.w        xr0, xr0, xr16

    // Calculate the sum
    xvhaddw.d.w    xr0, xr0, xr0
    xvhaddw.q.d    xr0, xr0, xr0
    xvpickve2gr.w  t2, xr0, 0
    xvpickve2gr.w  t3, xr0, 4
    add.d          a0, t2, t3
endfunc_x264
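
/* ssd_16x16 accumulates the first eight rows in xr16 and folds the second
 * pass in before the final horizontal reduction. */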

/*
 * int pixel_ssd_16x8_lasx(const Pixel *pix1, intptr_t stride_pix1,
 *                         const Pixel *pix2, intptr_t stride_pix2)
 */
function_x264 pixel_ssd_16x8_lasx
    slli.d         t0, a1, 1
    add.d          t1, a1, t0
    add.d          t2, a1, t1
    slli.d         t3, a3, 1
    add.d          t4, a3, t3
    add.d          t5, a3, t4

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t0, t1, vr0, vr1, vr2, vr3
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t3, t4, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr12, vr13, vr14, vr15
    vext2xv.hu.bu  xr0, xr0
    vext2xv.hu.bu  xr1, xr1
    vext2xv.hu.bu  xr2, xr2
    vext2xv.hu.bu  xr3, xr3
    vext2xv.hu.bu  xr4, xr4
    vext2xv.hu.bu  xr5, xr5
    vext2xv.hu.bu  xr6, xr6
    vext2xv.hu.bu  xr7, xr7
    vext2xv.hu.bu  xr8, xr8
    vext2xv.hu.bu  xr9, xr9
    vext2xv.hu.bu  xr10, xr10
    vext2xv.hu.bu  xr11, xr11
    vext2xv.hu.bu  xr12, xr12
    vext2xv.hu.bu  xr13, xr13
    vext2xv.hu.bu  xr14, xr14
    vext2xv.hu.bu  xr15, xr15

    // Calculate the square of the difference
    xvsub.h        xr0, xr0, xr8
    xvsub.h        xr1, xr1, xr9
    xvsub.h        xr2, xr2, xr10
    xvsub.h        xr3, xr3, xr11
    xvsub.h        xr4, xr4, xr12
    xvsub.h        xr5, xr5, xr13
    xvsub.h        xr6, xr6, xr14
    xvsub.h        xr7, xr7, xr15
    xvmul.h        xr0, xr0, xr0
    xvmul.h        xr1, xr1, xr1
    xvmul.h        xr2, xr2, xr2
    xvmul.h        xr3, xr3, xr3
    xvmul.h        xr4, xr4, xr4
    xvmul.h        xr5, xr5, xr5
    xvmul.h        xr6, xr6, xr6
    xvmul.h        xr7, xr7, xr7
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.wu.hu  xr1, xr1, xr1
    xvhaddw.wu.hu  xr2, xr2, xr2
    xvhaddw.wu.hu  xr3, xr3, xr3
    xvhaddw.wu.hu  xr4, xr4, xr4
    xvhaddw.wu.hu  xr5, xr5, xr5
    xvhaddw.wu.hu  xr6, xr6, xr6
    xvhaddw.wu.hu  xr7, xr7, xr7
    xvadd.w        xr0, xr0, xr1
    xvadd.w        xr2, xr2, xr3
    xvadd.w        xr4, xr4, xr5
    xvadd.w        xr6, xr6, xr7
    xvadd.w        xr0, xr0, xr2
    xvadd.w        xr4, xr4, xr6
    xvadd.w        xr0, xr0, xr4

    // Calculate the sum
    xvhaddw.d.w    xr0, xr0, xr0
    xvhaddw.q.d    xr0, xr0, xr0
    xvpickve2gr.w  t2, xr0, 0
    xvpickve2gr.w  t3, xr0, 4
    add.d          a0, t2, t3
endfunc_x264

/*
 * int pixel_ssd_8x16_lasx(const Pixel *pix1, intptr_t stride_pix1,
 *                         const Pixel *pix2, intptr_t stride_pix2)
 */
function_x264 pixel_ssd_8x16_lasx
    slli.d         t0, a1, 1
    add.d          t1, a1, t0
    add.d          t2, a1, t1
    slli.d         t3, a3, 1
    add.d          t4, a3, t3
    add.d          t5, a3, t4

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t0, t1, vr0, vr1, vr2, vr3
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t3, t4, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr12, vr13, vr14, vr15

    vilvl.d        vr0, vr4, vr0
    vilvl.d        vr1, vr5, vr1
    vilvl.d        vr2, vr6, vr2
    vilvl.d        vr3, vr7, vr3
    vilvl.d        vr8, vr12, vr8
    vilvl.d        vr9, vr13, vr9
    vilvl.d        vr10, vr14, vr10
    vilvl.d        vr11, vr15, vr11
    vext2xv.hu.bu  xr0, xr0
    vext2xv.hu.bu  xr1, xr1
    vext2xv.hu.bu  xr2, xr2
    vext2xv.hu.bu  xr3, xr3
    vext2xv.hu.bu  xr8, xr8
    vext2xv.hu.bu  xr9, xr9
    vext2xv.hu.bu  xr10, xr10
    vext2xv.hu.bu  xr11, xr11

    // Calculate the square of the difference
    xvsub.h        xr0, xr0, xr8
    xvsub.h        xr1, xr1, xr9
    xvsub.h        xr2, xr2, xr10
    xvsub.h        xr3, xr3, xr11
    xvmul.h        xr0, xr0, xr0
    xvmul.h        xr1, xr1, xr1
    xvmul.h        xr2, xr2, xr2
    xvmul.h        xr3, xr3, xr3
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.wu.hu  xr1, xr1, xr1
    xvhaddw.wu.hu  xr2, xr2, xr2
    xvhaddw.wu.hu  xr3, xr3, xr3
    xvadd.w        xr0, xr0, xr1
    xvadd.w        xr2, xr2, xr3
    xvadd.w        xr16, xr0, xr2

    // Load data from pix1 and pix2
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr0, vr1, vr2, vr3
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr4, vr5, vr6, vr7
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr12, vr13, vr14, vr15

    vilvl.d        vr0, vr4, vr0
    vilvl.d        vr1, vr5, vr1
    vilvl.d        vr2, vr6, vr2
    vilvl.d        vr3, vr7, vr3
    vilvl.d        vr8, vr12, vr8
    vilvl.d        vr9, vr13, vr9
    vilvl.d        vr10, vr14, vr10
    vilvl.d        vr11, vr15, vr11
    vext2xv.hu.bu  xr0, xr0
    vext2xv.hu.bu  xr1, xr1
    vext2xv.hu.bu  xr2, xr2
    vext2xv.hu.bu  xr3, xr3
    vext2xv.hu.bu  xr8, xr8
    vext2xv.hu.bu  xr9, xr9
    vext2xv.hu.bu  xr10, xr10
    vext2xv.hu.bu  xr11, xr11

    // Calculate the square of the difference
    xvsub.h        xr0, xr0, xr8
    xvsub.h        xr1, xr1, xr9
    xvsub.h        xr2, xr2, xr10
    xvsub.h        xr3, xr3, xr11
    xvmul.h        xr0, xr0, xr0
    xvmul.h        xr1, xr1, xr1
    xvmul.h        xr2, xr2, xr2
    xvmul.h        xr3, xr3, xr3
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.wu.hu  xr1, xr1, xr1
    xvhaddw.wu.hu  xr2, xr2, xr2
    xvhaddw.wu.hu  xr3, xr3, xr3
    xvadd.w        xr0, xr0, xr1
    xvadd.w        xr2, xr2, xr3
    xvadd.w        xr0, xr0, xr2
    xvadd.w        xr0, xr0, xr16

    // Calculate the sum
    xvhaddw.d.w    xr0, xr0, xr0
    xvhaddw.q.d    xr0, xr0, xr0
    xvpickve2gr.w  t2, xr0, 0
    xvpickve2gr.w  t3, xr0, 4
    add.d          a0, t2, t3
endfunc_x264

/*
 * int pixel_ssd_8x8_lasx(const Pixel *pix1, intptr_t stride_pix1,
 *                        const Pixel *pix2, intptr_t stride_pix2)
 */
function_x264 pixel_ssd_8x8_lasx
    slli.d         t0, a1, 1
    add.d          t1, a1, t0
    add.d          t2, a1, t1
    slli.d         t3, a3, 1
    add.d          t4, a3, t3
    add.d          t5, a3, t4

    // Load data from pix1 and pix2
    LSX_LOADX_4    a0, a1, t0, t1, vr0, vr1, vr2, vr3
    add.d          a0, a0, t2
    LSX_LOADX_4    a0, a1, t0, t1, vr4, vr5, vr6, vr7
    LSX_LOADX_4    a2, a3, t3, t4, vr8, vr9, vr10, vr11
    add.d          a2, a2, t5
    LSX_LOADX_4    a2, a3, t3, t4, vr12, vr13, vr14, vr15

    vilvl.d        vr0, vr4, vr0
    vilvl.d        vr1, vr5, vr1
    vilvl.d        vr2, vr6, vr2
    vilvl.d        vr3, vr7, vr3
    vilvl.d        vr8, vr12, vr8
    vilvl.d        vr9, vr13, vr9
    vilvl.d        vr10, vr14, vr10
    vilvl.d        vr11, vr15, vr11
    vext2xv.hu.bu  xr0, xr0
    vext2xv.hu.bu  xr1, xr1
    vext2xv.hu.bu  xr2, xr2
    vext2xv.hu.bu  xr3, xr3
    vext2xv.hu.bu  xr8, xr8
    vext2xv.hu.bu  xr9, xr9
    vext2xv.hu.bu  xr10, xr10
    vext2xv.hu.bu  xr11, xr11

    // Calculate the square of the difference
    xvsub.h        xr0, xr0, xr8
    xvsub.h        xr1, xr1, xr9
    xvsub.h        xr2, xr2, xr10
    xvsub.h        xr3, xr3, xr11
    xvmul.h        xr0, xr0, xr0
    xvmul.h        xr1, xr1, xr1
    xvmul.h        xr2, xr2, xr2
    xvmul.h        xr3, xr3, xr3
    xvhaddw.wu.hu  xr0, xr0, xr0
    xvhaddw.wu.hu  xr1, xr1, xr1
    xvhaddw.wu.hu  xr2, xr2, xr2
    xvhaddw.wu.hu  xr3, xr3, xr3
    xvadd.w        xr0, xr0, xr1
    xvadd.w        xr2, xr2, xr3
    xvadd.w        xr0, xr0, xr2

    // Calculate the sum
    xvhaddw.d.w    xr0, xr0, xr0
    xvhaddw.q.d    xr0, xr0, xr0
    xvpickve2gr.w  t2, xr0, 0
    xvpickve2gr.w  t3, xr0, 4
    add.d          a0, t2, t3
endfunc_x264
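
/* sa8d: sum of absolute values of the 8x8 Hadamard transform of the
 * difference block, rounded and scaled as (sum + 2) >> 2. The 16x16 version
 * below walks four 8x8 sub-blocks (stepping down by rows, then right by
 * 8 pixels via the addi.d a0/a2, 8) and keeps per-block sums in
 * xr21/xr22/xr23/xr24, which is why callee-saved f24 is spilled. */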

/*
 * int pixel_sa8d_16x16_lasx(const Pixel *pix1, intptr_t i_pix1,
 *                           const Pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_sa8d_16x16_lasx
    addi.d         sp, sp, -8
    fst.d          f24, sp, 0

    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    add.d          t4, a1, t2
    add.d          t5, a3, t3
    slli.d         t6, a1, 2
    slli.d         t7, a3, 2
    slli.d         t0, a1, 3
    slli.d         t1, a3, 3

    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr15, xr11, xr13
    xvsub.h        xr16, xr11, xr13

    add.d          a0, a0, t6
    add.d          a2, a2, t7
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr9, xr11, xr13
    xvsub.h        xr10, xr11, xr13
    xvadd.h        xr17, xr15, xr9
    xvadd.h        xr18, xr16, xr10
    xvsub.h        xr19, xr15, xr9
    xvsub.h        xr20, xr16, xr10
    xvadda.h       xr17, xr17, xr18
    xvadda.h       xr19, xr19, xr20
    xvadd.h        xr21, xr17, xr19

    add.d          a0, a0, t6
    add.d          a2, a2, t7
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr15, xr11, xr13
    xvsub.h        xr16, xr11, xr13

    add.d          a0, a0, t6
    add.d          a2, a2, t7
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr9, xr11, xr13
    xvsub.h        xr10, xr11, xr13
    xvadd.h        xr17, xr15, xr9
    xvadd.h        xr18, xr16, xr10
    xvsub.h        xr19, xr15, xr9
    xvsub.h        xr20, xr16, xr10
    xvadda.h       xr17, xr17, xr18
    xvadda.h       xr19, xr19, xr20
    xvadd.h        xr22, xr17, xr19

    sub.d          a0, a0, t6
    sub.d          a2, a2, t7
    addi.d         a0, a0, 8
    addi.d         a2, a2, 8
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr15, xr11, xr13
    xvsub.h        xr16, xr11, xr13

    add.d          a0, a0, t6
    add.d          a2, a2, t7
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr9, xr11, xr13
    xvsub.h        xr10, xr11, xr13
    xvadd.h        xr17, xr15, xr9
    xvadd.h        xr18, xr16, xr10
    xvsub.h        xr19, xr15, xr9
    xvsub.h        xr20, xr16, xr10
    xvadda.h       xr17, xr17, xr18
    xvadda.h       xr19, xr19, xr20
    xvadd.h        xr23, xr17, xr19

    sub.d          a0, a0, t0
    sub.d          a2, a2, t1
    sub.d          a0, a0, t6
    sub.d          a2, a2, t7
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr15, xr11, xr13
    xvsub.h        xr16, xr11, xr13

    add.d          a0, a0, t6
    add.d          a2, a2, t7
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr12, 0x13
    xvadd.h        xr9, xr11, xr13
    xvsub.h        xr10, xr11, xr13
    xvadd.h        xr17, xr15, xr9
    xvadd.h        xr18, xr16, xr10
    xvsub.h        xr19, xr15, xr9
    xvsub.h        xr20, xr16, xr10
    xvadda.h       xr17, xr17, xr18
    xvadda.h       xr19, xr19, xr20
    xvadd.h        xr24, xr17, xr19

    xvadd.h        xr21, xr21, xr22
    xvadd.h        xr23, xr23, xr24
    xvhaddw.wu.hu  xr21, xr21, xr21
    xvhaddw.wu.hu  xr23, xr23, xr23
    xvadd.w        xr21, xr21, xr23
    xvhaddw.du.wu  xr21, xr21, xr21
    xvhaddw.qu.du  xr21, xr21, xr21
    xvpickve2gr.du t4, xr21, 0
    xvpickve2gr.du t5, xr21, 2
    add.d          t4, t4, t5
    addi.d         t4, t4, 2
    srli.d         a0, t4, 2

    fld.d          f24, sp, 0
    addi.d         sp, sp, 8
endfunc_x264

/*
 * int pixel_sa8d_8x8_lasx(const Pixel *pix1, intptr_t i_pix1,
 *                         const Pixel *pix2, intptr_t i_pix2)
 */
function_x264 pixel_sa8d_8x8_lasx
    slli.d         t2, a1, 1
    slli.d         t3, a3, 1
    add.d          t4, a1, t2
    add.d          t5, a3, t3
    slli.d         t6, a1, 2
    slli.d         t7, a3, 2

    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvor.v         xr14, xr12, xr12
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr14, 0x13
    xvadd.h        xr15, xr11, xr13
    xvsub.h        xr16, xr11, xr13

    add.d          a0, a0, t6
    add.d          a2, a2, t7
    // Load data from pix1 and pix2
    FLDD_LOADX_4   a0, a1, t2, t4, f1, f2, f3, f4
    FLDD_LOADX_4   a2, a3, t3, t5, f5, f6, f7, f8
    vilvl.d        vr1, vr2, vr1
    vilvl.d        vr3, vr4, vr3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr1, xr3, 0x02
    xvpermi.q      xr5, xr7, 0x02
    xvsubwev.h.bu  xr9, xr1, xr5
    xvsubwod.h.bu  xr10, xr1, xr5
    xvadd.h        xr11, xr9, xr10    /* a0 + a1 */
    xvsub.h        xr12, xr9, xr10    /* a0 - a1 */
    xvpackev.h     xr9, xr12, xr11
    xvpackod.h     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvpackev.w     xr9, xr12, xr11
    xvpackod.w     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10
    xvsub.h        xr12, xr9, xr10
    xvpackev.d     xr9, xr12, xr11
    xvpackod.d     xr10, xr12, xr11
    xvadd.h        xr11, xr9, xr10    /* HADAMARD4 */
    xvsub.h        xr12, xr9, xr10
    xvor.v         xr13, xr11, xr11
    xvor.v         xr14, xr12, xr12
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr13, xr14, 0x13
    xvadd.h        xr9, xr11, xr13
    xvsub.h        xr10, xr11, xr13

    xvadd.h        xr17, xr15, xr9
    xvadd.h        xr18, xr16, xr10
    xvsub.h        xr19, xr15, xr9
    xvsub.h        xr20, xr16, xr10
    xvadda.h       xr17, xr17, xr18
    xvadda.h       xr19, xr19, xr20
    xvadd.h        xr17, xr17, xr19
    xvhaddw.wu.hu  xr17, xr17, xr17
    xvhaddw.du.wu  xr17, xr17, xr17
    xvhaddw.qu.du  xr17, xr17, xr17
    xvpickve2gr.wu t4, xr17, 0
    xvpickve2gr.wu t5, xr17, 4
    add.d          t4, t4, t5
    addi.d         t4, t4, 2
    srli.d         a0, t4, 2
endfunc_x264
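
/* sse_diff_8width_lasx below processes four 8-wide rows at the fixed
 * FENC_STRIDE/FDEC_STRIDE spacings: xr8 accumulates squared differences via
 * xvdp2add.w.h and xr9 accumulates the raw differences, which the var2
 * functions then combine into ssd and sum-of-diff terms. */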

.macro sse_diff_8width_lasx in0, in1
    fld.d          f0, \in0, 0
    fld.d          f1, \in0, FENC_STRIDE
    fld.d          f2, \in0, FENC_STRIDE * 2
    fld.d          f3, \in0, FENC_STRIDE * 3
    fld.d          f4, \in1, 0
    fld.d          f5, \in1, FDEC_STRIDE
    fld.d          f6, \in1, FDEC_STRIDE * 2
    fld.d          f7, \in1, FDEC_STRIDE * 3

    vilvl.d        vr0, vr1, vr0
    vilvl.d        vr1, vr3, vr2
    vilvl.d        vr4, vr5, vr4
    vilvl.d        vr5, vr7, vr6
    xvpermi.q      xr1, xr0, 0x20
    xvpermi.q      xr5, xr4, 0x20

    xvilvl.b       xr2, xr5, xr1
    xvilvh.b       xr6, xr5, xr1
    xvhsubw.hu.bu  xr3, xr2, xr2
    xvhsubw.hu.bu  xr4, xr6, xr6
    xvdp2add.w.h   xr8, xr3, xr3
    xvdp2add.w.h   xr8, xr4, xr4
    xvadd.h        xr9, xr9, xr3
    xvadd.h        xr9, xr9, xr4
.endm
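
/* var2 contract, scalar sketch (illustrative; per 8-wide half, with shift =
 * log2(w*h): 6 for 8x8, 7 for 8x16):
 *
 *     int sum = 0, sqr = 0;
 *     for( int y = 0; y < h; y++ )
 *         for( int x = 0; x < 8; x++ )
 *         {
 *             int d = p_pix1[y*FENC_STRIDE + x] - p_pix2[y*FDEC_STRIDE + x];
 *             sum += d;
 *             sqr += d * d;
 *         }
 *     ssd[half] = sqr;                     // stored to the ssd[] argument
 *     return_val += sqr - (sum * sum >> shift);  // summed over both halves
 */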

/*
 * int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
 *                                    int32_t ssd[2] )
 */
function_x264 pixel_var2_8x16_lasx
    add.d          t0, a0, zero
    add.d          t1, a1, zero
    xvxor.v        xr8, xr8, xr8
    xvxor.v        xr9, xr9, xr9

    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1

    xvhaddw.w.h    xr9, xr9, xr9
    xvhaddw.d.w    xr9, xr9, xr9
    xvhaddw.q.d    xr9, xr9, xr9
    xvpickve2gr.wu t2, xr9, 0
    xvpickve2gr.wu t3, xr9, 4
    add.w          t2, t2, t3
    xvhaddw.d.w    xr8, xr8, xr8
    xvhaddw.q.d    xr8, xr8, xr8
    xvpickve2gr.wu t3, xr8, 0
    xvpickve2gr.wu t4, xr8, 4
    add.w          t3, t4, t3
    st.w           t3, a2, 0
    mul.w          t2, t2, t2
    srai.w         t2, t2, 7
    sub.w          t3, t3, t2

    xvxor.v        xr8, xr8, xr8
    xvxor.v        xr9, xr9, xr9
    addi.d         a0, t0, FENC_STRIDE / 2
    addi.d         a1, t1, FDEC_STRIDE / 2
    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1

    xvhaddw.w.h    xr9, xr9, xr9
    xvhaddw.d.w    xr9, xr9, xr9
    xvhaddw.q.d    xr9, xr9, xr9
    xvpickve2gr.wu t4, xr9, 0
    xvpickve2gr.wu t5, xr9, 4
    add.w          t4, t4, t5
    xvhaddw.d.w    xr8, xr8, xr8
    xvhaddw.q.d    xr8, xr8, xr8
    xvpickve2gr.wu t5, xr8, 0
    xvpickve2gr.wu t6, xr8, 4
    add.w          t5, t6, t5
    st.w           t5, a2, 4
    mul.w          t4, t4, t4
    srai.w         t4, t4, 7
    sub.w          t5, t5, t4
    add.w          a0, t3, t5
endfunc_x264

/*
 * int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
 *                                   int32_t ssd[2] )
 */
function_x264 pixel_var2_8x8_lasx
    add.d          t0, a0, zero
    add.d          t1, a1, zero
    xvxor.v        xr8, xr8, xr8
    xvxor.v        xr9, xr9, xr9

    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1

    xvhaddw.w.h    xr9, xr9, xr9
    xvhaddw.d.w    xr9, xr9, xr9
    xvhaddw.q.d    xr9, xr9, xr9
    xvpickve2gr.wu t2, xr9, 0
    xvpickve2gr.wu t3, xr9, 4
    add.w          t2, t2, t3
    xvhaddw.d.w    xr8, xr8, xr8
    xvhaddw.q.d    xr8, xr8, xr8
    xvpickve2gr.wu t3, xr8, 0
    xvpickve2gr.wu t4, xr8, 4
    add.w          t3, t4, t3
    st.w           t3, a2, 0
    mul.w          t2, t2, t2
    srai.w         t2, t2, 6
    sub.w          t3, t3, t2

    xvxor.v        xr8, xr8, xr8
    xvxor.v        xr9, xr9, xr9
    addi.d         a0, t0, FENC_STRIDE / 2
    addi.d         a1, t1, FDEC_STRIDE / 2
    sse_diff_8width_lasx a0, a1
    addi.d         a0, a0, FENC_STRIDE * 4
    addi.d         a1, a1, FDEC_STRIDE * 4
    sse_diff_8width_lasx a0, a1

    xvhaddw.w.h    xr9, xr9, xr9
    xvhaddw.d.w    xr9, xr9, xr9
    xvhaddw.q.d    xr9, xr9, xr9
    xvpickve2gr.wu t4, xr9, 0
    xvpickve2gr.wu t5, xr9, 4
    add.w          t4, t4, t5
    xvhaddw.d.w    xr8, xr8, xr8
    xvhaddw.q.d    xr8, xr8, xr8
    xvpickve2gr.wu t5, xr8, 0
    xvpickve2gr.wu t6, xr8, 4
    add.w          t5, t6, t5
    st.w           t5, a2, 4
    mul.w          t4, t4, t4
    srai.w         t4, t4, 6
    sub.w          t5, t5, t4
    add.w          a0, t3, t5
endfunc_x264
|
|
|
|
|
|
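// hadamard_ac_8x8 packs two AC energies into one 64-bit result:
// low 32 bits hold the 4x4 Hadamard sum minus the DC term (sum4 - dc),
// high 32 bits the 8x8 Hadamard sum minus the same DC (sum8 - dc).
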
/*
* uint64_t x264_pixel_hadamard_ac_8x8( pixel *pix, intptr_t stride )
*/
function_x264 hadamard_ac_8x8_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7

vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6

vpickev.b vr2, vr1, vr0
vpickod.b vr3, vr1, vr0
vaddwev.h.bu vr6, vr2, vr3
vaddwod.h.bu vr7, vr2, vr3
vsubwev.h.bu vr8, vr2, vr3
vsubwod.h.bu vr9, vr2, vr3
vadd.h vr10, vr6, vr7
vadd.h vr11, vr8, vr9
vsub.h vr12, vr6, vr7
vsub.h vr13, vr8, vr9

vilvl.h vr6, vr11, vr10
vilvh.h vr7, vr11, vr10
vilvl.h vr8, vr13, vr12
vilvh.h vr9, vr13, vr12
vilvl.w vr10, vr8, vr6
vilvh.w vr11, vr8, vr6
vilvl.w vr12, vr9, vr7
vilvh.w vr13, vr9, vr7

vadd.h vr6, vr10, vr11
vadd.h vr7, vr12, vr13
vsub.h vr8, vr10, vr11
vsub.h vr9, vr12, vr13
vadd.h vr10, vr6, vr7
vadd.h vr11, vr8, vr9
vsub.h vr12, vr6, vr7
vsub.h vr13, vr8, vr9

vpickev.b vr2, vr5, vr4
vpickod.b vr3, vr5, vr4
vaddwev.h.bu vr6, vr2, vr3
vaddwod.h.bu vr7, vr2, vr3
vsubwev.h.bu vr8, vr2, vr3
vsubwod.h.bu vr9, vr2, vr3
vadd.h vr14, vr6, vr7
vadd.h vr15, vr8, vr9
vsub.h vr16, vr6, vr7
vsub.h vr17, vr8, vr9

vilvl.h vr6, vr15, vr14
vilvh.h vr7, vr15, vr14
vilvl.h vr8, vr17, vr16
vilvh.h vr9, vr17, vr16
vilvl.w vr14, vr8, vr6
vilvh.w vr15, vr8, vr6
vilvl.w vr16, vr9, vr7
vilvh.w vr17, vr9, vr7

vadd.h vr6, vr14, vr15
vadd.h vr7, vr16, vr17
vsub.h vr8, vr14, vr15
vsub.h vr9, vr16, vr17
vadd.h vr14, vr6, vr7
vadd.h vr15, vr8, vr9
vsub.h vr16, vr6, vr7
vsub.h vr17, vr8, vr9

vadd.h vr18, vr10, vr14
vpickve2gr.hu t0, vr18, 0
vpickve2gr.hu t1, vr18, 4
add.d t1, t0, t1 // dc

vadda.h vr4, vr11, vr10
vadda.h vr5, vr13, vr12
vadda.h vr6, vr15, vr14
vadda.h vr7, vr17, vr16
vadd.h vr4, vr5, vr4
vadd.h vr6, vr7, vr6
vadd.h vr4, vr4, vr6
vhaddw.wu.hu vr4, vr4, vr4
vhaddw.du.wu vr4, vr4, vr4
vhaddw.qu.du vr4, vr4, vr4
vpickve2gr.wu t0, vr4, 0 // sum4

vpackev.h vr0, vr11, vr10
vpackev.h vr1, vr13, vr12
vpackev.h vr2, vr15, vr14
vpackev.h vr3, vr17, vr16
vpackod.h vr4, vr11, vr10
vpackod.h vr5, vr13, vr12
vpackod.h vr6, vr15, vr14
vpackod.h vr7, vr17, vr16

vilvl.d vr10, vr1, vr0
vilvh.d vr11, vr1, vr0
vilvl.d vr12, vr3, vr2
vilvh.d vr13, vr3, vr2
vilvl.d vr14, vr5, vr4
vilvh.d vr15, vr5, vr4
vilvl.d vr16, vr7, vr6
vilvh.d vr17, vr7, vr6

vadd.h vr0, vr10, vr11
vadd.h vr1, vr12, vr13
vadd.h vr2, vr14, vr16
vadd.h vr3, vr15, vr17
vsub.h vr4, vr10, vr11
vsub.h vr5, vr12, vr13
vsub.h vr6, vr14, vr16
vsub.h vr7, vr15, vr17

vadd.h vr10, vr0, vr1
vadd.h vr11, vr2, vr3
vadd.h vr12, vr4, vr5
vadd.h vr13, vr6, vr7
vsub.h vr14, vr0, vr1
vsub.h vr15, vr2, vr3
vsub.h vr16, vr4, vr5
vsub.h vr17, vr6, vr7

vadda.h vr10, vr10, vr11
vadda.h vr11, vr12, vr13
vadda.h vr12, vr14, vr15
vadda.h vr13, vr16, vr17
vadd.h vr10, vr10, vr11
vadd.h vr11, vr12, vr13
vadd.h vr10, vr10, vr11
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.du.wu vr10, vr10, vr10
vhaddw.qu.du vr10, vr10, vr10
vpickve2gr.wu t2, vr10, 0 // sum8

sub.d t0, t0, t1
sub.d t2, t2, t1
slli.d t2, t2, 32
add.d a0, t2, t0
endfunc_x264

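// The satd kernels below sum absolute values of Hadamard-transformed
// differences; the final srli by 1 halves the total, which is how x264
// defines SATD.
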
/*
* int x264_pixel_satd_4x8( pixel *pix1, intptr_t i_pix1,
* pixel *pix2, intptr_t i_pix2 )
*/
function_x264 pixel_satd_4x8_lsx
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3

// Load data from pix1 and pix2
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr13
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr14
vadd.h vr13, vr14, vr13
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu t5, vr13, 0
srli.d a0, t5, 1
endfunc_x264

/*
* int x264_pixel_satd_4x16( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_4x16_lsx
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3

// Load data from pix1 and pix2
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr13
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr14

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr15

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr16

vadd.h vr13, vr14, vr13
vadd.h vr15, vr16, vr15
vadd.h vr13, vr15, vr13
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu t5, vr13, 0
srli.d a0, t5, 1
endfunc_x264

.macro pixel_satd_8x4_lsx_core out0, out1, out2, out3
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6

vsubwev.h.bu vr4, vr0, vr2
vsubwod.h.bu vr5, vr0, vr2
vsubwev.h.bu vr6, vr1, vr3
vsubwod.h.bu vr7, vr1, vr3
vadd.h vr0, vr4, vr5
vsub.h vr1, vr4, vr5
vadd.h vr2, vr6, vr7
vsub.h vr3, vr6, vr7
vpackev.h vr4, vr1, vr0
vpackod.h vr5, vr1, vr0
vpackev.h vr6, vr3, vr2
vpackod.h vr7, vr3, vr2
vadd.h vr8, vr4, vr5
vsub.h vr9, vr4, vr5
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
vilvl.d vr4, vr9, vr8
vilvh.d vr5, vr9, vr8
vilvl.d vr6, vr11, vr10
vilvh.d vr7, vr11, vr10
vadd.h vr8, vr4, vr5
vsub.h vr9, vr4, vr5
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
vadd.h \out0, vr8, vr10
vsub.h \out1, vr8, vr10
vadd.h \out2, vr9, vr11
vsub.h \out3, vr9, vr11
.endm

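// pixel_satd_8x4_lsx_core: runs the byte differences of an 8x4 block
// through horizontal and vertical 4x4 Hadamard butterflies, leaving the
// transformed rows in \out0-\out3 for the caller to absolute-sum.
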
/*
* int x264_pixel_satd_8x4( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_8x4_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14

vadd.h vr12, vr13, vr12
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264

/*
* int x264_pixel_satd_8x8( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_8x8_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13

vadd.h vr12, vr13, vr12
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264

/*
* int x264_pixel_satd_8x16( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_8x16_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15

vadd.h vr12, vr12, vr13
vadd.h vr14, vr14, vr15
vadd.h vr12, vr12, vr14
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264

/*
* int x264_pixel_satd_16x8( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_16x8_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12

addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14

addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15

vadd.h vr12, vr13, vr12
vadd.h vr14, vr15, vr14
vadd.h vr12, vr14, vr12
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264

/*
* int x264_pixel_satd_16x16( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_16x16_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12

addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14

addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15

vadd.h vr12, vr13, vr12
vadd.h vr14, vr15, vr14
vadd.h vr19, vr14, vr12

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12

addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14

addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15

vadd.h vr12, vr13, vr12
vadd.h vr14, vr15, vr14
vadd.h vr12, vr14, vr12
vadd.h vr12, vr19, vr12
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264

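// The ssd kernels widen even/odd byte differences to 16 bits, square
// them with vmul.h, and fold the products into 32-bit partial sums;
// unlike satd, the final total is not halved.
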
/*
* int x264_pixel_ssd_4x4( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_4x4_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7

vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr5, vr5, vr6
vhaddw.d.w vr5, vr5, vr5
vhaddw.q.d vr5, vr5, vr5
vpickve2gr.w a0, vr5, 0
endfunc_x264

/*
* int x264_pixel_ssd_4x8( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_4x8_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr10, vr5, vr6

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr5, vr5, vr6

vadd.w vr5, vr5, vr10
vhaddw.d.w vr5, vr5, vr5
vhaddw.q.d vr5, vr5, vr5
vpickve2gr.w a0, vr5, 0
endfunc_x264

/*
* int x264_pixel_ssd_4x16( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_4x16_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr10, vr5, vr6

.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr5, vr5, vr6
vadd.w vr10, vr5, vr10
.endr

vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0
endfunc_x264

/*
* int x264_pixel_ssd_8x4( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_8x4_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr2, vr2, vr6
vhaddw.d.w vr2, vr2, vr2
vhaddw.q.d vr2, vr2, vr2
vpickve2gr.w a0, vr2, 0
endfunc_x264

/*
* int x264_pixel_ssd_8x8( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_8x8_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr10, vr2, vr6

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr11, vr2, vr6

vadd.w vr10, vr10, vr11
vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0
endfunc_x264

/*
* int x264_pixel_ssd_8x16( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_8x16_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr10, vr2, vr6

.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr11, vr2, vr6
vadd.w vr10, vr10, vr11
.endr

vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0
endfunc_x264

/*
* int x264_pixel_ssd_16x8( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_16x8_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr16, vr8, vr9

alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr17, vr8, vr9

vadd.w vr10, vr16, vr17
vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0
endfunc_x264

/*
* int x264_pixel_ssd_16x16( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_16x16_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2

LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr16, vr8, vr9

.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr17, vr8, vr9
vadd.w vr16, vr16, vr17
.endr
vhaddw.d.w vr16, vr16, vr16
vhaddw.q.d vr16, vr16, vr16
vpickve2gr.w a0, vr16, 0
endfunc_x264

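// sa8d applies a full 8x8 Hadamard transform to the differences; the
// core macro below yields four vectors of absolute coefficients (vadda),
// and the callers round the reduced total with (sum + 2) >> 2.
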
/*
* int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
*/
.macro pixel_sa8d_8x8_lsx_core out0, out1, out2, out3
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vadd.h vr8, vr2, vr3
vsub.h vr9, vr2, vr3
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
vpackev.h vr0, vr9, vr8
vpackod.h vr1, vr9, vr8
vpackev.h vr2, vr11, vr10
vpackod.h vr3, vr11, vr10
vadd.h vr4, vr0, vr1
vsub.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vsub.h vr7, vr2, vr3
vilvl.d vr0, vr5, vr4
vilvh.d vr1, vr5, vr4
vilvl.d vr2, vr7, vr6
vilvh.d vr3, vr7, vr6
vadd.h vr12, vr0, vr1
vsub.h vr13, vr0, vr1
vadd.h vr14, vr2, vr3
vsub.h vr15, vr2, vr3

alsl.d t4, a1, a0, 2
alsl.d t5, a3, a2, 2
FLDD_LOADX_4 t4, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t5, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vadd.h vr8, vr2, vr3
vsub.h vr9, vr2, vr3
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
vpackev.h vr0, vr9, vr8
vpackod.h vr1, vr9, vr8
vpackev.h vr2, vr11, vr10
vpackod.h vr3, vr11, vr10
vadd.h vr4, vr0, vr1
vsub.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vsub.h vr7, vr2, vr3
vilvl.d vr0, vr5, vr4
vilvh.d vr1, vr5, vr4
vilvl.d vr2, vr7, vr6
vilvh.d vr3, vr7, vr6
vadd.h vr4, vr0, vr1
vsub.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vsub.h vr7, vr2, vr3

// vr12 vr13 vr14 vr15
vpickev.w vr0, vr13, vr12
vpickod.w vr1, vr13, vr12
vpickev.w vr2, vr15, vr14
vpickod.w vr3, vr15, vr14
vadd.h vr8, vr0, vr1
vsub.h vr9, vr0, vr1
vadd.h vr10, vr2, vr3
vsub.h vr11, vr2, vr3
vadd.h vr12, vr8, vr10
vadd.h vr13, vr9, vr11
vsub.h vr14, vr8, vr10
vsub.h vr15, vr9, vr11

// vr4 vr5 vr6 vr7
vpickev.w vr0, vr5, vr4
vpickod.w vr1, vr5, vr4
vpickev.w vr2, vr7, vr6
vpickod.w vr3, vr7, vr6
vadd.h vr8, vr0, vr1
vsub.h vr9, vr0, vr1
vadd.h vr10, vr2, vr3
vsub.h vr11, vr2, vr3
vadd.h vr4, vr8, vr10
vadd.h vr5, vr9, vr11
vsub.h vr6, vr8, vr10
vsub.h vr7, vr9, vr11

vadd.h vr0, vr12, vr4
vadd.h vr1, vr13, vr5
vadd.h vr2, vr14, vr6
vadd.h vr3, vr15, vr7
vsub.h vr8, vr12, vr4
vsub.h vr9, vr13, vr5
vsub.h vr10, vr14, vr6
vsub.h vr11, vr15, vr7
vadda.h \out0, vr0, vr8
vadda.h \out1, vr1, vr9
vadda.h \out2, vr2, vr10
vadda.h \out3, vr3, vr11
.endm

function_x264 pixel_sa8d_8x8_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr17, vr0, vr1
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.du.wu vr17, vr17, vr17
vhaddw.qu.du vr17, vr17, vr17
vpickve2gr.wu t5, vr17, 0
addi.d t5, t5, 2
srli.d a0, t5, 2
endfunc_x264

/*
* int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1,
* pixel *pix2, intptr_t i_pix2 )
*/
function_x264 pixel_sa8d_16x16_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
add.d t6, a0, zero
add.d t7, a2, zero
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr16, vr0, vr1

addi.d a0, t6, 8
addi.d a2, t7, 8
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr17, vr0, vr1

alsl.d a0, a1, t6, 3
alsl.d a2, a3, t7, 3
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr18, vr0, vr1

addi.d a0, a0, 8
addi.d a2, a2, 8
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr19, vr0, vr1

vhaddw.wu.hu vr16, vr16, vr16
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.wu.hu vr18, vr18, vr18
vhaddw.wu.hu vr19, vr19, vr19
vadd.w vr16, vr17, vr16
vadd.w vr18, vr19, vr18
vadd.w vr17, vr18, vr16
vhaddw.du.wu vr17, vr17, vr17
vhaddw.qu.du vr17, vr17, vr17
vpickve2gr.wu t5, vr17, 0
addi.d t5, t5, 2
srli.d a0, t5, 2
endfunc_x264

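// The pixel_var kernels return both statistics packed in one 64-bit
// value: the pixel sum in the low 32 bits and the sum of squared pixels
// in the high 32 bits; the caller derives the variance from the pair.
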
/*
* uint64_t pixel_var_8x8( pixel *pix, intptr_t i_stride )
*/
function_x264 pixel_var_8x8_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vhaddw.hu.bu vr2, vr0, vr0
vhaddw.hu.bu vr3, vr1, vr1
vhaddw.hu.bu vr6, vr4, vr4
vhaddw.hu.bu vr7, vr5, vr5
vadd.h vr2, vr2, vr3
vadd.h vr6, vr6, vr7
vadd.h vr2, vr2, vr6
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.wu t5, vr2, 0 // sum

vmulwev.h.bu vr2, vr0, vr0
vmulwod.h.bu vr3, vr0, vr0
vmulwev.h.bu vr6, vr1, vr1
vmulwod.h.bu vr7, vr1, vr1
vmulwev.h.bu vr8, vr4, vr4
vmulwod.h.bu vr9, vr4, vr4
vmulwev.h.bu vr10, vr5, vr5
vmulwod.h.bu vr11, vr5, vr5
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11

vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr8, vr8, vr9
vadd.w vr10, vr10, vr11
vadd.w vr2, vr2, vr6
vadd.w vr8, vr8, vr10
vadd.w vr2, vr2, vr8
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.du t6, vr2, 0 // sqr

slli.d t4, t6, 32
add.d a0, t4, t5
endfunc_x264

/*
* uint64_t pixel_var_8x16( pixel *pix, intptr_t i_stride )
*/
function_x264 pixel_var_8x16_lsx
slli.d t0, a1, 1
add.d t1, a1, t0
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vhaddw.hu.bu vr2, vr0, vr0
vhaddw.hu.bu vr3, vr1, vr1
vhaddw.hu.bu vr6, vr4, vr4
vhaddw.hu.bu vr7, vr5, vr5
vadd.h vr2, vr2, vr3
vadd.h vr6, vr6, vr7
vadd.h vr16, vr2, vr6

vmulwev.h.bu vr2, vr0, vr0
vmulwod.h.bu vr3, vr0, vr0
vmulwev.h.bu vr6, vr1, vr1
vmulwod.h.bu vr7, vr1, vr1
vmulwev.h.bu vr8, vr4, vr4
vmulwod.h.bu vr9, vr4, vr4
vmulwev.h.bu vr10, vr5, vr5
vmulwod.h.bu vr11, vr5, vr5
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vadd.w vr12, vr2, vr3
vadd.w vr13, vr6, vr7
vadd.w vr14, vr8, vr9
vadd.w vr15, vr10, vr11
vadd.w vr12, vr12, vr13
vadd.w vr14, vr14, vr15
vadd.w vr12, vr12, vr14

alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vhaddw.hu.bu vr2, vr0, vr0
vhaddw.hu.bu vr3, vr1, vr1
vhaddw.hu.bu vr6, vr4, vr4
vhaddw.hu.bu vr7, vr5, vr5
vadd.h vr2, vr2, vr3
vadd.h vr6, vr6, vr7
vadd.h vr2, vr2, vr6
vadd.h vr2, vr2, vr16
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.wu t5, vr2, 0 // sum

vmulwev.h.bu vr2, vr0, vr0
vmulwod.h.bu vr3, vr0, vr0
vmulwev.h.bu vr6, vr1, vr1
vmulwod.h.bu vr7, vr1, vr1
vmulwev.h.bu vr8, vr4, vr4
vmulwod.h.bu vr9, vr4, vr4
vmulwev.h.bu vr10, vr5, vr5
vmulwod.h.bu vr11, vr5, vr5
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr8, vr8, vr9
vadd.w vr10, vr10, vr11
vadd.w vr2, vr2, vr6
vadd.w vr8, vr8, vr10
vadd.w vr2, vr2, vr8
vadd.w vr2, vr2, vr12
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.du t6, vr2, 0 // sqr
slli.d t4, t6, 32
add.d a0, t4, t5
endfunc_x264

/*
* uint64_t pixel_var_16x16( pixel *pix, intptr_t i_stride )
*/
function_x264 pixel_var_16x16_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
vhaddw.hu.bu vr4, vr0, vr0
vhaddw.hu.bu vr5, vr1, vr1
vhaddw.hu.bu vr6, vr2, vr2
vhaddw.hu.bu vr7, vr3, vr3
vadd.h vr4, vr5, vr4
vadd.h vr5, vr7, vr6
vadd.h vr13, vr5, vr4

vmulwev.h.bu vr5, vr0, vr0
vmulwod.h.bu vr6, vr0, vr0
vmulwev.h.bu vr7, vr1, vr1
vmulwod.h.bu vr8, vr1, vr1
vmulwev.h.bu vr9, vr2, vr2
vmulwod.h.bu vr10, vr2, vr2
vmulwev.h.bu vr11, vr3, vr3
vmulwod.h.bu vr12, vr3, vr3
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vadd.w vr5, vr5, vr6
vadd.w vr6, vr8, vr7
vadd.w vr7, vr10, vr9
vadd.w vr8, vr12, vr11
vadd.w vr0, vr5, vr6
vadd.w vr1, vr8, vr7
vadd.w vr14, vr1, vr0

.rept 3
alsl.d a0, a1, a0, 2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
vhaddw.hu.bu vr4, vr0, vr0
vhaddw.hu.bu vr5, vr1, vr1
vhaddw.hu.bu vr6, vr2, vr2
vhaddw.hu.bu vr7, vr3, vr3
vadd.h vr4, vr5, vr4
vadd.h vr5, vr7, vr6
vadd.h vr4, vr5, vr4
vadd.h vr13, vr4, vr13

vmulwev.h.bu vr5, vr0, vr0
vmulwod.h.bu vr6, vr0, vr0
vmulwev.h.bu vr7, vr1, vr1
vmulwod.h.bu vr8, vr1, vr1
vmulwev.h.bu vr9, vr2, vr2
vmulwod.h.bu vr10, vr2, vr2
vmulwev.h.bu vr11, vr3, vr3
vmulwod.h.bu vr12, vr3, vr3
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vadd.w vr5, vr5, vr6
vadd.w vr6, vr8, vr7
vadd.w vr7, vr10, vr9
vadd.w vr8, vr12, vr11
vadd.w vr0, vr5, vr6
vadd.w vr1, vr8, vr7
vadd.w vr0, vr1, vr0
vadd.w vr14, vr0, vr14
.endr
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu t4, vr13, 0 // sum

vhaddw.du.wu vr14, vr14, vr14
vhaddw.qu.du vr14, vr14, vr14
vpickve2gr.du t6, vr14, 0 // sqr

slli.d t5, t6, 32
add.d a0, t4, t5
endfunc_x264

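// sse_diff_8width_lsx: \in0/\in1 address four fenc/fdec rows; \in2
// accumulates 32-bit squared-difference sums across invocations, while
// \in3 is overwritten with the 16-bit difference sums of this call only.
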
.macro sse_diff_8width_lsx in0, in1, in2, in3
fld.d f0, \in0, 0
fld.d f1, \in0, FENC_STRIDE
fld.d f2, \in0, FENC_STRIDE * 2
fld.d f3, \in0, FENC_STRIDE * 3
fld.d f4, \in1, 0
fld.d f5, \in1, FDEC_STRIDE
fld.d f6, \in1, FDEC_STRIDE * 2
fld.d f7, \in1, FDEC_STRIDE * 3

vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6
vsubwev.h.bu vr4, vr0, vr2
vsubwod.h.bu vr5, vr0, vr2
vsubwev.h.bu vr6, vr1, vr3
vsubwod.h.bu vr7, vr1, vr3
// sqr_u
vdp2add.w.h \in2, vr4, vr4
vdp2add.w.h \in2, vr5, vr5
vdp2add.w.h \in2, vr6, vr6
vdp2add.w.h \in2, vr7, vr7
// sum_u
vadd.h vr4, vr4, vr5
vadd.h vr6, vr6, vr7
vadd.h \in3, vr4, vr6
.endm

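// Each LSX var2 kernel makes two passes: the U half of the chroma block
// first, then the V half at offsets FENC_STRIDE/2 and FDEC_STRIDE/2,
// storing the per-plane SSDs to ssd[0] and ssd[1].
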
/*
* int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] )
*/
function_x264 pixel_var2_8x8_lsx
vxor.v vr8, vr8, vr8
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t2, vr8, 0 // sqr_u
vadd.h vr8, vr10, vr9
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t3, vr8, 0 // sum_u

addi.d a0, a0, FENC_STRIDE / 2
addi.d a1, a1, FDEC_STRIDE / 2
vxor.v vr8, vr8, vr8
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t4, vr8, 0 // sqr_v
vadd.h vr8, vr10, vr9
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t5, vr8, 0 // sum_v

st.w t2, a2, 0
st.w t4, a2, 4
mul.w t3, t3, t3
mul.w t5, t5, t5
srai.w t3, t3, 6
srai.w t5, t5, 6
sub.w t2, t2, t3
sub.w t4, t4, t5
add.w a0, t2, t4
endfunc_x264

/*
* int pixel_var2_8x16( pixel *fenc, pixel *fdec, int ssd[2] )
*/
function_x264 pixel_var2_8x16_lsx
vxor.v vr8, vr8, vr8
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr11
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr12
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t2, vr8, 0 // sqr_u
vadd.h vr8, vr10, vr9
vadd.h vr8, vr11, vr8
vadd.h vr8, vr12, vr8
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t3, vr8, 0 // sum_u

addi.d a0, a0, FENC_STRIDE / 2
addi.d a1, a1, FDEC_STRIDE / 2
vxor.v vr8, vr8, vr8
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr11
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr12
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t4, vr8, 0 // sqr_v
vadd.h vr8, vr10, vr9
vadd.h vr8, vr11, vr8
vadd.h vr8, vr12, vr8
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t5, vr8, 0 // sum_v

st.w t2, a2, 0
st.w t4, a2, 4
mul.w t3, t3, t3
mul.w t5, t5, t5
srai.w t3, t3, 7
srai.w t5, t5, 7
sub.w t2, t2, t3
sub.w t4, t4, t5
add.w a0, t2, t4
endfunc_x264

#endif /* !HIGH_BIT_DEPTH */