2025-04-28 08:47:28 +08:00

3549 lines
125 KiB
ArmAsm

/*****************************************************************************
* pixel-a.S: LoongArch pixel metrics
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Hecai Yuan <yuanhecai@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "loongson_asm.S"
#include "loongson_util.S"
#if !HIGH_BIT_DEPTH
/* hmul_8p: per-byte +/-1 multiplier pattern fed to xvdp2.h.bu.b so that one
 * dot-product step yields byte-pair sums (all-ones half) and byte-pair
 * differences (alternating 1,-1 half) in each 128-bit lane. */
const hmul_8p
.byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1
.byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1
endconst
/* mask_ac4b: halfword mask that zeroes elements 0 and 2 of each lane,
 * i.e. drops the two 4x4 DC coefficients before the AC accumulation. */
const mask_ac4b
.short 0, -1, 0, -1, -1, -1, -1, -1
.short 0, -1, 0, -1, -1, -1, -1, -1
endconst
/* mask_ac8: halfword mask that zeroes element 0 of each lane,
 * i.e. drops the 8x8 DC coefficient before the AC accumulation. */
const mask_ac8
.short 0, -1, -1, -1, -1, -1, -1, -1
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
/* LOAD_INC_8x4W n1..n4, n5:
 * Load four 16-byte rows from a0 (row strides a1, t0, t1 precomputed by the
 * caller), advance a0 by t2, then run each row through xvdp2.h.bu.b against
 * the +/-1 pattern in $xr\n5 (hmul_8p) to produce halfword sum/difference
 * pairs in $xr\n1..$xr\n4.
 * The xvpermi.d with 0x05 replicates the row's doublewords across the
 * 256-bit register (via xr18-xr21 as scratch) before the dot product.
 * Clobbers: a0 (advanced), xr18-xr21. */
.macro LOAD_INC_8x4W n1, n2, n3, n4, n5
vld $vr\n1, a0, 0
vldx $vr\n2, a0, a1
vldx $vr\n3, a0, t0
vldx $vr\n4, a0, t1
xvpermi.d xr18, $xr\n1, 0x05
xvpermi.d xr19, $xr\n2, 0x05
xvpermi.d xr20, $xr\n3, 0x05
xvpermi.d xr21, $xr\n4, 0x05
add.d a0, a0, t2
xvdp2.h.bu.b $xr\n1, xr18, $xr\n5
xvdp2.h.bu.b $xr\n2, xr19, $xr\n5
xvdp2.h.bu.b $xr\n3, xr20, $xr\n5
xvdp2.h.bu.b $xr\n4, xr21, $xr\n5
.endm
/* SUMSUB_BADC a, b, c, d: two halfword sum/difference butterflies.
 * Result: a = a_in + b_in, b = b_in - a_in,
 *         c = c_in + d_in, d = d_in - c_in.
 * (b is doubled first so the final subtract gives 2*b - (a+b) = b - a
 * without needing a scratch register; likewise for d.) */
.macro SUMSUB_BADC a, b, c, d
xvadd.h \a, \a, \b
xvadd.h \c, \c, \d
xvadd.h \b, \b, \b
xvadd.h \d, \d, \d
xvsub.h \b, \b, \a
xvsub.h \d, \d, \c
.endm
/* HADAMARD4_V a, b, c, d: 4-point Hadamard transform across the four
 * registers (vertical direction), built from two butterfly stages. */
.macro HADAMARD4_V a, b, c, d
SUMSUB_BADC \a, \b, \c, \d
SUMSUB_BADC \a, \c, \b, \d
.endm
/* HADAMARD_1 a, b, tmp: horizontal butterfly between even/odd halfword
 * positions of a and b.
 * After the pack step, a holds the odd halfwords and b the even halfwords
 * of the (b:a) pair; result: a = even + odd, b = even - odd.
 * tmp is a pure scratch register. */
.macro HADAMARD_1 a, b, tmp
xmov \tmp, \a
xvpackod.h \a, \b, \a
xvpackev.h \b, \b, \tmp
xvadd.h \tmp, \a, \b
xvsub.h \b, \b, \a
xmov \a, \tmp
.endm
/* HADAMARD_2 a, b, c: per-element max of absolute values of the even and
 * odd words of the (b:a) pair; result left in a, c is scratch.
 * Assumes xr17 == 0 (cleared by the caller) so that xvadda.h with xr17
 * computes |x| + 0, i.e. the plain absolute value. */
.macro HADAMARD_2 a, b, c
xvpickod.w \c, \b, \a
xvpickev.w \a, \b, \a
xvadda.h \a, \a, xr17
xvadda.h \c, \c, xr17
xvmax.h \a, \a, \c
.endm
/* HADAMARD_AC_WXH_LASX w, h:
 * Emits pixel_hadamard_ac_\w\()x\h\()_lasx(pixel *pix, intptr_t stride).
 * Calls the 16x8 core once (h == 8) or twice (h == 16) and accumulates its
 * per-lane partial sums: xr8 (8x8-stage sums) and xr9 (4x4-stage sums).
 * Returns a packed 64-bit value in a0:
 *     high 32 bits = (reduce(xr8)) >> 2, low 32 bits = (reduce(xr9)) >> 1.
 * t0/t1/t2 hold 2*, 3*, 4* stride for the core's row loads; xr17 is cleared
 * because HADAMARD_2 (used by the core) relies on it being zero.
 * ra is saved in t4 across the bl since this function is itself a call
 * target; the core must not clobber t4. */
.macro HADAMARD_AC_WXH_LASX w, h
function_x264 pixel_hadamard_ac_\w\()x\h\()_lasx
add.d t0, a1, a1
add.d t1, a1, t0
add.d t2, t1, a1
xvxor.v xr17, xr17, xr17
move t4, ra
bl x264_8_hadamard_ac_16x8_lasx
.if \h == 16
xmov xr11, xr9
xmov xr10, xr8
bl x264_8_hadamard_ac_16x8_lasx
xvadd.h xr9, xr9, xr11
xvadd.h xr8, xr8, xr10
.endif
move ra, t4
// Horizontal reduction of xr8 across both 128-bit lanes.
xvhaddw.wu.hu xr8, xr8, xr8
xvhaddw.du.wu xr8, xr8, xr8
xvhaddw.qu.du xr8, xr8, xr8
xvpickve2gr.wu t0, xr8, 0
xvpickve2gr.wu t1, xr8, 4
add.d t0, t0, t1
// Horizontal reduction of xr9 across both 128-bit lanes.
xvhaddw.wu.hu xr9, xr9, xr9
xvhaddw.du.wu xr9, xr9, xr9
xvhaddw.qu.du xr9, xr9, xr9
xvpickve2gr.wu t1, xr9, 0
xvpickve2gr.wu t2, xr9, 4
add.d t1, t1, t2
// Pack the two normalized sums into one 64-bit return value.
srli.d t0, t0, 2
srli.d t1, t1, 1
slli.d t0, t0, 32
add.d a0, t0, t1
endfunc_x264
.endm
/* hadamard_ac_16x8_lasx: internal core for the pixel_hadamard_ac wrappers.
 * In:  a0 = pixel pointer (advanced past the 16x8 block on return),
 *      a1/t0/t1/t2 = 1x/2x/3x/4x stride, xr17 = 0 (required by HADAMARD_2).
 * Out: xr9 = per-lane 4x4-stage AC accumulators,
 *      xr8 = per-lane 8x8-stage AC accumulators.
 * Must preserve t4 (holds the caller's ra). */
function_x264 hadamard_ac_16x8_lasx
/* Load intermediate variable */
la.local t3, hmul_8p
xvld xr8, t3, 0
// Two 8-row loads, each followed by a vertical 4-point Hadamard.
LOAD_INC_8x4W 0, 1, 2, 3, 8
HADAMARD4_V xr0, xr1, xr2, xr3
LOAD_INC_8x4W 4, 5, 6, 7, 8
HADAMARD4_V xr4, xr5, xr6, xr7
// Horizontal butterflies (xr8 reused as scratch after the table load).
HADAMARD_1 xr0, xr1, xr8
HADAMARD_1 xr2, xr3, xr8
xmov xr18, xr1
HADAMARD_1 xr4, xr5, xr8
HADAMARD_1 xr6, xr7, xr8
xmov xr19, xr2
xmov xr20, xr3
xvadda.h xr1, xr0, xr4
xvsub.h xr21, xr4, xr0
xvadd.h xr0, xr4, xr0
// Mask out the 4x4 DC positions, then accumulate absolute values
// of all remaining coefficients into xr9 (the 4x4-stage sum).
la.local t3, mask_ac4b
xvld xr8, t3, 0
xvand.v xr1, xr1, xr8
xvadda.h xr1, xr1, xr5
xvadda.h xr1, xr1, xr18
xvadda.h xr1, xr1, xr19
xvadda.h xr1, xr1, xr20
xvadda.h xr1, xr1, xr6
xvadda.h xr9, xr1, xr7
// Further butterfly stage toward the 8x8 transform.
xvadd.h xr3, xr7, xr20
xvsub.h xr7, xr7, xr20
xvadd.h xr2, xr6, xr19
xvsub.h xr6, xr6, xr19
xvadd.h xr1, xr5, xr18
xvsub.h xr5, xr5, xr18
// max(|even|,|odd|) combines per HADAMARD_2 (requires xr17 == 0).
HADAMARD_2 xr3, xr7, xr18
HADAMARD_2 xr2, xr6, xr19
HADAMARD_2 xr1, xr5, xr20
xvpickod.w xr5, xr21, xr0
xvpickev.w xr0, xr21, xr0
xmov xr4, xr5
xvadd.h xr5, xr0, xr4
xvsub.h xr4, xr4, xr0
xvadd.h xr2, xr2, xr3
xvadd.h xr2, xr2, xr1
xvadd.h xr2, xr2, xr2
// Mask out the 8x8 DC position and accumulate into xr8 (8x8-stage sum).
la.local t3, mask_ac8
xvld xr8, t3, 0
xvand.v xr0, xr5, xr8
xvadda.h xr2, xr2, xr4
xvadda.h xr8, xr2, xr0
endfunc_x264
/* Instantiate pixel_hadamard_ac_16x8_lasx and pixel_hadamard_ac_16x16_lasx. */
HADAMARD_AC_WXH_LASX 16, 8
HADAMARD_AC_WXH_LASX 16, 16
/* uint64_t hadamard_ac_8x8_lasx(uint8_t *p_pix,
 *                               int32_t i_stride)
 * Returns packed AC sums for an 8x8 block:
 *   high 32 bits = (8x8-stage sum - DC terms), low 32 = (4x4-stage sum - DC).
 * In: a0 = pix, a1 = stride. Result in a0.
 */
function_x264 hadamard_ac_8x8_lasx
/* Load intermediate variable */
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a1, 2
// Load 8 rows of 8 pixels; pack pairs of rows into 128-bit halves,
// then both halves into one 256-bit register pair (xr8/xr9).
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7
vilvl.d vr8, vr1, vr0
vilvl.d vr9, vr3, vr2
vilvl.d vr10, vr5, vr4
vilvl.d vr11, vr7, vr6
xvpermi.q xr8, xr10, 0x02
xvpermi.q xr9, xr11, 0x02
// Horizontal butterflies on byte pairs (widened to halfwords).
xvpickev.b xr12, xr9, xr8
xvpickod.b xr13, xr9, xr8
xvaddwev.h.bu xr8, xr12, xr13
xvaddwod.h.bu xr9, xr12, xr13
xvsubwev.h.bu xr10, xr12, xr13
xvsubwod.h.bu xr11, xr12, xr13
xvadd.h xr12, xr8, xr9
xvadd.h xr13, xr10, xr11
xvsub.h xr14, xr8, xr9
xvsub.h xr15, xr10, xr11
// Transpose 4x4 halfword groups via interleaves.
xvilvl.h xr8, xr13, xr12
xvilvh.h xr9, xr13, xr12
xvilvl.h xr10, xr15, xr14
xvilvh.h xr11, xr15, xr14
xvilvl.w xr12, xr10, xr8
xvilvh.w xr13, xr10, xr8
xvilvl.w xr14, xr11, xr9
xvilvh.w xr15, xr11, xr9
// Vertical butterflies: complete the 4x4 Hadamard stage.
xvadd.h xr8, xr12, xr13
xvadd.h xr9, xr14, xr15
xvsub.h xr10, xr12, xr13
xvsub.h xr11, xr14, xr15
xvadd.h xr12, xr8, xr9
xvadd.h xr13, xr10, xr11
xvsub.h xr14, xr8, xr9
xvsub.h xr15, xr10, xr11
// Collect the four 4x4 DC coefficients (element 0 of each lane/half)
// into t3 for later subtraction.
vpickve2gr.hu t3, vr12, 0
vpickve2gr.hu t4, vr12, 4
xvor.v xr16, xr12, xr12
xvpermi.q xr16, xr16, 0x31
vpickve2gr.hu t5, vr16, 0
vpickve2gr.hu t6, vr16, 4
add.d t3, t3, t4
add.d t5, t5, t6
add.d t3, t3, t5
// t4 = total absolute sum after the 4x4 stage.
xvadda.h xr16, xr12, xr13
xvadda.h xr18, xr14, xr15
xvadd.h xr16, xr16, xr18
xvpermi.d xr17, xr16, 0x4e
xvadd.h xr18, xr16, xr17
xvhaddw.wu.hu xr18, xr18, xr18
xvhaddw.du.wu xr18, xr18, xr18
xvhaddw.qu.du xr18, xr18, xr18
xvpickve2gr.wu t4, xr18, 0
// Rearrange the 4x4 results and apply the remaining butterflies to
// reach the full 8x8 transform.
xvpackev.h xr8, xr13, xr12
xvpackev.h xr9, xr15, xr14
xvpackod.h xr10, xr13, xr12
xvpackod.h xr11, xr15, xr14
xvilvl.d xr12, xr9, xr8
xvilvh.d xr13, xr9, xr8
xvilvl.d xr14, xr11, xr10
xvilvh.d xr15, xr11, xr10
xvor.v xr16, xr12, xr12
xvor.v xr17, xr13, xr13
xvpermi.q xr12, xr14, 0x02
xvpermi.q xr13, xr14, 0x12
xvpermi.q xr16, xr15, 0x03
xvpermi.q xr17, xr15, 0x13
xvadd.h xr8, xr12, xr13
xvsub.h xr9, xr12, xr13
xvadd.h xr10, xr16, xr17
xvsub.h xr11, xr16, xr17
xvadd.h xr12, xr8, xr10
xvadd.h xr13, xr9, xr11
xvsub.h xr14, xr8, xr10
xvsub.h xr15, xr9, xr11
// t5 = total absolute sum after the 8x8 stage.
xvadda.h xr16, xr12, xr13
xvadda.h xr17, xr14, xr15
xvadd.h xr18, xr16, xr17
xvpermi.d xr19, xr18, 0x4e
// NOTE(review): this add is .d while the parallel 4x4-stage reduction
// uses .h — benign only if no 16-bit lane carries can occur here; confirm.
xvadd.d xr19, xr18, xr19
xvhaddw.wu.hu xr19, xr19, xr19
xvhaddw.du.wu xr19, xr19, xr19
xvhaddw.qu.du xr19, xr19, xr19
xvpickve2gr.wu t5, xr19, 0
// Subtract DC terms and pack: a0 = ((t5-t3) << 32) + (t4-t3).
sub.d t4, t4, t3
sub.d t5, t5, t3
slli.d t5, t5, 32
add.d a0, t5, t4
endfunc_x264
/* int x264_pixel_satd_16x16_lasx(pixel *pix1, intptr_t i_pix1,
 *                                pixel *pix2, intptr_t i_pix2)
 * Sum of absolute transformed (4x4 Hadamard) differences over a 16x16
 * block, processed as two 16x8 halves. Result in a0.
 * In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2.
 */
function_x264 pixel_satd_16x16_lasx
// t2/t4/t6 = 2x/4x/3x stride1; t3/t5/t7 = 2x/4x/3x stride2.
slli.d t2, a1, 1
slli.d t3, a3, 1
slli.d t4, a1, 2
slli.d t5, a3, 2
add.d t6, a1, t2
add.d t7, a3, t3
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3
add.d a0, a0, t4
LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15
// Merge rows 0-3 with rows 4-7 into 256-bit registers.
xvpermi.q xr0, xr4, 0x02
xvpermi.q xr1, xr5, 0x02
xvpermi.q xr2, xr6, 0x02
xvpermi.q xr3, xr7, 0x02
xvpermi.q xr8, xr12, 0x02
xvpermi.q xr9, xr13, 0x02
xvpermi.q xr10, xr14, 0x02
xvpermi.q xr11, xr15, 0x02
// HADAMARD4
// Widened even/odd differences pix1 - pix2.
xvsubwev.h.bu xr4, xr0, xr8
xvsubwod.h.bu xr5, xr0, xr8
xvsubwev.h.bu xr6, xr1, xr9
xvsubwod.h.bu xr7, xr1, xr9
xvsubwev.h.bu xr8, xr2, xr10
xvsubwod.h.bu xr9, xr2, xr10
xvsubwev.h.bu xr12, xr3, xr11
xvsubwod.h.bu xr13, xr3, xr11
// First horizontal butterfly stage.
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr8, xr9
xvsub.h xr5, xr8, xr9
xvadd.h xr6, xr12, xr13
xvsub.h xr7, xr12, xr13
// Re-pair even/odd halfwords and apply the second horizontal stage.
xvpackev.h xr8, xr5, xr4
xvpackod.h xr9, xr5, xr4
xvpackev.h xr10, xr7, xr6
xvpackod.h xr11, xr7, xr6
xvpackev.h xr4, xr1, xr0
xvpackod.h xr5, xr1, xr0
xvpackev.h xr6, xr3, xr2
xvpackod.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr8, xr9
xvsub.h xr5, xr8, xr9
xvadd.h xr6, xr10, xr11
xvsub.h xr7, xr10, xr11
// Transpose (interleave) then two vertical butterfly stages.
xvilvl.h xr8, xr1, xr0
xvilvl.h xr9, xr3, xr2
xvilvl.h xr10, xr5, xr4
xvilvl.h xr11, xr7, xr6
xvilvh.h xr0, xr1, xr0
xvilvh.h xr1, xr3, xr2
xvilvh.h xr2, xr5, xr4
xvilvh.h xr3, xr7, xr6
xvadd.h xr4, xr8, xr9
xvadd.h xr6, xr10, xr11
xvsub.h xr5, xr8, xr9
xvsub.h xr7, xr10, xr11
xvadd.h xr8, xr4, xr6
xvadd.h xr9, xr5, xr7
xvsub.h xr10, xr4, xr6
xvsub.h xr11, xr5, xr7
xvadd.h xr4, xr0, xr1
xvadd.h xr6, xr2, xr3
xvsub.h xr5, xr0, xr1
xvsub.h xr7, xr2, xr3
xvadd.h xr0, xr4, xr6
xvadd.h xr1, xr5, xr7
xvsub.h xr2, xr4, xr6
xvsub.h xr3, xr5, xr7
// Accumulate absolute values of all coefficients; keep in xr16.
xvadda.h xr8, xr8, xr9
xvadda.h xr9, xr10, xr11
xvadda.h xr0, xr0, xr1
xvadda.h xr1, xr2, xr3
xvadd.h xr8, xr8, xr9
xvadd.h xr0, xr0, xr1
xvadd.h xr16, xr0, xr8
// Second 16x8 half (same pipeline as above).
add.d a0, a0, t4
add.d a2, a2, t5
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3
add.d a0, a0, t4
LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15
xvpermi.q xr0, xr4, 0x02
xvpermi.q xr1, xr5, 0x02
xvpermi.q xr2, xr6, 0x02
xvpermi.q xr3, xr7, 0x02
xvpermi.q xr8, xr12, 0x02
xvpermi.q xr9, xr13, 0x02
xvpermi.q xr10, xr14, 0x02
xvpermi.q xr11, xr15, 0x02
// HADAMARD4
xvsubwev.h.bu xr4, xr0, xr8
xvsubwod.h.bu xr5, xr0, xr8
xvsubwev.h.bu xr6, xr1, xr9
xvsubwod.h.bu xr7, xr1, xr9
xvsubwev.h.bu xr8, xr2, xr10
xvsubwod.h.bu xr9, xr2, xr10
xvsubwev.h.bu xr12, xr3, xr11
xvsubwod.h.bu xr13, xr3, xr11
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr8, xr9
xvsub.h xr5, xr8, xr9
xvadd.h xr6, xr12, xr13
xvsub.h xr7, xr12, xr13
xvpackev.h xr8, xr5, xr4
xvpackod.h xr9, xr5, xr4
xvpackev.h xr10, xr7, xr6
xvpackod.h xr11, xr7, xr6
xvpackev.h xr4, xr1, xr0
xvpackod.h xr5, xr1, xr0
xvpackev.h xr6, xr3, xr2
xvpackod.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr8, xr9
xvsub.h xr5, xr8, xr9
xvadd.h xr6, xr10, xr11
xvsub.h xr7, xr10, xr11
xvilvl.h xr8, xr1, xr0
xvilvl.h xr9, xr3, xr2
xvilvl.h xr10, xr5, xr4
xvilvl.h xr11, xr7, xr6
xvilvh.h xr0, xr1, xr0
xvilvh.h xr1, xr3, xr2
xvilvh.h xr2, xr5, xr4
xvilvh.h xr3, xr7, xr6
xvadd.h xr4, xr8, xr9
xvadd.h xr6, xr10, xr11
xvsub.h xr5, xr8, xr9
xvsub.h xr7, xr10, xr11
xvadd.h xr8, xr4, xr6
xvadd.h xr9, xr5, xr7
xvsub.h xr10, xr4, xr6
xvsub.h xr11, xr5, xr7
xvadd.h xr4, xr0, xr1
xvadd.h xr6, xr2, xr3
xvsub.h xr5, xr0, xr1
xvsub.h xr7, xr2, xr3
xvadd.h xr0, xr4, xr6
xvadd.h xr1, xr5, xr7
xvsub.h xr2, xr4, xr6
xvsub.h xr3, xr5, xr7
xvadda.h xr8, xr8, xr9
xvadda.h xr9, xr10, xr11
xvadda.h xr0, xr0, xr1
xvadda.h xr1, xr2, xr3
xvadd.h xr8, xr8, xr9
xvadd.h xr0, xr0, xr1
xvadd.h xr0, xr0, xr8
// Add first-half totals, reduce horizontally, and halve
// (unnormalized transform counts each value twice).
xvadd.h xr0, xr0, xr16
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.du.wu xr0, xr0, xr0
xvhaddw.qu.du xr0, xr0, xr0
xvpickve2gr.wu t0, xr0, 0
xvpickve2gr.wu t1, xr0, 4
add.w t0, t0, t1
srli.d a0, t0, 1
endfunc_x264
/* int x264_pixel_satd_16x8_lasx(pixel *pix1, intptr_t i_pix1,
 *                               pixel *pix2, intptr_t i_pix2)
 * SATD of a 16x8 block — a single pass of the 16x8 pipeline used in
 * pixel_satd_16x16_lasx. Result in a0.
 * In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2.
 */
function_x264 pixel_satd_16x8_lasx
// t2/t4/t6 = 2x/4x/3x stride1; t3/t5/t7 = 2x/4x/3x stride2.
slli.d t2, a1, 1
slli.d t3, a3, 1
slli.d t4, t2, 1
slli.d t5, t3, 1
add.d t6, a1, t2
add.d t7, a3, t3
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3
add.d a0, a0, t4
LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15
// Merge rows 0-3 with rows 4-7 into 256-bit registers.
xvpermi.q xr0, xr4, 0x02
xvpermi.q xr1, xr5, 0x02
xvpermi.q xr2, xr6, 0x02
xvpermi.q xr3, xr7, 0x02
xvpermi.q xr8, xr12, 0x02
xvpermi.q xr9, xr13, 0x02
xvpermi.q xr10, xr14, 0x02
xvpermi.q xr11, xr15, 0x02
// HADAMARD4
// Widened even/odd differences pix1 - pix2.
xvsubwev.h.bu xr4, xr0, xr8
xvsubwod.h.bu xr5, xr0, xr8
xvsubwev.h.bu xr6, xr1, xr9
xvsubwod.h.bu xr7, xr1, xr9
xvsubwev.h.bu xr8, xr2, xr10
xvsubwod.h.bu xr9, xr2, xr10
xvsubwev.h.bu xr12, xr3, xr11
xvsubwod.h.bu xr13, xr3, xr11
// Horizontal butterfly stages.
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr8, xr9
xvsub.h xr5, xr8, xr9
xvadd.h xr6, xr12, xr13
xvsub.h xr7, xr12, xr13
xvpackev.h xr8, xr5, xr4
xvpackod.h xr9, xr5, xr4
xvpackev.h xr10, xr7, xr6
xvpackod.h xr11, xr7, xr6
xvpackev.h xr4, xr1, xr0
xvpackod.h xr5, xr1, xr0
xvpackev.h xr6, xr3, xr2
xvpackod.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr8, xr9
xvsub.h xr5, xr8, xr9
xvadd.h xr6, xr10, xr11
xvsub.h xr7, xr10, xr11
// Transpose (interleave) then vertical butterfly stages.
xvilvl.h xr8, xr1, xr0
xvilvl.h xr9, xr3, xr2
xvilvl.h xr10, xr5, xr4
xvilvl.h xr11, xr7, xr6
xvilvh.h xr0, xr1, xr0
xvilvh.h xr1, xr3, xr2
xvilvh.h xr2, xr5, xr4
xvilvh.h xr3, xr7, xr6
xvadd.h xr4, xr8, xr9
xvadd.h xr6, xr10, xr11
xvsub.h xr5, xr8, xr9
xvsub.h xr7, xr10, xr11
xvadd.h xr8, xr4, xr6
xvadd.h xr9, xr5, xr7
xvsub.h xr10, xr4, xr6
xvsub.h xr11, xr5, xr7
xvadd.h xr4, xr0, xr1
xvadd.h xr6, xr2, xr3
xvsub.h xr5, xr0, xr1
xvsub.h xr7, xr2, xr3
xvadd.h xr0, xr4, xr6
xvadd.h xr1, xr5, xr7
xvsub.h xr2, xr4, xr6
xvsub.h xr3, xr5, xr7
// Absolute-value accumulation, horizontal reduction, and halving.
xvadda.h xr8, xr8, xr9
xvadda.h xr9, xr10, xr11
xvadda.h xr0, xr0, xr1
xvadda.h xr1, xr2, xr3
xvadd.h xr8, xr8, xr9
xvadd.h xr0, xr0, xr1
xvadd.h xr0, xr0, xr8
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.du.wu xr0, xr0, xr0
xvhaddw.qu.du xr0, xr0, xr0
xvpickve2gr.wu t0, xr0, 0
xvpickve2gr.wu t1, xr0, 4
add.w t0, t0, t1
srli.d a0, t0, 1
endfunc_x264
/* int x264_pixel_satd_8x16_lasx(pixel *pix1, intptr_t i_pix1,
 *                               pixel *pix2, intptr_t i_pix2)
 * SATD of an 8x16 block, processed as two 8x8 halves; the first half's
 * per-lane totals are kept in xr16 and folded in at the end. Result in a0.
 * In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2.
 */
function_x264 pixel_satd_8x16_lasx
// t2/t3/t4 = 2x/3x/4x stride1; t5/t6/t7 = 2x/3x/4x stride2.
slli.d t2, a1, 1
add.d t3, a1, t2
slli.d t4, a1, 2
slli.d t5, a3, 1
add.d t6, a3, t5
slli.d t7, a3, 2
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3
add.d a0, a0, t4
LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11
add.d a2, a2, t7
LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15
// Pack pairs of 8-pixel rows into 128-bit halves, then 256-bit registers.
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6
xvpermi.q xr0, xr2, 0x02
xvpermi.q xr1, xr3, 0x02
vilvl.d vr2, vr9, vr8
vilvl.d vr3, vr11, vr10
vilvl.d vr4, vr13, vr12
vilvl.d vr5, vr15, vr14
xvpermi.q xr2, xr4, 0x02
xvpermi.q xr3, xr5, 0x02
// HADAMARD4
// Widened even/odd differences, then horizontal butterflies.
xvsubwev.h.bu xr4, xr0, xr2
xvsubwod.h.bu xr5, xr0, xr2
xvsubwev.h.bu xr6, xr1, xr3
xvsubwod.h.bu xr7, xr1, xr3
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvpackev.h xr4, xr1, xr0
xvpackod.h xr5, xr1, xr0
xvpackev.h xr6, xr3, xr2
xvpackod.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
// Transpose then vertical butterflies.
xvilvl.h xr4, xr1, xr0
xvilvh.h xr5, xr1, xr0
xvilvl.h xr6, xr3, xr2
xvilvh.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr1, xr4, xr5
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr0, xr2
xvadd.h xr5, xr1, xr3
xvsub.h xr6, xr0, xr2
xvsub.h xr7, xr1, xr3
// First-half absolute sums held in xr16.
xvadda.h xr0, xr4, xr5
xvadda.h xr1, xr6, xr7
xvadd.h xr16, xr0, xr1
// Second 8x8 half (same pipeline).
add.d a0, a0, t4
add.d a2, a2, t7
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3
add.d a0, a0, t4
LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11
add.d a2, a2, t7
LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6
xvpermi.q xr0, xr2, 0x02
xvpermi.q xr1, xr3, 0x02
vilvl.d vr2, vr9, vr8
vilvl.d vr3, vr11, vr10
vilvl.d vr4, vr13, vr12
vilvl.d vr5, vr15, vr14
xvpermi.q xr2, xr4, 0x02
xvpermi.q xr3, xr5, 0x02
// HADAMARD4
xvsubwev.h.bu xr4, xr0, xr2
xvsubwod.h.bu xr5, xr0, xr2
xvsubwev.h.bu xr6, xr1, xr3
xvsubwod.h.bu xr7, xr1, xr3
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvpackev.h xr4, xr1, xr0
xvpackod.h xr5, xr1, xr0
xvpackev.h xr6, xr3, xr2
xvpackod.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvilvl.h xr4, xr1, xr0
xvilvh.h xr5, xr1, xr0
xvilvl.h xr6, xr3, xr2
xvilvh.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr1, xr4, xr5
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr0, xr2
xvadd.h xr5, xr1, xr3
xvsub.h xr6, xr0, xr2
xvsub.h xr7, xr1, xr3
xvadda.h xr0, xr4, xr5
xvadda.h xr1, xr6, xr7
xvadd.h xr0, xr0, xr1
// Fold in first half, reduce horizontally, halve.
xvadd.h xr0, xr0, xr16
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.du.wu xr0, xr0, xr0
xvhaddw.qu.du xr0, xr0, xr0
xvpickve2gr.wu t0, xr0, 0
xvpickve2gr.wu t1, xr0, 4
add.w t0, t0, t1
srli.d a0, t0, 1
endfunc_x264
/* int x264_pixel_satd_8x8_lasx(pixel *pix1, intptr_t i_pix1,
 *                              pixel *pix2, intptr_t i_pix2)
 * SATD of an 8x8 block — one pass of the 8x8 pipeline used in
 * pixel_satd_8x16_lasx. Result in a0.
 * In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2.
 */
function_x264 pixel_satd_8x8_lasx
// t2/t3/t4 = 2x/3x/4x stride1; t5/t6/t7 = 2x/3x/4x stride2.
slli.d t2, a1, 1
slli.d t5, a3, 1
add.d t3, a1, t2
add.d t6, a3, t5
slli.d t4, t2, 1
slli.d t7, t5, 1
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3
add.d a0, a0, t4
LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11
add.d a2, a2, t7
LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15
// Pack pairs of 8-pixel rows into 128-bit halves, then 256-bit registers.
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6
xvpermi.q xr0, xr2, 0x02
xvpermi.q xr1, xr3, 0x02
vilvl.d vr2, vr9, vr8
vilvl.d vr3, vr11, vr10
vilvl.d vr4, vr13, vr12
vilvl.d vr5, vr15, vr14
xvpermi.q xr2, xr4, 0x02
xvpermi.q xr3, xr5, 0x02
// HADAMARD4
// Widened even/odd differences, then horizontal butterflies.
xvsubwev.h.bu xr4, xr0, xr2
xvsubwod.h.bu xr5, xr0, xr2
xvsubwev.h.bu xr6, xr1, xr3
xvsubwod.h.bu xr7, xr1, xr3
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
xvpackev.h xr4, xr1, xr0
xvpackod.h xr5, xr1, xr0
xvpackev.h xr6, xr3, xr2
xvpackod.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvsub.h xr1, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr3, xr6, xr7
// Transpose then vertical butterflies.
xvilvl.h xr4, xr1, xr0
xvilvh.h xr5, xr1, xr0
xvilvl.h xr6, xr3, xr2
xvilvh.h xr7, xr3, xr2
xvadd.h xr0, xr4, xr5
xvadd.h xr2, xr6, xr7
xvsub.h xr1, xr4, xr5
xvsub.h xr3, xr6, xr7
xvadd.h xr4, xr0, xr2
xvadd.h xr5, xr1, xr3
xvsub.h xr6, xr0, xr2
xvsub.h xr7, xr1, xr3
// Absolute-value accumulation, horizontal reduction, halving.
xvadda.h xr0, xr4, xr5
xvadda.h xr1, xr6, xr7
xvadd.h xr0, xr0, xr1
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.du.wu xr0, xr0, xr0
xvhaddw.qu.du xr0, xr0, xr0
xvpickve2gr.wu t0, xr0, 0
xvpickve2gr.wu t1, xr0, 4
add.w t0, t0, t1
srli.d a0, t0, 1
endfunc_x264
/* int x264_pixel_satd_8x4_lasx(pixel *pix1, intptr_t i_pix1,
 *                              pixel *pix2, intptr_t i_pix2)
 * SATD of an 8x4 block: two 4x4 Hadamard transforms side by side.
 * Result in a0. In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2.
 */
function_x264 pixel_satd_8x4_lasx
// t2/t4 = 2x/3x stride1; t3/t5 = 2x/3x stride2.
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4
LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8
// Pack row pairs, then the four rows into one 256-bit register each.
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
// Even/odd differences pix1 - pix2, widened to halfwords.
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
// Vertical butterfly stages across row pairs.
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr9, xr11, xr13
xvsub.h xr10, xr11, xr13
// Absolute-value accumulation and horizontal reduction.
xvpackev.d xr11, xr10, xr9
xvpackod.d xr12, xr10, xr9
xvadda.h xr11, xr11, xr12
xvhaddw.wu.hu xr11, xr11, xr11
xvhaddw.du.wu xr11, xr11, xr11
xvhaddw.qu.du xr11, xr11, xr11
xvpickve2gr.wu t4, xr11, 0
xvpickve2gr.wu t5, xr11, 4
add.d t4, t4, t5
// Halve: unnormalized transform counts each value twice.
srli.d a0, t4, 1
endfunc_x264
/* int x264_pixel_satd_4x16_lasx(pixel *pix1, intptr_t i_pix1,
 *                               pixel *pix2, intptr_t i_pix2)
 * SATD of a 4x16 block, processed as two 4x8 passes whose scalar totals
 * (t7, t6) are summed at the end. Result in a0.
 * In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2.
 */
function_x264 pixel_satd_4x16_lasx
// t2/t4 = 2x/3x stride1; t3/t5 = 2x/3x stride2.
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4
LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8
// Pack eight 4-pixel rows (rows 0-3 here, 4-7 below) into xr1/xr5.
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr9, vr3, vr1
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr10, vr7, vr5
slli.d t0, a1, 2
slli.d t1, a3, 2
// Load data from pix1 and pix2
add.d a0, a0, t0
LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4
add.d a2, a2, t1
LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr5, vr7, vr5
xvpermi.q xr1, xr9, 0x20
xvpermi.q xr5, xr10, 0x20
// 4x4 Hadamard on differences; same ladder as pixel_satd_4x8_lasx.
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* b0 + b1 */
xvsub.h xr12, xr9, xr10 /* b0 - b1 */
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadda.h xr9, xr9, xr10
xvhaddw.wu.hu xr9, xr9, xr9
xvhaddw.du.wu xr9, xr9, xr9
xvhaddw.qu.du xr9, xr9, xr9
xvpickve2gr.wu t6, xr9, 0
xvpickve2gr.wu t7, xr9, 4
// t7 = first-half (rows 0-7) total.
add.d t7, t6, t7
// Second 4x8 half (rows 8-15), same pipeline.
// Load data from pix1 and pix2
add.d a0, a0, t0
LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4
add.d a2, a2, t1
LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr9, vr3, vr1
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr10, vr7, vr5
// Load data from pix1 and pix2
add.d a0, a0, t0
LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4
add.d a2, a2, t1
LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr5, vr7, vr5
xvpermi.q xr1, xr9, 0x20
xvpermi.q xr5, xr10, 0x20
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* b0 + b1 */
xvsub.h xr12, xr9, xr10 /* b0 - b1 */
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadda.h xr9, xr9, xr10
xvhaddw.wu.hu xr9, xr9, xr9
xvhaddw.du.wu xr9, xr9, xr9
xvhaddw.qu.du xr9, xr9, xr9
xvpickve2gr.wu t6, xr9, 0
xvpickve2gr.wu t5, xr9, 4
add.d t6, t5, t6
// Combine both halves and halve (transform gain).
add.d t7, t6, t7
srli.d a0, t7, 1
endfunc_x264
/* int x264_pixel_satd_4x8_lasx(pixel *pix1, intptr_t i_pix1,
 *                              pixel *pix2, intptr_t i_pix2)
 * SATD of a 4x8 block: both 4x4 sub-blocks transformed in one 256-bit
 * register. Result in a0.
 * In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2.
 */
function_x264 pixel_satd_4x8_lasx
// t2/t4 = 2x/3x stride1; t3/t5 = 2x/3x stride2.
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4
LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8
// Pack rows 0-3 of each source into 64-bit groups.
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr9, vr3, vr1
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr10, vr7, vr5
slli.d t0, a1, 2
slli.d t1, a3, 2
add.d a0, a0, t0
add.d a2, a2, t1
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4
LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8
// Pack rows 4-7, then merge both halves into xr1/xr5.
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr5, vr7, vr5
xvpermi.q xr1, xr9, 0x20
xvpermi.q xr5, xr10, 0x20
// 4x4 Hadamard ladder on the widened differences.
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* b0 + b1 */
xvsub.h xr12, xr9, xr10 /* b0 - b1 */
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
// Absolute-value accumulation, horizontal reduction, halving.
xvadda.h xr9, xr9, xr10
xvhaddw.wu.hu xr9, xr9, xr9
xvhaddw.du.wu xr9, xr9, xr9
xvhaddw.qu.du xr9, xr9, xr9
xvpickve2gr.wu t6, xr9, 0
xvpickve2gr.wu t7, xr9, 4
add.d t6, t6, t7
srli.d a0, t6, 1
endfunc_x264
/* int x264_pixel_satd_4x4_lsx(pixel *pix1, intptr_t i_pix1,
 *                             pixel *pix2, intptr_t i_pix2)
 */
/* pixel_satd_4x4_lsx_core out:
 * 4x4 Hadamard + abs on the difference of two 4x4 blocks already loaded as
 * rows in vr1..vr4 (pix1) and vr5..vr8 (pix2, low 32 bits each).
 * Leaves per-element absolute sums in \out; caller reduces and halves.
 * Clobbers vr1-vr12. */
.macro pixel_satd_4x4_lsx_core out
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr5, vr7, vr5
// Widened even/odd differences pix1 - pix2.
vsubwev.h.bu vr9, vr1, vr5
vsubwod.h.bu vr10, vr1, vr5
vadd.h vr11, vr9, vr10 /* a0 + a1 */
vsub.h vr12, vr9, vr10 /* a0 - a1 */
vpackev.h vr9, vr12, vr11
vpackod.h vr10, vr12, vr11
vadd.h vr11, vr9, vr10 /* b0 + b1 */
vsub.h vr12, vr9, vr10 /* b0 - b1 */
vpackev.w vr9, vr12, vr11
vpackod.w vr10, vr12, vr11
vadd.h vr11, vr9, vr10 /* HADAMARD4 */
vsub.h vr12, vr9, vr10
vpackev.d vr9, vr12, vr11
vpackod.d vr10, vr12, vr11
vadd.h vr11, vr9, vr10
vsub.h vr12, vr9, vr10
vpackev.d vr9, vr12, vr11
vpackod.d vr10, vr12, vr11
vadda.h \out, vr9, vr10
.endm
/* pixel_satd_4x4_lsx: SATD of a single 4x4 block (LSX, 128-bit).
 * In: a0/a1 = pix1/stride1, a2/a3 = pix2/stride2. Result in a0. */
function_x264 pixel_satd_4x4_lsx
// t2/t4 = 2x/3x stride1; t3/t5 = 2x/3x stride2.
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
// Load data from pix1 and pix2
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
// Transform + abs in the shared core, then reduce and halve.
pixel_satd_4x4_lsx_core vr13
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu t5, vr13, 0
srli.d a0, t5, 1
endfunc_x264
/*
 * int pixel_ssd_16x16_lasx(const Pixel *pix1, intptr_t stride_pix1,
 *                          const Pixel *pix2, intptr_t stride_pix2)
 * Sum of squared differences over a 16x16 block, processed as two 16x8
 * halves; the first half's 32-bit partial sums are kept in xr16.
 * Result in a0.
 */
function_x264 pixel_ssd_16x16_lasx
// t0/t1/t2 = 2x/3x/4x stride1 (t2 = t1 + stride1); same for t3/t4/t5.
slli.d t0, a1, 1
add.d t1, a1, t0
add.d t2, a1, t1
slli.d t3, a3, 1
add.d t4, a3, t3
add.d t5, a3, t4
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15
// Zero-extend 16 bytes per row to 16 halfwords.
vext2xv.hu.bu xr0, xr0
vext2xv.hu.bu xr1, xr1
vext2xv.hu.bu xr2, xr2
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr4, xr4
vext2xv.hu.bu xr5, xr5
vext2xv.hu.bu xr6, xr6
vext2xv.hu.bu xr7, xr7
vext2xv.hu.bu xr8, xr8
vext2xv.hu.bu xr9, xr9
vext2xv.hu.bu xr10, xr10
vext2xv.hu.bu xr11, xr11
vext2xv.hu.bu xr12, xr12
vext2xv.hu.bu xr13, xr13
vext2xv.hu.bu xr14, xr14
vext2xv.hu.bu xr15, xr15
// Calculate the square of the difference
xvsub.h xr0, xr0, xr8
xvsub.h xr1, xr1, xr9
xvsub.h xr2, xr2, xr10
xvsub.h xr3, xr3, xr11
xvsub.h xr4, xr4, xr12
xvsub.h xr5, xr5, xr13
xvsub.h xr6, xr6, xr14
xvsub.h xr7, xr7, xr15
xvmul.h xr0, xr0, xr0
xvmul.h xr1, xr1, xr1
xvmul.h xr2, xr2, xr2
xvmul.h xr3, xr3, xr3
xvmul.h xr4, xr4, xr4
xvmul.h xr5, xr5, xr5
xvmul.h xr6, xr6, xr6
xvmul.h xr7, xr7, xr7
// Widen pairwise to 32 bits before accumulating.
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.wu.hu xr1, xr1, xr1
xvhaddw.wu.hu xr2, xr2, xr2
xvhaddw.wu.hu xr3, xr3, xr3
xvhaddw.wu.hu xr4, xr4, xr4
xvhaddw.wu.hu xr5, xr5, xr5
xvhaddw.wu.hu xr6, xr6, xr6
xvhaddw.wu.hu xr7, xr7, xr7
xvadd.w xr16, xr0, xr1
xvadd.w xr17, xr2, xr3
xvadd.w xr18, xr4, xr5
xvadd.w xr19, xr6, xr7
xvadd.w xr16, xr16, xr17
xvadd.w xr18, xr18, xr19
xvadd.w xr16, xr16, xr18
// Second 16x8 half (same pipeline).
// Load data from pix1 and pix2
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15
vext2xv.hu.bu xr0, xr0
vext2xv.hu.bu xr1, xr1
vext2xv.hu.bu xr2, xr2
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr4, xr4
vext2xv.hu.bu xr5, xr5
vext2xv.hu.bu xr6, xr6
vext2xv.hu.bu xr7, xr7
vext2xv.hu.bu xr8, xr8
vext2xv.hu.bu xr9, xr9
vext2xv.hu.bu xr10, xr10
vext2xv.hu.bu xr11, xr11
vext2xv.hu.bu xr12, xr12
vext2xv.hu.bu xr13, xr13
vext2xv.hu.bu xr14, xr14
vext2xv.hu.bu xr15, xr15
// Calculate the square of the difference
xvsub.h xr0, xr0, xr8
xvsub.h xr1, xr1, xr9
xvsub.h xr2, xr2, xr10
xvsub.h xr3, xr3, xr11
xvsub.h xr4, xr4, xr12
xvsub.h xr5, xr5, xr13
xvsub.h xr6, xr6, xr14
xvsub.h xr7, xr7, xr15
xvmul.h xr0, xr0, xr0
xvmul.h xr1, xr1, xr1
xvmul.h xr2, xr2, xr2
xvmul.h xr3, xr3, xr3
xvmul.h xr4, xr4, xr4
xvmul.h xr5, xr5, xr5
xvmul.h xr6, xr6, xr6
xvmul.h xr7, xr7, xr7
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.wu.hu xr1, xr1, xr1
xvhaddw.wu.hu xr2, xr2, xr2
xvhaddw.wu.hu xr3, xr3, xr3
xvhaddw.wu.hu xr4, xr4, xr4
xvhaddw.wu.hu xr5, xr5, xr5
xvhaddw.wu.hu xr6, xr6, xr6
xvhaddw.wu.hu xr7, xr7, xr7
xvadd.w xr0, xr0, xr1
xvadd.w xr2, xr2, xr3
xvadd.w xr4, xr4, xr5
xvadd.w xr6, xr6, xr7
xvadd.w xr0, xr0, xr2
xvadd.w xr4, xr4, xr6
xvadd.w xr0, xr0, xr4
xvadd.w xr0, xr0, xr16
// Calculate the sum
xvhaddw.d.w xr0, xr0, xr0
xvhaddw.q.d xr0, xr0, xr0
xvpickve2gr.w t2, xr0, 0
xvpickve2gr.w t3, xr0, 4
add.d a0, t2, t3
endfunc_x264
/*
 * int pixel_ssd_16x8_lasx(const Pixel *pix1, intptr_t stride_pix1,
 *                         const Pixel *pix2, intptr_t stride_pix2)
 * Sum of squared differences over a 16x8 block — one pass of the pipeline
 * used by pixel_ssd_16x16_lasx. Result in a0.
 */
function_x264 pixel_ssd_16x8_lasx
// t0/t1/t2 = 2x/3x/4x stride1; t3/t4/t5 = 2x/3x/4x stride2.
slli.d t0, a1, 1
add.d t1, a1, t0
add.d t2, a1, t1
slli.d t3, a3, 1
add.d t4, a3, t3
add.d t5, a3, t4
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15
// Zero-extend 16 bytes per row to 16 halfwords.
vext2xv.hu.bu xr0, xr0
vext2xv.hu.bu xr1, xr1
vext2xv.hu.bu xr2, xr2
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr4, xr4
vext2xv.hu.bu xr5, xr5
vext2xv.hu.bu xr6, xr6
vext2xv.hu.bu xr7, xr7
vext2xv.hu.bu xr8, xr8
vext2xv.hu.bu xr9, xr9
vext2xv.hu.bu xr10, xr10
vext2xv.hu.bu xr11, xr11
vext2xv.hu.bu xr12, xr12
vext2xv.hu.bu xr13, xr13
vext2xv.hu.bu xr14, xr14
vext2xv.hu.bu xr15, xr15
// Calculate the square of the difference
xvsub.h xr0, xr0, xr8
xvsub.h xr1, xr1, xr9
xvsub.h xr2, xr2, xr10
xvsub.h xr3, xr3, xr11
xvsub.h xr4, xr4, xr12
xvsub.h xr5, xr5, xr13
xvsub.h xr6, xr6, xr14
xvsub.h xr7, xr7, xr15
xvmul.h xr0, xr0, xr0
xvmul.h xr1, xr1, xr1
xvmul.h xr2, xr2, xr2
xvmul.h xr3, xr3, xr3
xvmul.h xr4, xr4, xr4
xvmul.h xr5, xr5, xr5
xvmul.h xr6, xr6, xr6
xvmul.h xr7, xr7, xr7
// Widen pairwise to 32 bits and tree-accumulate.
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.wu.hu xr1, xr1, xr1
xvhaddw.wu.hu xr2, xr2, xr2
xvhaddw.wu.hu xr3, xr3, xr3
xvhaddw.wu.hu xr4, xr4, xr4
xvhaddw.wu.hu xr5, xr5, xr5
xvhaddw.wu.hu xr6, xr6, xr6
xvhaddw.wu.hu xr7, xr7, xr7
xvadd.w xr0, xr0, xr1
xvadd.w xr2, xr2, xr3
xvadd.w xr4, xr4, xr5
xvadd.w xr6, xr6, xr7
xvadd.w xr0, xr0, xr2
xvadd.w xr4, xr4, xr6
xvadd.w xr0, xr0, xr4
// Calculate the sum
xvhaddw.d.w xr0, xr0, xr0
xvhaddw.q.d xr0, xr0, xr0
xvpickve2gr.w t2, xr0, 0
xvpickve2gr.w t3, xr0, 4
add.d a0, t2, t3
endfunc_x264
/*
 * int pixel_ssd_8x16_lasx(const Pixel *pix1, intptr_t stride_pix1,
 *                         const Pixel *pix2, intptr_t stride_pix2)
 * Sum of squared differences over an 8x16 block, processed as two 8x8
 * halves; the first half's 32-bit partial sums are kept in xr16.
 * Result in a0.
 */
function_x264 pixel_ssd_8x16_lasx
// t0/t1/t2 = 2x/3x/4x stride1; t3/t4/t5 = 2x/3x/4x stride2.
slli.d t0, a1, 1
add.d t1, a1, t0
add.d t2, a1, t1
slli.d t3, a3, 1
add.d t4, a3, t3
add.d t5, a3, t4
// Load data from pix1 and pix2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15
// Pack two 8-pixel rows per register, then widen to halfwords.
vilvl.d vr0, vr4, vr0
vilvl.d vr1, vr5, vr1
vilvl.d vr2, vr6, vr2
vilvl.d vr3, vr7, vr3
vilvl.d vr8, vr12, vr8
vilvl.d vr9, vr13, vr9
vilvl.d vr10, vr14, vr10
vilvl.d vr11, vr15, vr11
vext2xv.hu.bu xr0, xr0
vext2xv.hu.bu xr1, xr1
vext2xv.hu.bu xr2, xr2
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr8, xr8
vext2xv.hu.bu xr9, xr9
vext2xv.hu.bu xr10, xr10
vext2xv.hu.bu xr11, xr11
// Calculate the square of the difference
xvsub.h xr0, xr0, xr8
xvsub.h xr1, xr1, xr9
xvsub.h xr2, xr2, xr10
xvsub.h xr3, xr3, xr11
xvmul.h xr0, xr0, xr0
xvmul.h xr1, xr1, xr1
xvmul.h xr2, xr2, xr2
xvmul.h xr3, xr3, xr3
// Widen pairwise to 32 bits; keep first-half sums in xr16.
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.wu.hu xr1, xr1, xr1
xvhaddw.wu.hu xr2, xr2, xr2
xvhaddw.wu.hu xr3, xr3, xr3
xvadd.w xr0, xr0, xr1
xvadd.w xr2, xr2, xr3
xvadd.w xr16, xr0, xr2
// Second 8x8 half (same pipeline).
// Load data from pix1 and pix2
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15
vilvl.d vr0, vr4, vr0
vilvl.d vr1, vr5, vr1
vilvl.d vr2, vr6, vr2
vilvl.d vr3, vr7, vr3
vilvl.d vr8, vr12, vr8
vilvl.d vr9, vr13, vr9
vilvl.d vr10, vr14, vr10
vilvl.d vr11, vr15, vr11
vext2xv.hu.bu xr0, xr0
vext2xv.hu.bu xr1, xr1
vext2xv.hu.bu xr2, xr2
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr8, xr8
vext2xv.hu.bu xr9, xr9
vext2xv.hu.bu xr10, xr10
vext2xv.hu.bu xr11, xr11
// Calculate the square of the difference
xvsub.h xr0, xr0, xr8
xvsub.h xr1, xr1, xr9
xvsub.h xr2, xr2, xr10
xvsub.h xr3, xr3, xr11
xvmul.h xr0, xr0, xr0
xvmul.h xr1, xr1, xr1
xvmul.h xr2, xr2, xr2
xvmul.h xr3, xr3, xr3
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.wu.hu xr1, xr1, xr1
xvhaddw.wu.hu xr2, xr2, xr2
xvhaddw.wu.hu xr3, xr3, xr3
xvadd.w xr0, xr0, xr1
xvadd.w xr2, xr2, xr3
xvadd.w xr0, xr0, xr2
xvadd.w xr0, xr0, xr16
// Calculate the sum
xvhaddw.d.w xr0, xr0, xr0
xvhaddw.q.d xr0, xr0, xr0
xvpickve2gr.w t2, xr0, 0
xvpickve2gr.w t3, xr0, 4
add.d a0, t2, t3
endfunc_x264
/*
* int pixel_ssd_8x8_lasx(const Pixel *pix1, intptr_t stride_pix1,
* const Pixel *pix2, intptr_t stride_pix2)
*/
function_x264 pixel_ssd_8x8_lasx
// SSD over an 8x8 block, u8 pixels.
// In:  a0 = pix1, a1 = stride_pix1, a2 = pix2, a3 = stride_pix2
// Out: a0 = sum of squared differences
// Row offsets: t0/t1/t2 = 2/3/4 * stride_pix1, t3/t4/t5 = 2/3/4 * stride_pix2
slli.d t0, a1, 1
add.d t1, a1, t0
add.d t2, a1, t1
slli.d t3, a3, 1
add.d t4, a3, t3
add.d t5, a3, t4
// Load data from pix1 and pix2 (all 8 rows of each)
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
add.d a0, a0, t2
LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7
LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11
add.d a2, a2, t5
LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15
// Pack two 8-byte rows per vector, then widen u8 -> u16 across the xr lanes
vilvl.d vr0, vr4, vr0
vilvl.d vr1, vr5, vr1
vilvl.d vr2, vr6, vr2
vilvl.d vr3, vr7, vr3
vilvl.d vr8, vr12, vr8
vilvl.d vr9, vr13, vr9
vilvl.d vr10, vr14, vr10
vilvl.d vr11, vr15, vr11
vext2xv.hu.bu xr0, xr0
vext2xv.hu.bu xr1, xr1
vext2xv.hu.bu xr2, xr2
vext2xv.hu.bu xr3, xr3
vext2xv.hu.bu xr8, xr8
vext2xv.hu.bu xr9, xr9
vext2xv.hu.bu xr10, xr10
vext2xv.hu.bu xr11, xr11
// Calculate the square of the difference
xvsub.h xr0, xr0, xr8
xvsub.h xr1, xr1, xr9
xvsub.h xr2, xr2, xr10
xvsub.h xr3, xr3, xr11
xvmul.h xr0, xr0, xr0
xvmul.h xr1, xr1, xr1
xvmul.h xr2, xr2, xr2
xvmul.h xr3, xr3, xr3
xvhaddw.wu.hu xr0, xr0, xr0
xvhaddw.wu.hu xr1, xr1, xr1
xvhaddw.wu.hu xr2, xr2, xr2
xvhaddw.wu.hu xr3, xr3, xr3
xvadd.w xr0, xr0, xr1
xvadd.w xr2, xr2, xr3
xvadd.w xr0, xr0, xr2
// Calculate the sum: horizontal reduce, then add the two 128-bit halves
xvhaddw.d.w xr0, xr0, xr0
xvhaddw.q.d xr0, xr0, xr0
xvpickve2gr.w t2, xr0, 0
xvpickve2gr.w t3, xr0, 4
add.d a0, t2, t3
endfunc_x264
/*
* int pixel_sa8d_16x16_lasx(const Pixel *pix1, intptr_t i_pix1,
* const Pixel *pix2, intptr_t i_pix2)
*/
function_x264 pixel_sa8d_16x16_lasx
// SA8D over a 16x16 block = sum of four 8x8 SA8D quadrants, each an 8x8
// Hadamard transform of pix1-pix2 differences followed by an abs-sum.
// In:  a0 = pix1, a1 = i_pix1, a2 = pix2, a3 = i_pix2
// Out: a0 = (sa8d_sum + 2) >> 2
// Quadrant accumulators: xr21 = top-left, xr22 = bottom-left,
// xr23 = bottom-right, xr24 = top-right (h lanes).
// xr24 is a callee-saved FP register pair's home (f24) -> save/restore it.
addi.d sp, sp, -8
fst.d f24, sp, 0
// t2/t4/t6/t0 = 2/3/4/8 * i_pix1; t3/t5/t7/t1 = 2/3/4/8 * i_pix2
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
slli.d t6, a1, 2
slli.d t7, a3, 2
slli.d t0, a1, 3
slli.d t1, a3, 3
// ---- Top-left 8x8, rows 0-3 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
// xr15/xr16 hold the rows 0-3 partial transform
xvadd.h xr15, xr11, xr13
xvsub.h xr16, xr11, xr13
add.d a0, a0, t6
add.d a2, a2, t7
// ---- Top-left 8x8, rows 4-7 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr9, xr11, xr13
xvsub.h xr10, xr11, xr13
// Final vertical butterfly and abs-sum -> xr21 = top-left quadrant
xvadd.h xr17, xr15, xr9
xvadd.h xr18, xr16, xr10
xvsub.h xr19, xr15, xr9
xvsub.h xr20, xr16, xr10
xvadda.h xr17, xr17, xr18
xvadda.h xr19, xr19, xr20
xvadd.h xr21, xr17, xr19
add.d a0, a0, t6
add.d a2, a2, t7
// ---- Bottom-left 8x8, rows 8-11 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr15, xr11, xr13
xvsub.h xr16, xr11, xr13
add.d a0, a0, t6
add.d a2, a2, t7
// ---- Bottom-left 8x8, rows 12-15 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr9, xr11, xr13
xvsub.h xr10, xr11, xr13
// Final vertical butterfly and abs-sum -> xr22 = bottom-left quadrant
xvadd.h xr17, xr15, xr9
xvadd.h xr18, xr16, xr10
xvsub.h xr19, xr15, xr9
xvsub.h xr20, xr16, xr10
xvadda.h xr17, xr17, xr18
xvadda.h xr19, xr19, xr20
xvadd.h xr22, xr17, xr19
// Rewind 4 rows and step 8 columns right: a0/a2 -> row 8, column 8
sub.d a0, a0, t6
sub.d a2, a2, t7
addi.d a0, a0, 8
addi.d a2, a2, 8
// ---- Bottom-right 8x8, rows 8-11 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr15, xr11, xr13
xvsub.h xr16, xr11, xr13
add.d a0, a0, t6
add.d a2, a2, t7
// ---- Bottom-right 8x8, rows 12-15 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr9, xr11, xr13
xvsub.h xr10, xr11, xr13
// Final vertical butterfly and abs-sum -> xr23 = bottom-right quadrant
xvadd.h xr17, xr15, xr9
xvadd.h xr18, xr16, xr10
xvsub.h xr19, xr15, xr9
xvsub.h xr20, xr16, xr10
xvadda.h xr17, xr17, xr18
xvadda.h xr19, xr19, xr20
xvadd.h xr23, xr17, xr19
// Rewind 12 rows (8 + 4): a0/a2 -> row 0, column 8
sub.d a0, a0, t0
sub.d a2, a2, t1
sub.d a0, a0, t6
sub.d a2, a2, t7
// ---- Top-right 8x8, rows 0-3 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr15, xr11, xr13
xvsub.h xr16, xr11, xr13
add.d a0, a0, t6
add.d a2, a2, t7
// ---- Top-right 8x8, rows 4-7 ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr12, 0x13
xvadd.h xr9, xr11, xr13
xvsub.h xr10, xr11, xr13
// Final vertical butterfly and abs-sum -> xr24 = top-right quadrant
xvadd.h xr17, xr15, xr9
xvadd.h xr18, xr16, xr10
xvsub.h xr19, xr15, xr9
xvsub.h xr20, xr16, xr10
xvadda.h xr17, xr17, xr18
xvadda.h xr19, xr19, xr20
xvadd.h xr24, xr17, xr19
// Combine the four quadrants and reduce to a scalar
xvadd.h xr21, xr21, xr22
xvadd.h xr23, xr23, xr24
xvhaddw.wu.hu xr21, xr21, xr21
xvhaddw.wu.hu xr23, xr23, xr23
xvadd.w xr21, xr21, xr23
xvhaddw.du.wu xr21, xr21, xr21
xvhaddw.qu.du xr21, xr21, xr21
xvpickve2gr.du t4, xr21, 0
xvpickve2gr.du t5, xr21, 2
add.d t4, t4, t5
// Rounded normalization: (sum + 2) >> 2
addi.d t4, t4, 2
srli.d a0, t4, 2
fld.d f24, sp, 0
addi.d sp, sp, 8
endfunc_x264
/*
* int pixel_sa8d_8x8_lasx(const Pixel *pix1, intptr_t i_pix1,
* const Pixel *pix2, intptr_t i_pix2)
*/
function_x264 pixel_sa8d_8x8_lasx
// SA8D over one 8x8 block: 8x8 Hadamard transform of pix1-pix2 differences,
// followed by an absolute-value sum and rounded >> 2.
// In:  a0 = pix1, a1 = i_pix1, a2 = pix2, a3 = i_pix2
// Out: a0 = (sa8d_sum + 2) >> 2
// t2/t4/t6 = 2/3/4 * i_pix1; t3/t5/t7 = 2/3/4 * i_pix2
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
slli.d t6, a1, 2
slli.d t7, a3, 2
// ---- Rows 0-3: load, diff, horizontal Hadamard stages ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvor.v xr14, xr12, xr12
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr14, 0x13
// xr15/xr16 hold the rows 0-3 partial transform
xvadd.h xr15, xr11, xr13
xvsub.h xr16, xr11, xr13
add.d a0, a0, t6
add.d a2, a2, t7
// ---- Rows 4-7: same pipeline ----
// Load data from pix1 and pix2
FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr1, xr3, 0x02
xvpermi.q xr5, xr7, 0x02
xvsubwev.h.bu xr9, xr1, xr5
xvsubwod.h.bu xr10, xr1, xr5
xvadd.h xr11, xr9, xr10 /* a0 + a1 */
xvsub.h xr12, xr9, xr10 /* a0 - a1 */
xvpackev.h xr9, xr12, xr11
xvpackod.h xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvpackev.w xr9, xr12, xr11
xvpackod.w xr10, xr12, xr11
xvadd.h xr11, xr9, xr10
xvsub.h xr12, xr9, xr10
xvpackev.d xr9, xr12, xr11
xvpackod.d xr10, xr12, xr11
xvadd.h xr11, xr9, xr10 /* HADAMARD4 */
xvsub.h xr12, xr9, xr10
xvor.v xr13, xr11, xr11
xvor.v xr14, xr12, xr12
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr13, xr14, 0x13
xvadd.h xr9, xr11, xr13
xvsub.h xr10, xr11, xr13
// Final vertical butterfly combining the two row groups, then abs-sum
xvadd.h xr17, xr15, xr9
xvadd.h xr18, xr16, xr10
xvsub.h xr19, xr15, xr9
xvsub.h xr20, xr16, xr10
xvadda.h xr17, xr17, xr18
xvadda.h xr19, xr19, xr20
xvadd.h xr17, xr17, xr19
// Reduce to a scalar, add the two 128-bit halves
xvhaddw.wu.hu xr17, xr17, xr17
xvhaddw.du.wu xr17, xr17, xr17
xvhaddw.qu.du xr17, xr17, xr17
xvpickve2gr.wu t4, xr17, 0
xvpickve2gr.wu t5, xr17, 4
add.d t4, t4, t5
// Rounded normalization: (sum + 2) >> 2
addi.d t4, t4, 2
srli.d a0, t4, 2
endfunc_x264
.macro sse_diff_8width_lasx in0, in1
// Process one 8x4 tile of the fenc/fdec planes:
//   xr8 += sum of squared differences (w lanes, via dot-product accumulate)
//   xr9 += sum of signed differences  (h lanes)
// Both accumulators must be zeroed by the caller before the first use.
// \in0 walks with FENC_STRIDE, \in1 with FDEC_STRIDE (compile-time strides).
fld.d f0, \in0, 0
fld.d f1, \in0, FENC_STRIDE
fld.d f2, \in0, FENC_STRIDE * 2
fld.d f3, \in0, FENC_STRIDE * 3
fld.d f4, \in1, 0
fld.d f5, \in1, FDEC_STRIDE
fld.d f6, \in1, FDEC_STRIDE * 2
fld.d f7, \in1, FDEC_STRIDE * 3
// Pack the four 8-byte rows of each plane into two vectors
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
xvpermi.q xr1, xr0, 0x20
xvpermi.q xr5, xr4, 0x20
// Interleave pix2/pix1 bytes so the widening hsub produces pix1 - pix2
xvilvl.b xr2, xr5, xr1
xvilvh.b xr6, xr5, xr1
xvhsubw.hu.bu xr3, xr2, xr2
xvhsubw.hu.bu xr4, xr6, xr6
// Accumulate squares (dp2add squares each diff) and raw diffs
xvdp2add.w.h xr8, xr3, xr3
xvdp2add.w.h xr8, xr4, xr4
xvadd.h xr9, xr9, xr3
xvadd.h xr9, xr9, xr4
.endm
/*
* int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
* int32_t ssd[2] )
*/
function_x264 pixel_var2_8x16_lasx
// Variance of two adjacent 8x16 sub-blocks (fenc vs fdec layout).
// In:  a0 = p_pix1 (FENC_STRIDE), a1 = p_pix2 (FDEC_STRIDE), a2 = ssd[2]
// Out: a0 = var(left half) + var(right half); ssd[i] stored per half.
// var = ssd - (sum^2 >> 7), with 128 pixels per half.
// t0/t1 save the original base pointers for the second (right) half.
add.d t0, a0, zero
add.d t1, a1, zero
// Zero the ssd (xr8) and diff-sum (xr9) accumulators
xvxor.v xr8, xr8, xr8
xvxor.v xr9, xr9, xr9
// Left half: 4 tiles of 8x4 = 8x16
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
// t2 = sum of differences (signed reduce of xr9)
xvhaddw.w.h xr9, xr9, xr9
xvhaddw.d.w xr9, xr9, xr9
xvhaddw.q.d xr9, xr9, xr9
xvpickve2gr.wu t2, xr9, 0
xvpickve2gr.wu t3, xr9, 4
add.w t2, t2, t3
// t3 = ssd (reduce of xr8), stored to ssd[0]
xvhaddw.d.w xr8, xr8, xr8
xvhaddw.q.d xr8, xr8, xr8
xvpickve2gr.wu t3, xr8, 0
xvpickve2gr.wu t4, xr8, 4
add.w t3, t4, t3
st.w t3, a2, 0
// var(left) = ssd - sum*sum/128
mul.w t2, t2, t2
srai.w t2, t2, 7
sub.w t3, t3, t2
// Right half: restart from saved bases, 8 bytes (half a stride) to the right
xvxor.v xr8, xr8, xr8
xvxor.v xr9, xr9, xr9
addi.d a0, t0, FENC_STRIDE / 2
addi.d a1, t1, FDEC_STRIDE / 2
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
// t4 = sum of differences for the right half
xvhaddw.w.h xr9, xr9, xr9
xvhaddw.d.w xr9, xr9, xr9
xvhaddw.q.d xr9, xr9, xr9
xvpickve2gr.wu t4, xr9, 0
xvpickve2gr.wu t5, xr9, 4
add.w t4, t4, t5
// t5 = ssd for the right half, stored to ssd[1]
xvhaddw.d.w xr8, xr8, xr8
xvhaddw.q.d xr8, xr8, xr8
xvpickve2gr.wu t5, xr8, 0
xvpickve2gr.wu t6, xr8, 4
add.w t5, t6, t5
st.w t5, a2, 4
// var(right) = ssd - sum*sum/128; return var(left) + var(right)
mul.w t4, t4, t4
srai.w t4, t4, 7
sub.w t5, t5, t4
add.w a0, t3, t5
endfunc_x264
/*
* int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
* int32_t ssd[2] )
*/
function_x264 pixel_var2_8x8_lasx
// Variance of two adjacent 8x8 sub-blocks (fenc vs fdec layout).
// In:  a0 = p_pix1 (FENC_STRIDE), a1 = p_pix2 (FDEC_STRIDE), a2 = ssd[2]
// Out: a0 = var(left half) + var(right half); ssd[i] stored per half.
// var = ssd - (sum^2 >> 6), with 64 pixels per half.
// t0/t1 save the original base pointers for the second (right) half.
add.d t0, a0, zero
add.d t1, a1, zero
// Zero the ssd (xr8) and diff-sum (xr9) accumulators
xvxor.v xr8, xr8, xr8
xvxor.v xr9, xr9, xr9
// Left half: 2 tiles of 8x4 = 8x8
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
// t2 = sum of differences (signed reduce of xr9)
xvhaddw.w.h xr9, xr9, xr9
xvhaddw.d.w xr9, xr9, xr9
xvhaddw.q.d xr9, xr9, xr9
xvpickve2gr.wu t2, xr9, 0
xvpickve2gr.wu t3, xr9, 4
add.w t2, t2, t3
// t3 = ssd (reduce of xr8), stored to ssd[0]
xvhaddw.d.w xr8, xr8, xr8
xvhaddw.q.d xr8, xr8, xr8
xvpickve2gr.wu t3, xr8, 0
xvpickve2gr.wu t4, xr8, 4
add.w t3, t4, t3
st.w t3, a2, 0
// var(left) = ssd - sum*sum/64
mul.w t2, t2, t2
srai.w t2, t2, 6
sub.w t3, t3, t2
// Right half: restart from saved bases, 8 bytes (half a stride) to the right
xvxor.v xr8, xr8, xr8
xvxor.v xr9, xr9, xr9
addi.d a0, t0, FENC_STRIDE / 2
addi.d a1, t1, FDEC_STRIDE / 2
sse_diff_8width_lasx a0, a1
addi.d a0, a0, FENC_STRIDE * 4
addi.d a1, a1, FDEC_STRIDE * 4
sse_diff_8width_lasx a0, a1
// t4 = sum of differences for the right half
xvhaddw.w.h xr9, xr9, xr9
xvhaddw.d.w xr9, xr9, xr9
xvhaddw.q.d xr9, xr9, xr9
xvpickve2gr.wu t4, xr9, 0
xvpickve2gr.wu t5, xr9, 4
add.w t4, t4, t5
// t5 = ssd for the right half, stored to ssd[1]
xvhaddw.d.w xr8, xr8, xr8
xvhaddw.q.d xr8, xr8, xr8
xvpickve2gr.wu t5, xr8, 0
xvpickve2gr.wu t6, xr8, 4
add.w t5, t6, t5
st.w t5, a2, 4
// var(right) = ssd - sum*sum/64; return var(left) + var(right)
mul.w t4, t4, t4
srai.w t4, t4, 6
sub.w t5, t5, t4
add.w a0, t3, t5
endfunc_x264
/*
* uint64_t x264_pixel_hadamard_ac_8x8( pixel *pix, intptr_t stride )
*/
function_x264 hadamard_ac_8x8_lsx
// Hadamard AC energy of an 8x8 pixel block (LSX, 128-bit vectors).
// In:  a0 = pix, a1 = stride
// Out: a0 = ((sum8 - dc) << 32) | (sum4 - dc)
//   sum4 = sum of |coeffs| of the four 4x4 Hadamard transforms
//   sum8 = sum of |coeffs| after the extra 8x8 combination stage
//   dc   = DC term, subtracted from both (AC-only energy)
slli.d t0, a1, 1
add.d t1, t0, a1
// Load 8 rows of 8 bytes
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
// ---- 4x4 Hadamard of rows 0-3 (even/odd byte split = first butterfly) ----
vpickev.b vr2, vr1, vr0
vpickod.b vr3, vr1, vr0
vaddwev.h.bu vr6, vr2, vr3
vaddwod.h.bu vr7, vr2, vr3
vsubwev.h.bu vr8, vr2, vr3
vsubwod.h.bu vr9, vr2, vr3
vadd.h vr10, vr6, vr7
vadd.h vr11, vr8, vr9
vsub.h vr12, vr6, vr7
vsub.h vr13, vr8, vr9
// Transpose 4x4 tiles, then the vertical butterfly stages
vilvl.h vr6, vr11, vr10
vilvh.h vr7, vr11, vr10
vilvl.h vr8, vr13, vr12
vilvh.h vr9, vr13, vr12
vilvl.w vr10, vr8, vr6
vilvh.w vr11, vr8, vr6
vilvl.w vr12, vr9, vr7
vilvh.w vr13, vr9, vr7
vadd.h vr6, vr10, vr11
vadd.h vr7, vr12, vr13
vsub.h vr8, vr10, vr11
vsub.h vr9, vr12, vr13
// vr10-vr13 = transformed rows 0-3
vadd.h vr10, vr6, vr7
vadd.h vr11, vr8, vr9
vsub.h vr12, vr6, vr7
vsub.h vr13, vr8, vr9
// ---- Same 4x4 Hadamard pipeline for rows 4-7 -> vr14-vr17 ----
vpickev.b vr2, vr5, vr4
vpickod.b vr3, vr5, vr4
vaddwev.h.bu vr6, vr2, vr3
vaddwod.h.bu vr7, vr2, vr3
vsubwev.h.bu vr8, vr2, vr3
vsubwod.h.bu vr9, vr2, vr3
vadd.h vr14, vr6, vr7
vadd.h vr15, vr8, vr9
vsub.h vr16, vr6, vr7
vsub.h vr17, vr8, vr9
vilvl.h vr6, vr15, vr14
vilvh.h vr7, vr15, vr14
vilvl.h vr8, vr17, vr16
vilvh.h vr9, vr17, vr16
vilvl.w vr14, vr8, vr6
vilvh.w vr15, vr8, vr6
vilvl.w vr16, vr9, vr7
vilvh.w vr17, vr9, vr7
vadd.h vr6, vr14, vr15
vadd.h vr7, vr16, vr17
vsub.h vr8, vr14, vr15
vsub.h vr9, vr16, vr17
vadd.h vr14, vr6, vr7
vadd.h vr15, vr8, vr9
vsub.h vr16, vr6, vr7
vsub.h vr17, vr8, vr9
// dc term: combine the two DC-carrying lanes
vadd.h vr18, vr10, vr14
vpickve2gr.hu t0, vr18, 0
vpickve2gr.hu t1, vr18, 4
add.d t1, t0, t1 // dc
// sum4: abs-sum of all 4x4 transform coefficients
vadda.h vr4, vr11, vr10
vadda.h vr5, vr13, vr12
vadda.h vr6, vr15, vr14
vadda.h vr7, vr17, vr16
vadd.h vr4, vr5, vr4
vadd.h vr6, vr7, vr6
vadd.h vr4, vr4, vr6
vhaddw.wu.hu vr4, vr4, vr4
vhaddw.du.wu vr4, vr4, vr4
vhaddw.qu.du vr4, vr4, vr4
vpickve2gr.wu t0, vr4, 0 // sum4
// ---- 8x8 stage: regroup coefficients, two more butterfly levels ----
vpackev.h vr0, vr11, vr10
vpackev.h vr1, vr13, vr12
vpackev.h vr2, vr15, vr14
vpackev.h vr3, vr17, vr16
vpackod.h vr4, vr11, vr10
vpackod.h vr5, vr13, vr12
vpackod.h vr6, vr15, vr14
vpackod.h vr7, vr17, vr16
vilvl.d vr10, vr1, vr0
vilvh.d vr11, vr1, vr0
vilvl.d vr12, vr3, vr2
vilvh.d vr13, vr3, vr2
vilvl.d vr14, vr5, vr4
vilvh.d vr15, vr5, vr4
vilvl.d vr16, vr7, vr6
vilvh.d vr17, vr7, vr6
vadd.h vr0, vr10, vr11
vadd.h vr1, vr12, vr13
vadd.h vr2, vr14, vr16
vadd.h vr3, vr15, vr17
vsub.h vr4, vr10, vr11
vsub.h vr5, vr12, vr13
vsub.h vr6, vr14, vr16
vsub.h vr7, vr15, vr17
vadd.h vr10, vr0, vr1
vadd.h vr11, vr2, vr3
vadd.h vr12, vr4, vr5
vadd.h vr13, vr6, vr7
vsub.h vr14, vr0, vr1
vsub.h vr15, vr2, vr3
vsub.h vr16, vr4, vr5
vsub.h vr17, vr6, vr7
// sum8: abs-sum of the 8x8-stage coefficients
vadda.h vr10, vr10, vr11
vadda.h vr11, vr12, vr13
vadda.h vr12, vr14, vr15
vadda.h vr13, vr16, vr17
vadd.h vr10, vr10, vr11
vadd.h vr11, vr12, vr13
vadd.h vr10, vr10, vr11
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.du.wu vr10, vr10, vr10
vhaddw.qu.du vr10, vr10, vr10
vpickve2gr.wu t2, vr10, 0 // sum8
// Subtract dc from both sums, pack: (sum8-dc) in high 32, (sum4-dc) in low 32
sub.d t0, t0, t1
sub.d t2, t2, t1
slli.d t2, t2, 32
add.d a0, t2, t0
endfunc_x264
/*
* int x264_pixel_satd_4x8( pixel *pix1, intptr_t i_pix1,
* pixel *pix2, intptr_t i_pix2 )
*/
function_x264 pixel_satd_4x8_lsx
// SATD of a 4x8 block: two 4x4 Hadamard tiles, abs-summed and halved.
// In:  a0 = pix1, a1 = i_pix1, a2 = pix2, a3 = i_pix2
// Out: a0 = satd
// t2/t4 = 2/3 * i_pix1; t3/t5 = 2/3 * i_pix2
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
// Load data from pix1 and pix2, rows 0-3
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
// pixel_satd_4x4_lsx_core (defined earlier in this file) leaves the
// 4x4 abs-coefficients in the named output register
pixel_satd_4x4_lsx_core vr13
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 4-7
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr14
// Sum both tiles, reduce to scalar, halve (Hadamard gain compensation)
vadd.h vr13, vr14, vr13
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu t5, vr13, 0
srli.d a0, t5, 1
endfunc_x264
/*
* int x264_pixel_satd_4x16( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_4x16_lsx
// SATD of a 4x16 block: four 4x4 Hadamard tiles, abs-summed and halved.
// In:  a0 = p_pix1, a1 = i_stride, a2 = p_pix2, a3 = i_stride2
// Out: a0 = satd
// t2/t4 = 2/3 * i_stride; t3/t5 = 2/3 * i_stride2
slli.d t2, a1, 1
slli.d t3, a3, 1
add.d t4, a1, t2
add.d t5, a3, t3
// Load data from pix1 and pix2, rows 0-3
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr13
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 4-7
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr14
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 8-11
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr15
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 12-15
FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4
FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8
pixel_satd_4x4_lsx_core vr16
// Sum all four tiles, reduce to scalar, halve
vadd.h vr13, vr14, vr13
vadd.h vr15, vr16, vr15
vadd.h vr13, vr15, vr13
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu t5, vr13, 0
srli.d a0, t5, 1
endfunc_x264
.macro pixel_satd_8x4_lsx_core out0, out1, out2, out3
// Hadamard transform of an 8x4 difference block.
// In:  vr0-vr3 = 4 rows of pix1 (8 bytes each, in f0-f3),
//      vr4-vr7 = 4 rows of pix2 (clobbers vr0-vr11).
// Out: \out0-\out3 hold the transform coefficients; the caller abs-sums
//      them (vadda) and shifts right by 1 to obtain the SATD.
// Pack rows pairwise
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6
// Widening differences split into even/odd byte lanes (first butterfly)
vsubwev.h.bu vr4, vr0, vr2
vsubwod.h.bu vr5, vr0, vr2
vsubwev.h.bu vr6, vr1, vr3
vsubwod.h.bu vr7, vr1, vr3
vadd.h vr0, vr4, vr5
vsub.h vr1, vr4, vr5
vadd.h vr2, vr6, vr7
vsub.h vr3, vr6, vr7
// Second butterfly over h-lane pairs
vpackev.h vr4, vr1, vr0
vpackod.h vr5, vr1, vr0
vpackev.h vr6, vr3, vr2
vpackod.h vr7, vr3, vr2
vadd.h vr8, vr4, vr5
vsub.h vr9, vr4, vr5
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
// Third butterfly over 64-bit halves (cross-row combination)
vilvl.d vr4, vr9, vr8
vilvh.d vr5, vr9, vr8
vilvl.d vr6, vr11, vr10
vilvh.d vr7, vr11, vr10
vadd.h vr8, vr4, vr5
vsub.h vr9, vr4, vr5
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
// Final butterfly: outputs to the caller's registers
vadd.h \out0, vr8, vr10
vsub.h \out1, vr8, vr10
vadd.h \out2, vr9, vr11
vsub.h \out3, vr9, vr11
.endm
/*
* int x264_pixel_satd_8x4( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_8x4_lsx
// SATD of an 8x4 block.
// In:  a0 = p_pix1, a1 = i_stride, a2 = p_pix2, a3 = i_stride2
// Out: a0 = satd
// t0/t1 = 2/3 * i_stride; t2/t3 = 2/3 * i_stride2
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
// Abs-sum the four coefficient vectors, reduce, halve
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264
/*
* int x264_pixel_satd_8x8( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_8x8_lsx
// SATD of an 8x8 block: two 8x4 Hadamard passes.
// In:  a0 = p_pix1, a1 = i_stride, a2 = p_pix2, a3 = i_stride2
// Out: a0 = satd
// t0/t1 = 2/3 * i_stride; t2/t3 = 2/3 * i_stride2
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
// Rows 0-3 -> abs-sum accumulated in vr12
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12
// Advance 4 rows
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 4-7 -> abs-sum in vr13, merged into vr12
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13
vadd.h vr12, vr13, vr12
// Reduce to scalar, halve
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264
/*
 * int x264_pixel_satd_8x16( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_8x16_lsx
// SATD of an 8x16 block: four 8x4 Hadamard passes.
// In:  a0 = p_pix1, a1 = i_stride, a2 = p_pix2, a3 = i_stride2
// Out: a0 = satd
// t0/t1 = 2/3 * i_stride; t2/t3 = 2/3 * i_stride2
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
// Rows 0-3 -> vr12
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 4-7 -> vr13
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 8-11 -> vr14
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 12-15 -> vr15
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15
// Merge all four passes, reduce to scalar, halve
vadd.h vr12, vr12, vr13
vadd.h vr14, vr14, vr15
vadd.h vr12, vr12, vr14
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264
/*
* int x264_pixel_satd_16x8( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_16x8_lsx
// SATD of a 16x8 block: four 8x4 Hadamard passes (left/right column halves).
// In:  a0 = p_pix1, a1 = i_stride, a2 = p_pix2, a3 = i_stride2
// Out: a0 = satd
// t0/t1 = 2/3 * i_stride; t2/t3 = 2/3 * i_stride2
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
// Rows 0-3, columns 0-7 -> vr12
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12
// Rows 0-3, columns 8-15 (t5/t6 = pointers shifted 8 bytes) -> vr13
addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13
// Advance 4 rows
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 4-7, columns 0-7 -> vr14
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14
// Rows 4-7, columns 8-15 -> vr15
addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15
// Merge all four passes, reduce to scalar, halve
vadd.h vr12, vr13, vr12
vadd.h vr14, vr15, vr14
vadd.h vr12, vr14, vr12
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264
/*
* int x264_pixel_satd_16x16( uint8_t *p_pix1, intptr_t i_stride,
* uint8_t *p_pix2, intptr_t i_stride2 )
*/
function_x264 pixel_satd_16x16_lsx
// SATD of a 16x16 block: eight 8x4 Hadamard passes
// (top 16x8 half accumulated in vr19, then the bottom 16x8 half).
// In:  a0 = p_pix1, a1 = i_stride, a2 = p_pix2, a3 = i_stride2
// Out: a0 = satd
// t0/t1 = 2/3 * i_stride; t2/t3 = 2/3 * i_stride2
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
// Rows 0-3, columns 0-7 -> vr12
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12
// Rows 0-3, columns 8-15 (t5/t6 = pointers shifted 8 bytes) -> vr13
addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 4-7, columns 0-7 -> vr14
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14
// Rows 4-7, columns 8-15 -> vr15
addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15
// vr19 = abs-sum of the whole top 16x8 half
vadd.h vr12, vr13, vr12
vadd.h vr14, vr15, vr14
vadd.h vr19, vr14, vr12
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 8-11, columns 0-7 -> vr12
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15
vadda.h vr12, vr13, vr12
vadda.h vr13, vr15, vr14
vadd.h vr12, vr13, vr12
// Rows 8-11, columns 8-15 -> vr13
addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16
vadda.h vr13, vr14, vr13
vadda.h vr14, vr16, vr15
vadd.h vr13, vr14, vr13
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 12-15, columns 0-7 -> vr14
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17
vadda.h vr14, vr15, vr14
vadda.h vr15, vr17, vr16
vadd.h vr14, vr15, vr14
// Rows 12-15, columns 8-15 -> vr15
addi.d t5, a0, 8
addi.d t6, a2, 8
FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7
pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18
vadda.h vr15, vr16, vr15
vadda.h vr16, vr18, vr17
vadd.h vr15, vr16, vr15
// Merge the bottom half with the saved top half, reduce, halve
vadd.h vr12, vr13, vr12
vadd.h vr14, vr15, vr14
vadd.h vr12, vr14, vr12
vadd.h vr12, vr19, vr12
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.du.wu vr12, vr12, vr12
vhaddw.qu.du vr12, vr12, vr12
vpickve2gr.wu t4, vr12, 0
srli.d a0, t4, 1
endfunc_x264
/*
* int x264_pixel_ssd_4x4( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_4x4_lsx
// SSD over a 4x4 block, u8 pixels.
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences
// t0/t1 = 2/3 * stride1; t2/t3 = 2/3 * stride2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// Load 4 rows of 4 bytes from each block
FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
// Pack each block's 16 bytes into one vector
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
// Widening differences (even/odd byte lanes), square, reduce
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr5, vr5, vr6
vhaddw.d.w vr5, vr5, vr5
vhaddw.q.d vr5, vr5, vr5
vpickve2gr.w a0, vr5, 0
endfunc_x264
/*
* int x264_pixel_ssd_4x8( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_4x8_lsx
// SSD over a 4x8 block, u8 pixels: two 4x4 passes.
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences
// t0/t1 = 2/3 * stride1; t2/t3 = 2/3 * stride2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// Rows 0-3: partial SSD accumulated in vr10
FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr10, vr5, vr6
// Advance 4 rows
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
// Rows 4-7: same pipeline, merged with vr10
FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr5, vr5, vr6
vadd.w vr5, vr5, vr10
// Reduce to scalar
vhaddw.d.w vr5, vr5, vr5
vhaddw.q.d vr5, vr5, vr5
vpickve2gr.w a0, vr5, 0
endfunc_x264
/*
* int x264_pixel_ssd_4x16( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_4x16_lsx
// SSD over a 4x16 block, u8 pixels: one 4x4 pass plus three repeated passes.
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences
// t0/t1 = 2/3 * stride1; t2/t3 = 2/3 * stride2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// Rows 0-3: partial SSD accumulated in vr10
FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr10, vr5, vr6
// Rows 4-7, 8-11, 12-15: same 4x4 pipeline, accumulated into vr10
.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr4, vr5, vr4
vilvl.w vr5, vr7, vr6
vilvl.d vr0, vr1, vr0
vilvl.d vr4, vr5, vr4
vsubwev.h.bu vr1, vr0, vr4
vsubwod.h.bu vr2, vr0, vr4
vmul.h vr5, vr1, vr1
vmul.h vr6, vr2, vr2
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vadd.w vr5, vr5, vr6
vadd.w vr10, vr5, vr10
.endr
// Reduce to scalar
vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0
endfunc_x264
/*
* int x264_pixel_ssd_8x4( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_8x4_lsx
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences over the 8x4 block
// t0/t1 = 2*/3* stride of pix1; t2/t3 = 2*/3* stride of pix2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// Load 4 rows (8 bytes each) of both blocks
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
// Pack rows pairwise: vr0/vr1 = pix1 rows 0-1/2-3, vr4/vr5 = pix2
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
// Even/odd unsigned byte differences, widened to 16 bits
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
// Square the differences
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
// Pairwise widen-add the squares to 32-bit lanes
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr2, vr2, vr6
// Horizontal reduction of the four 32-bit lanes to a scalar
vhaddw.d.w vr2, vr2, vr2
vhaddw.q.d vr2, vr2, vr2
vpickve2gr.w a0, vr2, 0 // return SSD
endfunc_x264
/*
* int x264_pixel_ssd_8x8( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_8x8_lsx
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences over the 8x8 block
// t0/t1 = 2*/3* stride of pix1; t2/t3 = 2*/3* stride of pix2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// Rows 0-3: load, pack pairwise, diff, square, widen-add
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr10, vr2, vr6 // vr10 = partial SSD of rows 0-3
// Advance both pointers by 4 rows and process rows 4-7
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr11, vr2, vr6
vadd.w vr10, vr10, vr11 // accumulate both 4-row halves
// Horizontal reduction of the four 32-bit lanes to a scalar
vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0 // return SSD
endfunc_x264
/*
* int x264_pixel_ssd_8x16( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_8x16_lsx
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences over the 8x16 block
// t0/t1 = 2*/3* stride of pix1; t2/t3 = 2*/3* stride of pix2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// First group of 4 rows: load, pack, diff, square, widen-add
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr10, vr2, vr6 // vr10 = running 32-bit SSD accumulator
// Remaining three groups of 4 rows, accumulated into vr10
.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vmul.h vr2, vr2, vr2
vmul.h vr3, vr3, vr3
vmul.h vr6, vr6, vr6
vmul.h vr7, vr7, vr7
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr11, vr2, vr6
vadd.w vr10, vr10, vr11
.endr
// Horizontal reduction of the four 32-bit lanes to a scalar
vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0 // return SSD
endfunc_x264
/*
* int x264_pixel_ssd_16x8( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_16x8_lsx
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences over the 16x8 block
// t0/t1 = 2*/3* stride of pix1; t2/t3 = 2*/3* stride of pix2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// Rows 0-3: one full 16-byte vector per row
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
// Even/odd unsigned byte differences per row, widened to 16 bits
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
// Square the differences
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
// Pairwise widen-add the squares to 32-bit lanes
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
// Sum the eight partial vectors
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr16, vr8, vr9 // vr16 = partial SSD of rows 0-3
// Advance both pointers by 4 rows and process rows 4-7
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr17, vr8, vr9
vadd.w vr10, vr16, vr17 // accumulate both 4-row halves
// Horizontal reduction of the four 32-bit lanes to a scalar
vhaddw.d.w vr10, vr10, vr10
vhaddw.q.d vr10, vr10, vr10
vpickve2gr.w a0, vr10, 0 // return SSD
endfunc_x264
/*
* int x264_pixel_ssd_16x16( pixel *pix1, intptr_t i_stride_pix1,
* pixel *pix2, intptr_t i_stride_pix2 )
*/
function_x264 pixel_ssd_16x16_lsx
// In:  a0 = pix1, a1 = i_stride_pix1, a2 = pix2, a3 = i_stride_pix2
// Out: a0 = sum of squared differences over the 16x16 block
// t0/t1 = 2*/3* stride of pix1; t2/t3 = 2*/3* stride of pix2
slli.d t0, a1, 1
add.d t1, a1, t0
slli.d t2, a3, 1
add.d t3, a3, t2
// First group of 4 rows: diff, square, widen-add per row
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr16, vr8, vr9 // vr16 = running 32-bit SSD accumulator
// Remaining three groups of 4 rows, accumulated into vr16
.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7
vsubwev.h.bu vr8, vr0, vr4
vsubwod.h.bu vr9, vr0, vr4
vsubwev.h.bu vr10, vr1, vr5
vsubwod.h.bu vr11, vr1, vr5
vsubwev.h.bu vr12, vr2, vr6
vsubwod.h.bu vr13, vr2, vr6
vsubwev.h.bu vr14, vr3, vr7
vsubwod.h.bu vr15, vr3, vr7
vmul.h vr8, vr8, vr8
vmul.h vr9, vr9, vr9
vmul.h vr10, vr10, vr10
vmul.h vr11, vr11, vr11
vmul.h vr12, vr12, vr12
vmul.h vr13, vr13, vr13
vmul.h vr14, vr14, vr14
vmul.h vr15, vr15, vr15
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.wu.hu vr14, vr14, vr14
vhaddw.wu.hu vr15, vr15, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr10, vr12, vr13
vadd.w vr11, vr14, vr15
vadd.w vr8, vr8, vr9
vadd.w vr9, vr10, vr11
vadd.w vr17, vr8, vr9
vadd.w vr16, vr16, vr17
.endr
// Horizontal reduction of the four 32-bit lanes to a scalar
vhaddw.d.w vr16, vr16, vr16
vhaddw.q.d vr16, vr16, vr16
vpickve2gr.w a0, vr16, 0 // return SSD
endfunc_x264
/*
* int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
*/
// Core of the 8x8 SA8D computation: builds the 8x8 difference block
// pix1 - pix2, applies a 2D 8-point Hadamard-style transform
// (butterfly stages interleaved with pack/interleave transposes),
// and emits four vectors of 16-bit absolute-value sums.
// In:   a0/a2 = pix1/pix2, a1/a3 = strides,
//       t0/t1 = 2*/3* stride of pix1, t2/t3 = 2*/3* stride of pix2
// Out:  \out0-\out3 = per-lane sums of |coefficients| (vadda results)
// Clobbers: vr0-vr15, t4, t5
.macro pixel_sa8d_8x8_lsx_core out0, out1, out2, out3
// Rows 0-3: load and compute widened byte differences
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
// Horizontal transform, stage 1 (even/odd butterflies)
vadd.h vr8, vr2, vr3
vsub.h vr9, vr2, vr3
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
// Re-interleave for stage 2
vpackev.h vr0, vr9, vr8
vpackod.h vr1, vr9, vr8
vpackev.h vr2, vr11, vr10
vpackod.h vr3, vr11, vr10
vadd.h vr4, vr0, vr1
vsub.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vsub.h vr7, vr2, vr3
// Stage 3 across 64-bit halves
vilvl.d vr0, vr5, vr4
vilvh.d vr1, vr5, vr4
vilvl.d vr2, vr7, vr6
vilvh.d vr3, vr7, vr6
vadd.h vr12, vr0, vr1
vsub.h vr13, vr0, vr1
vadd.h vr14, vr2, vr3
vsub.h vr15, vr2, vr3
// Rows 4-7: same horizontal transform (t4/t5 = pix +4 rows)
alsl.d t4, a1, a0, 2
alsl.d t5, a3, a2, 2
FLDD_LOADX_4 t4, a1, t0, t1, f0, f1, f2, f3
FLDD_LOADX_4 t5, a3, t2, t3, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vsubwev.h.bu vr2, vr0, vr4
vsubwod.h.bu vr3, vr0, vr4
vsubwev.h.bu vr6, vr1, vr5
vsubwod.h.bu vr7, vr1, vr5
vadd.h vr8, vr2, vr3
vsub.h vr9, vr2, vr3
vadd.h vr10, vr6, vr7
vsub.h vr11, vr6, vr7
vpackev.h vr0, vr9, vr8
vpackod.h vr1, vr9, vr8
vpackev.h vr2, vr11, vr10
vpackod.h vr3, vr11, vr10
vadd.h vr4, vr0, vr1
vsub.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vsub.h vr7, vr2, vr3
vilvl.d vr0, vr5, vr4
vilvh.d vr1, vr5, vr4
vilvl.d vr2, vr7, vr6
vilvh.d vr3, vr7, vr6
vadd.h vr4, vr0, vr1
vsub.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vsub.h vr7, vr2, vr3
// Vertical transform on rows 0-3 results
// vr12 vr13 vr14 vr15
vpickev.w vr0, vr13, vr12
vpickod.w vr1, vr13, vr12
vpickev.w vr2, vr15, vr14
vpickod.w vr3, vr15, vr14
vadd.h vr8, vr0, vr1
vsub.h vr9, vr0, vr1
vadd.h vr10, vr2, vr3
vsub.h vr11, vr2, vr3
vadd.h vr12, vr8, vr10
vadd.h vr13, vr9, vr11
vsub.h vr14, vr8, vr10
vsub.h vr15, vr9, vr11
// Vertical transform on rows 4-7 results
// vr4 vr5 vr6 vr7
vpickev.w vr0, vr5, vr4
vpickod.w vr1, vr5, vr4
vpickev.w vr2, vr7, vr6
vpickod.w vr3, vr7, vr6
vadd.h vr8, vr0, vr1
vsub.h vr9, vr0, vr1
vadd.h vr10, vr2, vr3
vsub.h vr11, vr2, vr3
vadd.h vr4, vr8, vr10
vadd.h vr5, vr9, vr11
vsub.h vr6, vr8, vr10
vsub.h vr7, vr9, vr11
// Final butterfly joining the two row groups
vadd.h vr0, vr12, vr4
vadd.h vr1, vr13, vr5
vadd.h vr2, vr14, vr6
vadd.h vr3, vr15, vr7
vsub.h vr8, vr12, vr4
vsub.h vr9, vr13, vr5
vsub.h vr10, vr14, vr6
vsub.h vr11, vr15, vr7
// vadda: out = |a| + |b| -- accumulate absolute coefficient values
vadda.h \out0, vr0, vr8
vadda.h \out1, vr1, vr9
vadda.h \out2, vr2, vr10
vadda.h \out3, vr3, vr11
.endm
function_x264 pixel_sa8d_8x8_lsx
// In:  a0 = pix1, a1 = i_pix1, a2 = pix2, a3 = i_pix2
// Out: a0 = (sum of |Hadamard coefficients| + 2) >> 2
// t0/t1 = 2*/3* stride of pix1; t2/t3 = 2*/3* stride of pix2
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
// Sum the four 16-bit result vectors
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr17, vr0, vr1
// Widening horizontal reduction to a scalar
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.du.wu vr17, vr17, vr17
vhaddw.qu.du vr17, vr17, vr17
vpickve2gr.wu t5, vr17, 0
// Normalize: (sum + 2) >> 2
addi.d t5, t5, 2
srli.d a0, t5, 2
endfunc_x264
/*
* int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1,
* pixel *pix2, intptr_t i_pix2 )
*/
function_x264 pixel_sa8d_16x16_lsx
// In:  a0 = pix1, a1 = i_pix1, a2 = pix2, a3 = i_pix2
// Out: a0 = (sum of the four 8x8 quadrants' SA8D sums + 2) >> 2
// t0/t1 = 2*/3* stride of pix1; t2/t3 = 2*/3* stride of pix2
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a3, 1
add.d t3, t2, a3
// Save base pointers (the core macro clobbers t4/t5, reads a0/a2)
add.d t6, a0, zero
add.d t7, a2, zero
// Top-left 8x8 -> vr16
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr16, vr0, vr1
// Top-right 8x8 (base + 8 columns) -> vr17
addi.d a0, t6, 8
addi.d a2, t7, 8
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr17, vr0, vr1
// Bottom-left 8x8 (base + 8 rows) -> vr18
alsl.d a0, a1, t6, 3
alsl.d a2, a3, t7, 3
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr18, vr0, vr1
// Bottom-right 8x8 -> vr19
addi.d a0, a0, 8
addi.d a2, a2, 8
pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3
vadd.h vr0, vr0, vr1
vadd.h vr1, vr2, vr3
vadd.h vr19, vr0, vr1
// Widen each quadrant sum to 32 bits, then combine and reduce
vhaddw.wu.hu vr16, vr16, vr16
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.wu.hu vr18, vr18, vr18
vhaddw.wu.hu vr19, vr19, vr19
vadd.w vr16, vr17, vr16
vadd.w vr18, vr19, vr18
vadd.w vr17, vr18, vr16
vhaddw.du.wu vr17, vr17, vr17
vhaddw.qu.du vr17, vr17, vr17
vpickve2gr.wu t5, vr17, 0
// Normalize: (sum + 2) >> 2
addi.d t5, t5, 2
srli.d a0, t5, 2
endfunc_x264
/*
* uint64_t pixel_var_8x8( pixel *pix, intptr_t i_stride )
*/
function_x264 pixel_var_8x8_lsx
// In:  a0 = pix, a1 = i_stride
// Out: a0 = (sum_of_squares << 32) | sum, over the 8x8 block
// t0/t1 = 2*/3* stride
slli.d t0, a1, 1
add.d t1, a1, t0
// Load all 8 rows (8 bytes each)
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7
// Pack rows pairwise into vr0/vr1/vr4/vr5
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
// sum: pairwise widen-add bytes to 16 bits, then accumulate
vhaddw.hu.bu vr2, vr0, vr0
vhaddw.hu.bu vr3, vr1, vr1
vhaddw.hu.bu vr6, vr4, vr4
vhaddw.hu.bu vr7, vr5, vr5
vadd.h vr2, vr2, vr3
vadd.h vr6, vr6, vr7
vadd.h vr2, vr2, vr6
// Widening horizontal reduction of the sum to a scalar
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.wu t5, vr2, 0 // sum
// sqr: square every byte (even/odd widening multiplies)
vmulwev.h.bu vr2, vr0, vr0
vmulwod.h.bu vr3, vr0, vr0
vmulwev.h.bu vr6, vr1, vr1
vmulwod.h.bu vr7, vr1, vr1
vmulwev.h.bu vr8, vr4, vr4
vmulwod.h.bu vr9, vr4, vr4
vmulwev.h.bu vr10, vr5, vr5
vmulwod.h.bu vr11, vr5, vr5
// Widen the squares to 32 bits and accumulate
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr8, vr8, vr9
vadd.w vr10, vr10, vr11
vadd.w vr2, vr2, vr6
vadd.w vr8, vr8, vr10
vadd.w vr2, vr2, vr8
// Horizontal reduction of the squared sum to a scalar
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.du t6, vr2, 0 // sqr
// Pack the uint64 return value: high 32 = sqr, low 32 = sum
slli.d t4, t6, 32
add.d a0, t4, t5
endfunc_x264
/*
* uint64_t pixel_var_8x16( pixel *pix, intptr_t i_stride )
*/
function_x264 pixel_var_8x16_lsx
// In:  a0 = pix, a1 = i_stride
// Out: a0 = (sum_of_squares << 32) | sum, over the 8x16 block
// t0/t1 = 2*/3* stride
slli.d t0, a1, 1
add.d t1, a1, t0
// Rows 0-7: load and pack pairwise
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
// Partial pixel sum of rows 0-7 -> vr16 (16-bit lanes)
vhaddw.hu.bu vr2, vr0, vr0
vhaddw.hu.bu vr3, vr1, vr1
vhaddw.hu.bu vr6, vr4, vr4
vhaddw.hu.bu vr7, vr5, vr5
vadd.h vr2, vr2, vr3
vadd.h vr6, vr6, vr7
vadd.h vr16, vr2, vr6
// Partial squared sum of rows 0-7 -> vr12 (32-bit lanes)
vmulwev.h.bu vr2, vr0, vr0
vmulwod.h.bu vr3, vr0, vr0
vmulwev.h.bu vr6, vr1, vr1
vmulwod.h.bu vr7, vr1, vr1
vmulwev.h.bu vr8, vr4, vr4
vmulwod.h.bu vr9, vr4, vr4
vmulwev.h.bu vr10, vr5, vr5
vmulwod.h.bu vr11, vr5, vr5
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vadd.w vr12, vr2, vr3
vadd.w vr13, vr6, vr7
vadd.w vr14, vr8, vr9
vadd.w vr15, vr10, vr11
vadd.w vr12, vr12, vr13
vadd.w vr14, vr14, vr15
vadd.w vr12, vr12, vr14
// Rows 8-15: same computation, merged with the partials
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3
alsl.d a0, a1, a0, 2
FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr4, vr5, vr4
vilvl.d vr5, vr7, vr6
vhaddw.hu.bu vr2, vr0, vr0
vhaddw.hu.bu vr3, vr1, vr1
vhaddw.hu.bu vr6, vr4, vr4
vhaddw.hu.bu vr7, vr5, vr5
vadd.h vr2, vr2, vr3
vadd.h vr6, vr6, vr7
vadd.h vr2, vr2, vr6
vadd.h vr2, vr2, vr16
// Widening horizontal reduction of the total sum to a scalar
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.wu t5, vr2, 0 // sum
vmulwev.h.bu vr2, vr0, vr0
vmulwod.h.bu vr3, vr0, vr0
vmulwev.h.bu vr6, vr1, vr1
vmulwod.h.bu vr7, vr1, vr1
vmulwev.h.bu vr8, vr4, vr4
vmulwod.h.bu vr9, vr4, vr4
vmulwev.h.bu vr10, vr5, vr5
vmulwod.h.bu vr11, vr5, vr5
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.wu.hu vr3, vr3, vr3
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vadd.w vr2, vr2, vr3
vadd.w vr6, vr6, vr7
vadd.w vr8, vr8, vr9
vadd.w vr10, vr10, vr11
vadd.w vr2, vr2, vr6
vadd.w vr8, vr8, vr10
vadd.w vr2, vr2, vr8
vadd.w vr2, vr2, vr12
// Horizontal reduction of the total squared sum to a scalar
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr2, vr2, vr2
vpickve2gr.du t6, vr2, 0 // sqr
// Pack the uint64 return value: high 32 = sqr, low 32 = sum
slli.d t4, t6, 32
add.d a0, t4, t5
endfunc_x264
/*
* uint64_t pixel_var_16x16( pixel *pix, intptr_t i_stride )
*/
function_x264 pixel_var_16x16_lsx
// In:  a0 = pix, a1 = i_stride
// Out: a0 = (sum_of_squares << 32) | sum, over the 16x16 block
// t0/t1 = 2*/3* stride
slli.d t0, a1, 1
add.d t1, t0, a1
// Rows 0-3: one full 16-byte vector per row
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
// Pixel sum accumulator -> vr13 (16-bit lanes)
vhaddw.hu.bu vr4, vr0, vr0
vhaddw.hu.bu vr5, vr1, vr1
vhaddw.hu.bu vr6, vr2, vr2
vhaddw.hu.bu vr7, vr3, vr3
vadd.h vr4, vr5, vr4
vadd.h vr5, vr7, vr6
vadd.h vr13, vr5, vr4
// Squared-sum accumulator -> vr14 (32-bit lanes)
vmulwev.h.bu vr5, vr0, vr0
vmulwod.h.bu vr6, vr0, vr0
vmulwev.h.bu vr7, vr1, vr1
vmulwod.h.bu vr8, vr1, vr1
vmulwev.h.bu vr9, vr2, vr2
vmulwod.h.bu vr10, vr2, vr2
vmulwev.h.bu vr11, vr3, vr3
vmulwod.h.bu vr12, vr3, vr3
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vadd.w vr5, vr5, vr6
vadd.w vr6, vr8, vr7
vadd.w vr7, vr10, vr9
vadd.w vr8, vr12, vr11
vadd.w vr0, vr5, vr6
vadd.w vr1, vr8, vr7
vadd.w vr14, vr1, vr0
// Remaining three groups of 4 rows, accumulated into vr13/vr14
.rept 3
alsl.d a0, a1, a0, 2
LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
vhaddw.hu.bu vr4, vr0, vr0
vhaddw.hu.bu vr5, vr1, vr1
vhaddw.hu.bu vr6, vr2, vr2
vhaddw.hu.bu vr7, vr3, vr3
vadd.h vr4, vr5, vr4
vadd.h vr5, vr7, vr6
vadd.h vr4, vr5, vr4
vadd.h vr13, vr4, vr13
vmulwev.h.bu vr5, vr0, vr0
vmulwod.h.bu vr6, vr0, vr0
vmulwev.h.bu vr7, vr1, vr1
vmulwod.h.bu vr8, vr1, vr1
vmulwev.h.bu vr9, vr2, vr2
vmulwod.h.bu vr10, vr2, vr2
vmulwev.h.bu vr11, vr3, vr3
vmulwod.h.bu vr12, vr3, vr3
vhaddw.wu.hu vr5, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.wu.hu vr12, vr12, vr12
vadd.w vr5, vr5, vr6
vadd.w vr6, vr8, vr7
vadd.w vr7, vr10, vr9
vadd.w vr8, vr12, vr11
vadd.w vr0, vr5, vr6
vadd.w vr1, vr8, vr7
vadd.w vr0, vr1, vr0
vadd.w vr14, vr0, vr14
.endr
// Reduce the pixel sum to a scalar
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu t4, vr13, 0
// Reduce the squared sum to a scalar
vhaddw.du.wu vr14, vr14, vr14
vhaddw.qu.du vr14, vr14, vr14
vpickve2gr.du t6, vr14, 0 // sqr
// Pack the uint64 return value: high 32 = sqr, low 32 = sum
slli.d t5, t6, 32
add.d a0, t4, t5
endfunc_x264
// Process four 8-pixel rows of fenc (\in0) vs fdec (\in1):
// accumulate the squared differences into \in2 (32-bit lanes,
// via dot-product accumulate) and leave the plain difference sums
// in \in3 (16-bit lanes). Clobbers vr0-vr7.
.macro sse_diff_8width_lsx in0, in1, in2, in3
fld.d f0, \in0, 0
fld.d f1, \in0, FENC_STRIDE
fld.d f2, \in0, FENC_STRIDE * 2
fld.d f3, \in0, FENC_STRIDE * 3
fld.d f4, \in1, 0
fld.d f5, \in1, FDEC_STRIDE
fld.d f6, \in1, FDEC_STRIDE * 2
fld.d f7, \in1, FDEC_STRIDE * 3
// Pack rows pairwise: vr0/vr1 = fenc, vr2/vr3 = fdec
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6
// Even/odd unsigned byte differences, widened to 16 bits
vsubwev.h.bu vr4, vr0, vr2
vsubwod.h.bu vr5, vr0, vr2
vsubwev.h.bu vr6, vr1, vr3
vsubwod.h.bu vr7, vr1, vr3
// sqr_u
vdp2add.w.h \in2, vr4, vr4
vdp2add.w.h \in2, vr5, vr5
vdp2add.w.h \in2, vr6, vr6
vdp2add.w.h \in2, vr7, vr7
// sum_u
vadd.h vr4, vr4, vr5
vadd.h vr6, vr6, vr7
vadd.h \in3, vr4, vr6
.endm
/*
* int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] )
*/
function_x264 pixel_var2_8x8_lsx
// In:  a0 = fenc, a1 = fdec, a2 = int ssd[2]
// Out: a0 = (ssd_u - sum_u^2/64) + (ssd_v - sum_v^2/64);
//      ssd[0]/ssd[1] = raw SSD of each 8x8 plane.
// NOTE(review): the second plane is read FENC_STRIDE/2 /
// FDEC_STRIDE/2 bytes after the first — presumably the U/V chroma
// layout of x264's fenc/fdec buffers; confirm against the caller.
vxor.v vr8, vr8, vr8
// First plane (U): rows 0-3 then rows 4-7
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
// Reduce the accumulated squares to scalar t2
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t2, vr8, 0 // sqr_u
// Reduce the (signed) difference sums to scalar t3
vadd.h vr8, vr10, vr9
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t3, vr8, 0 // sum_u
// Second plane (V), offset by half a stride
addi.d a0, a0, FENC_STRIDE / 2
addi.d a1, a1, FDEC_STRIDE / 2
vxor.v vr8, vr8, vr8
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t4, vr8, 0 // sqr_v
vadd.h vr8, vr10, vr9
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t5, vr8, 0 // sum_v
// Store the raw SSDs for the caller
st.w t2, a2, 0
st.w t4, a2, 4
// var = ssd - sum^2 / 64 (8x8 = 64 pixels), summed over both planes
mul.w t3, t3, t3
mul.w t5, t5, t5
srai.w t3, t3, 6
srai.w t5, t5, 6
sub.w t2, t2, t3
sub.w t4, t4, t5
add.w a0, t2, t4
endfunc_x264
/*
* int pixel_var2_8x16( pixel *fenc, pixel *fdec, int ssd[2] )
*/
function_x264 pixel_var2_8x16_lsx
// In:  a0 = fenc, a1 = fdec, a2 = int ssd[2]
// Out: a0 = (ssd_u - sum_u^2/128) + (ssd_v - sum_v^2/128);
//      ssd[0]/ssd[1] = raw SSD of each 8x16 plane.
// NOTE(review): second plane read FENC_STRIDE/2 / FDEC_STRIDE/2
// bytes after the first — presumably x264's U/V chroma layout;
// confirm against the caller.
vxor.v vr8, vr8, vr8
// First plane (U): four groups of 4 rows
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr11
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr12
// Reduce the accumulated squares to scalar t2
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t2, vr8, 0 // sqr_u
// Reduce the (signed) difference sums to scalar t3
vadd.h vr8, vr10, vr9
vadd.h vr8, vr11, vr8
vadd.h vr8, vr12, vr8
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t3, vr8, 0 // sum_u
// Second plane (V), offset by half a stride
addi.d a0, a0, FENC_STRIDE / 2
addi.d a1, a1, FDEC_STRIDE / 2
vxor.v vr8, vr8, vr8
sse_diff_8width_lsx a0, a1, vr8, vr9
addi.d t0, a0, FENC_STRIDE * 4
addi.d t1, a1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr10
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr11
addi.d t0, t0, FENC_STRIDE * 4
addi.d t1, t1, FDEC_STRIDE * 4
sse_diff_8width_lsx t0, t1, vr8, vr12
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t4, vr8, 0 // sqr_v
vadd.h vr8, vr10, vr9
vadd.h vr8, vr11, vr8
vadd.h vr8, vr12, vr8
vhaddw.w.h vr8, vr8, vr8
vhaddw.d.w vr8, vr8, vr8
vhaddw.q.d vr8, vr8, vr8
vpickve2gr.w t5, vr8, 0 // sum_v
// Store the raw SSDs for the caller
st.w t2, a2, 0
st.w t4, a2, 4
// var = ssd - sum^2 / 128 (8x16 = 128 pixels), summed over planes
mul.w t3, t3, t3
mul.w t5, t5, t5
srai.w t3, t3, 7
srai.w t5, t5, 7
sub.w t2, t2, t3
sub.w t4, t4, t5
add.w a0, t2, t4
endfunc_x264
#endif /* !HIGH_BIT_DEPTH */