/* (page-scrape metadata, not part of the original source:
 * 2025-04-28 08:47:28 +08:00 — 2586 lines — 96 KiB — tagged "ArmAsm",
 * actually LoongArch LSX/LASX assembly) */

/*****************************************************************************
* sad-a.S: loongarch sad functions
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Lu Wang <wanglu@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "loongson_asm.S"
#include "loongson_util.S"
#if !HIGH_BIT_DEPTH
/* void x264_pixel_sad_x4_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
// SAD of one 16x16 source block against four 16x16 references (LASX).
// Register roles:
//   t1 = 2*i_ref_stride, t2 = 3*i_ref_stride, t3 = 4*i_ref_stride
//   xr12/xr13/xr14/xr15 = running 16-bit SAD accumulators for ref0..ref3
// Source rows appear packed contiguously: offsets step by 32 bytes per pair
// of 16-byte rows, so one xvld fetches two source rows at once
// (presumably FENC_STRIDE == 16 -- confirm against caller).
// Each stage below processes 4 rows: two rows per 256-bit register, with the
// second row of a pair merged into the high 128 bits via xvpermi.q 0x02.
function_x264 pixel_sad_x4_16x16_lasx
slli.d t1, a5, 1
add.d t2, a5, t1
slli.d t3, a5, 2
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 0
xvld xr16, a0, 32
vld vr4, a1, 0
vldx vr8, a1, a5
vld vr5, a2, 0
vldx vr9, a2, a5
vld vr6, a3, 0
vldx vr10, a3, a5
vld vr7, a4, 0
vldx vr11, a4, a5
// Merge row pairs: low 128 bits = row 0, high 128 bits = row 1 of each ref
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
// Widen byte diffs to 16 bits; xr12..xr15 start the per-ref accumulators
xvhaddw.hu.bu xr12, xr8, xr8
xvhaddw.hu.bu xr13, xr9, xr9
xvhaddw.hu.bu xr14, xr10, xr10
xvhaddw.hu.bu xr15, xr11, xr11
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
vldx vr4, a1, t1
vldx vr8, a1, t2
vldx vr5, a2, t1
vldx vr9, a2, t2
vldx vr6, a3, t1
vldx vr10, a3, t2
vldx vr7, a4, t1
vldx vr11, a4, t2
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr16, xr4
xvabsd.bu xr9, xr16, xr5
xvabsd.bu xr10, xr16, xr6
xvabsd.bu xr11, xr16, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Advance all four reference pointers by 4 rows
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
add.d a4, a4, t3
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 64
xvld xr16, a0, 96
vld vr4, a1, 0
vldx vr8, a1, a5
vld vr5, a2, 0
vldx vr9, a2, a5
vld vr6, a3, 0
vldx vr10, a3, a5
vld vr7, a4, 0
vldx vr11, a4, a5
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
vldx vr4, a1, t1
vldx vr8, a1, t2
vldx vr5, a2, t1
vldx vr9, a2, t2
vldx vr6, a3, t1
vldx vr10, a3, t2
vldx vr7, a4, t1
vldx vr11, a4, t2
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr16, xr4
xvabsd.bu xr9, xr16, xr5
xvabsd.bu xr10, xr16, xr6
xvabsd.bu xr11, xr16, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
add.d a4, a4, t3
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 128
xvld xr16, a0, 160
vld vr4, a1, 0
vldx vr8, a1, a5
vld vr5, a2, 0
vldx vr9, a2, a5
vld vr6, a3, 0
vldx vr10, a3, a5
vld vr7, a4, 0
vldx vr11, a4, a5
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
vldx vr4, a1, t1
vldx vr8, a1, t2
vldx vr5, a2, t1
vldx vr9, a2, t2
vldx vr6, a3, t1
vldx vr10, a3, t2
vldx vr7, a4, t1
vldx vr11, a4, t2
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr16, xr4
xvabsd.bu xr9, xr16, xr5
xvabsd.bu xr10, xr16, xr6
xvabsd.bu xr11, xr16, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
add.d a4, a4, t3
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 192
xvld xr16, a0, 224
vld vr4, a1, 0
vldx vr8, a1, a5
vld vr5, a2, 0
vldx vr9, a2, a5
vld vr6, a3, 0
vldx vr10, a3, a5
vld vr7, a4, 0
vldx vr11, a4, a5
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
vldx vr4, a1, t1
vldx vr8, a1, t2
vldx vr5, a2, t1
vldx vr9, a2, t2
vldx vr6, a3, t1
vldx vr10, a3, t2
vldx vr7, a4, t1
vldx vr11, a4, t2
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr16, xr4
xvabsd.bu xr9, xr16, xr5
xvabsd.bu xr10, xr16, xr6
xvabsd.bu xr11, xr16, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Final reduction: pair accumulators across 128-bit halves (ref0 with ref2,
// ref1 with ref3), add, then horizontally reduce each half to one 32-bit sum.
xvori.b xr17, xr12, 0
xvori.b xr18, xr13, 0
xvpermi.q xr12, xr14, 0x02
xvpermi.q xr14, xr17, 0x31
xvpermi.q xr13, xr15, 0x02
xvpermi.q xr15, xr18, 0x31
xvadd.h xr12, xr12, xr14
xvadd.h xr13, xr13, xr15
xvhaddw.w.h xr12, xr12, xr12
xvhaddw.w.h xr13, xr13, xr13
xvhaddw.d.w xr12, xr12, xr12
xvhaddw.d.w xr13, xr13, xr13
xvhaddw.q.d xr12, xr12, xr12
xvhaddw.q.d xr13, xr13, xr13
// Interleave so the four 32-bit SADs land in elements 0,1 / 4,5 of xr13
xvpackev.w xr13, xr13, xr12
// Store data to p_sad_array
xvstelm.d xr13, a6, 0, 0
xvstelm.d xr13, a6, 8, 2
endfunc_x264
/* void x264_pixel_sad_x4_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
// SAD of one 16x8 source block against four 16x8 references (LASX).
// t1 = 2*i_ref_stride, t2 = 3*i_ref_stride, t3 = 4*i_ref_stride.
// xr12..xr15 hold running 16-bit SADs for ref0..ref3. Same 2-rows-per-
// register layout as the 16x16 variant, but only two 4-row stages.
// Source is loaded at offsets 0/32/64/96, i.e. packed rows (stride 16
// assumed -- confirm against caller).
function_x264 pixel_sad_x4_16x8_lasx
slli.d t1, a5, 1
add.d t2, a5, t1
slli.d t3, a5, 2
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 0
vld vr4, a1, 0
vldx vr8, a1, a5
vld vr5, a2, 0
vldx vr9, a2, a5
vld vr6, a3, 0
vldx vr10, a3, a5
vld vr7, a4, 0
vldx vr11, a4, a5
// Merge row pairs: low half = row 0, high half = row 1 of each ref
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
// Widen to 16-bit partial sums; start the per-ref accumulators
xvhaddw.hu.bu xr12, xr8, xr8
xvhaddw.hu.bu xr13, xr9, xr9
xvhaddw.hu.bu xr14, xr10, xr10
xvhaddw.hu.bu xr15, xr11, xr11
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 32
vldx vr4, a1, t1
vldx vr8, a1, t2
vldx vr5, a2, t1
vldx vr9, a2, t2
vldx vr6, a3, t1
vldx vr10, a3, t2
vldx vr7, a4, t1
vldx vr11, a4, t2
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Advance all four reference pointers by 4 rows
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
add.d a4, a4, t3
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 64
vld vr4, a1, 0
vldx vr8, a1, a5
vld vr5, a2, 0
vldx vr9, a2, a5
vld vr6, a3, 0
vldx vr10, a3, a5
vld vr7, a4, 0
vldx vr11, a4, a5
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
xvld xr3, a0, 96
vldx vr4, a1, t1
vldx vr8, a1, t2
vldx vr5, a2, t1
vldx vr9, a2, t2
vldx vr6, a3, t1
vldx vr10, a3, t2
vldx vr7, a4, t1
vldx vr11, a4, t2
xvpermi.q xr4, xr8, 0x02
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr6, xr10, 0x02
xvpermi.q xr7, xr11, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvadd.h xr12, xr12, xr8
xvadd.h xr13, xr13, xr9
xvadd.h xr14, xr14, xr10
xvadd.h xr15, xr15, xr11
// Final reduction: pair accumulators across 128-bit halves (ref0 with ref2,
// ref1 with ref3), add, then horizontally reduce each half to one 32-bit sum.
xvori.b xr17, xr12, 0
xvori.b xr18, xr13, 0
xvpermi.q xr12, xr14, 0x02
xvpermi.q xr14, xr17, 0x31
xvpermi.q xr13, xr15, 0x02
xvpermi.q xr15, xr18, 0x31
xvadd.h xr12, xr12, xr14
xvadd.h xr13, xr13, xr15
xvhaddw.w.h xr12, xr12, xr12
xvhaddw.w.h xr13, xr13, xr13
xvhaddw.d.w xr12, xr12, xr12
xvhaddw.d.w xr13, xr13, xr13
xvhaddw.q.d xr12, xr12, xr12
xvhaddw.q.d xr13, xr13, xr13
// Interleave so the four 32-bit SADs land in elements 0,1 / 4,5 of xr13
xvpackev.w xr13, xr13, xr12
// Store data to p_sad_array
xvstelm.d xr13, a6, 0, 0
xvstelm.d xr13, a6, 8, 2
endfunc_x264
/* void x264_pixel_sad_x4_8x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
// SAD of one 8x8 source block against four 8x8 references (LASX).
// t1 = 2*i_ref_stride, t2 = 3*i_ref_stride, t3 = 4*i_ref_stride.
// Layout: for each row r, one 256-bit register holds row r of all four
// refs, one ref per 64-bit lane; xvldrepl.d broadcasts the matching 8-byte
// source row into all four lanes so a single xvabsd.bu compares the row
// against every reference at once. Source row offsets step by 16 bytes
// (stride 16 assumed -- confirm against caller).
function_x264 pixel_sad_x4_8x8_lasx
slli.d t1, a5, 1
add.d t2, t1, a5
slli.d t3, a5, 2
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
// (rows 0..3 of each ref: f4/f8/f14/f18 = ref0, f5/f9/f15/f19 = ref1, ...)
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21
// Interleave so each vector holds one row of two refs, then merge to
// one row of all four refs per 256-bit register (xr4/xr8/xr14/xr18 = rows 0..3)
vilvl.d vr4, vr5, vr4
vilvl.d vr6, vr7, vr6
vilvl.d vr8, vr9, vr8
vilvl.d vr10, vr11, vr10
vilvl.d vr14, vr15, vr14
vilvl.d vr16, vr17, vr16
vilvl.d vr18, vr19, vr18
vilvl.d vr20, vr21, vr20
xvpermi.q xr4, xr6, 0x02
xvpermi.q xr8, xr10, 0x02
xvpermi.q xr14, xr16, 0x02
xvpermi.q xr18, xr20, 0x02
// Calculate the absolute value of the difference
xvldrepl.d xr3, a0, 0
xvabsd.bu xr5, xr3, xr4
xvldrepl.d xr3, a0, 16
xvabsd.bu xr9, xr3, xr8
xvldrepl.d xr3, a0, 32
xvabsd.bu xr10, xr3, xr14
xvldrepl.d xr3, a0, 48
xvabsd.bu xr11, xr3, xr18
// Accumulate even/odd byte lanes widened to 16 bits (xr0..xr2, xr22)
xvaddwev.h.bu xr0, xr5, xr9
xvaddwod.h.bu xr1, xr5, xr9
xvaddwev.h.bu xr2, xr10, xr11
xvaddwod.h.bu xr22, xr10, xr11
// Advance all four reference pointers by 4 rows
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
add.d a4, a4, t3
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 4..7)
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21
vilvl.d vr4, vr5, vr4
vilvl.d vr6, vr7, vr6
vilvl.d vr8, vr9, vr8
vilvl.d vr10, vr11, vr10
vilvl.d vr14, vr15, vr14
vilvl.d vr16, vr17, vr16
vilvl.d vr18, vr19, vr18
vilvl.d vr20, vr21, vr20
xvpermi.q xr4, xr6, 0x02
xvpermi.q xr8, xr10, 0x02
xvpermi.q xr14, xr16, 0x02
xvpermi.q xr18, xr20, 0x02
// Calculate the absolute value of the difference
xvldrepl.d xr3, a0, 64
xvabsd.bu xr5, xr3, xr4
xvldrepl.d xr3, a0, 80
xvabsd.bu xr9, xr3, xr8
xvldrepl.d xr3, a0, 96
xvabsd.bu xr10, xr3, xr14
xvldrepl.d xr3, a0, 112
xvabsd.bu xr11, xr3, xr18
xvaddwev.h.bu xr12, xr5, xr9
xvaddwod.h.bu xr13, xr5, xr9
xvaddwev.h.bu xr14, xr10, xr11
xvaddwod.h.bu xr15, xr10, xr11
// Merge the two 4-row halves, then collapse everything into xr10:
// each 64-bit lane now carries one reference's total
xvadd.h xr5, xr0, xr12
xvadd.h xr9, xr1, xr13
xvadd.h xr10, xr2, xr14
xvadd.h xr11, xr22, xr15
xvadd.h xr5, xr5, xr9
xvadd.h xr10, xr10, xr11
xvadd.h xr10, xr10, xr5
xvhaddw.wu.hu xr10, xr10, xr10
xvhaddw.du.wu xr10, xr10, xr10
// Gather the four per-lane sums into one 128-bit vector of 32-bit SADs
xvpermi.q xr5, xr10, 0x01
xvpickev.w xr10, xr5, xr10
// Store data to p_sad_array
vst vr10, a6, 0
endfunc_x264
/* void x264_pixel_sad_x4_8x4_lasx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
// SAD of one 8x4 source block against four 8x4 references (LASX).
// t1 = 2*i_ref_stride, t2 = 3*i_ref_stride.
// xr3 holds all 4 source rows (two per 128-bit half). The refs are
// cross-paired across halves (e.g. xr4 = ref0 rows 0-1 | ref2 rows 2-3),
// so that after swapping the halves of xr10/xr11 (xvpermi.d 0x4e) each
// 128-bit half of the sums accumulates one complete reference.
function_x264 pixel_sad_x4_8x4_lasx
slli.d t1, a5, 1
add.d t2, t1, a5
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
fld.d f2, a0, 0
fld.d f3, a0, 16
fld.d f12, a0, 32
fld.d f13, a0, 48
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21
// Pack row pairs (rows 0-1 and rows 2-3) for src and each ref
vilvl.d vr3, vr3, vr2
vilvl.d vr4, vr8, vr4
vilvl.d vr5, vr9, vr5
vilvl.d vr6, vr10, vr6
vilvl.d vr7, vr11, vr7
vilvl.d vr13, vr13, vr12
vilvl.d vr14, vr18, vr14
vilvl.d vr15, vr19, vr15
vilvl.d vr16, vr20, vr16
vilvl.d vr17, vr21, vr17
xvpermi.q xr3, xr13, 0x02
xvpermi.q xr4, xr16, 0x02
xvpermi.q xr5, xr17, 0x02
xvpermi.q xr6, xr14, 0x02
xvpermi.q xr7, xr15, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr8, xr3, xr4
xvabsd.bu xr9, xr3, xr5
xvabsd.bu xr10, xr3, xr6
xvabsd.bu xr11, xr3, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
// Swap halves so each 128-bit half gathers one ref's rows, then reduce
xvpermi.d xr10, xr10, 0x4e
xvpermi.d xr11, xr11, 0x4e
xvadd.h xr8, xr8, xr10
xvadd.h xr9, xr9, xr11
xvhaddw.w.h xr8, xr8, xr8
xvhaddw.w.h xr9, xr9, xr9
xvhaddw.d.w xr8, xr8, xr8
xvhaddw.d.w xr9, xr9, xr9
xvhaddw.q.d xr8, xr8, xr8
xvhaddw.q.d xr9, xr9, xr9
// Interleave the four 32-bit SADs for a pair of 64-bit stores
xvpackev.w xr9, xr9, xr8
// Store data to p_sad_array
xvstelm.d xr9, a6, 0, 0
xvstelm.d xr9, a6, 8, 2
endfunc_x264
/* void x264_pixel_sad_x4_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
// SAD of one 4x4 source block against four 4x4 references (LSX, 128-bit).
// t0 = 2*i_ref_stride, t1 = 3*i_ref_stride, t2 = 4*i_ref_stride (unused
// past setup). All 16 source bytes and all 16 bytes of each ref are packed
// into one 128-bit vector apiece; each comparison is a single vabsd.bu
// followed by a full horizontal reduction to one 32-bit SAD.
// Source rows are read at offsets 0/16/32/48 (stride 16 assumed --
// confirm against caller).
function_x264 pixel_sad_x4_4x4_lsx
slli.d t0, a5, 1
add.d t1, a5, t0
slli.d t2, a5, 2
// Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
fld.s f2, a0, 0
fld.s f3, a0, 16
fld.s f4, a1, 0
fldx.s f8, a1, a5
fld.s f5, a2, 0
fldx.s f9, a2, a5
fld.s f6, a3, 0
fldx.s f10, a3, a5
fld.s f7, a4, 0
fldx.s f11, a4, a5
// Interleave rows 0 and 1 of src (vr3) and of each ref (vr4..vr7)
vilvl.w vr3, vr3, vr2
vilvl.w vr4, vr8, vr4
vilvl.w vr5, vr9, vr5
vilvl.w vr6, vr10, vr6
vilvl.w vr7, vr11, vr7
fld.s f2, a0, 32
fld.s f0, a0, 48
fldx.s f8, a1, t0
fldx.s f12, a1, t1
fldx.s f9, a2, t0
fldx.s f13, a2, t1
fldx.s f10, a3, t0
fldx.s f14, a3, t1
fldx.s f11, a4, t0
fldx.s f15, a4, t1
// Interleave rows 2 and 3, then combine so each vector holds all 4 rows
vilvl.w vr2, vr0, vr2
vilvl.w vr8, vr12, vr8
vilvl.w vr9, vr13, vr9
vilvl.w vr10, vr14, vr10
vilvl.w vr11, vr15, vr11
vilvl.d vr3, vr2, vr3
vilvl.d vr4, vr8, vr4
vilvl.d vr5, vr9, vr5
vilvl.d vr6, vr10, vr6
vilvl.d vr7, vr11, vr7
// Calculate the absolute value of the difference
vabsd.bu vr8, vr3, vr4
vabsd.bu vr9, vr3, vr5
vabsd.bu vr10, vr3, vr6
vabsd.bu vr11, vr3, vr7
// Horizontal reduction: bytes -> halfwords -> words -> dwords -> qword
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.wu.hu vr10, vr10, vr10
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.du.wu vr8, vr8, vr8
vhaddw.du.wu vr9, vr9, vr9
vhaddw.du.wu vr10, vr10, vr10
vhaddw.du.wu vr11, vr11, vr11
vhaddw.qu.du vr8, vr8, vr8
vhaddw.qu.du vr9, vr9, vr9
vhaddw.qu.du vr10, vr10, vr10
vhaddw.qu.du vr11, vr11, vr11
// Store data to p_sad_array
vstelm.w vr8, a6, 0, 0
vstelm.w vr9, a6, 4, 0
vstelm.w vr10, a6, 8, 0
vstelm.w vr11, a6, 12, 0
endfunc_x264
/* void x264_pixel_sad_x3_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
// SAD of one 16x16 source block against three 16x16 references (LASX).
// t1 = 2*i_ref_stride, t2 = 3*i_ref_stride, t3 = 4*i_ref_stride.
// Per 4-row stage: xr2 = src rows 0-1, xr3 = src rows 2-3 (two 16-byte
// rows per 256-bit register; packed source, stride 16 assumed).
// Accumulators: xr16/xr17/xr18 = ref0/ref1/ref2 for the even row pairs,
// xr19/xr20/xr21 = same refs for the odd row pairs; summed at the end.
function_x264 pixel_sad_x3_16x16_lasx
// Load data from p_src, p_ref0, p_ref1 and p_ref2
slli.d t1, a4, 1
add.d t2, a4, t1
slli.d t3, a4, 2
xvld xr2, a0, 0
xvld xr3, a0, 32
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
// Merge row pairs: xr4/xr5/xr6 = rows 0-1, xr10/xr11/xr12 = rows 2-3
xvpermi.q xr4, xr7, 0x02
xvpermi.q xr5, xr8, 0x02
xvpermi.q xr6, xr9, 0x02
xvpermi.q xr10, xr13, 0x02
xvpermi.q xr11, xr14, 0x02
xvpermi.q xr12, xr15, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr7, xr2, xr4
xvabsd.bu xr8, xr2, xr5
xvabsd.bu xr9, xr2, xr6
xvabsd.bu xr10, xr3, xr10
xvabsd.bu xr11, xr3, xr11
xvabsd.bu xr12, xr3, xr12
// Widen to 16-bit partial sums; start the six accumulators
xvhaddw.hu.bu xr16, xr7, xr7
xvhaddw.hu.bu xr17, xr8, xr8
xvhaddw.hu.bu xr18, xr9, xr9
xvhaddw.hu.bu xr19, xr10, xr10
xvhaddw.hu.bu xr20, xr11, xr11
xvhaddw.hu.bu xr21, xr12, xr12
// Advance all three reference pointers by 4 rows
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
xvld xr2, a0, 64
xvld xr3, a0, 96
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
xvpermi.q xr4, xr7, 0x02
xvpermi.q xr5, xr8, 0x02
xvpermi.q xr6, xr9, 0x02
xvpermi.q xr10, xr13, 0x02
xvpermi.q xr11, xr14, 0x02
xvpermi.q xr12, xr15, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr7, xr2, xr4
xvabsd.bu xr8, xr2, xr5
xvabsd.bu xr9, xr2, xr6
xvabsd.bu xr10, xr3, xr10
xvabsd.bu xr11, xr3, xr11
xvabsd.bu xr12, xr3, xr12
xvhaddw.hu.bu xr7, xr7, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvhaddw.hu.bu xr12, xr12, xr12
xvadd.h xr16, xr16, xr7
xvadd.h xr17, xr17, xr8
xvadd.h xr18, xr18, xr9
xvadd.h xr19, xr19, xr10
xvadd.h xr20, xr20, xr11
xvadd.h xr21, xr21, xr12
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
xvld xr2, a0, 128
xvld xr3, a0, 160
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
xvpermi.q xr4, xr7, 0x02
xvpermi.q xr5, xr8, 0x02
xvpermi.q xr6, xr9, 0x02
xvpermi.q xr10, xr13, 0x02
xvpermi.q xr11, xr14, 0x02
xvpermi.q xr12, xr15, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr7, xr2, xr4
xvabsd.bu xr8, xr2, xr5
xvabsd.bu xr9, xr2, xr6
xvabsd.bu xr10, xr3, xr10
xvabsd.bu xr11, xr3, xr11
xvabsd.bu xr12, xr3, xr12
xvhaddw.hu.bu xr7, xr7, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvhaddw.hu.bu xr12, xr12, xr12
xvadd.h xr16, xr16, xr7
xvadd.h xr17, xr17, xr8
xvadd.h xr18, xr18, xr9
xvadd.h xr19, xr19, xr10
xvadd.h xr20, xr20, xr11
xvadd.h xr21, xr21, xr12
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
xvld xr2, a0, 192
xvld xr3, a0, 224
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
xvpermi.q xr4, xr7, 0x02
xvpermi.q xr5, xr8, 0x02
xvpermi.q xr6, xr9, 0x02
xvpermi.q xr10, xr13, 0x02
xvpermi.q xr11, xr14, 0x02
xvpermi.q xr12, xr15, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr7, xr2, xr4
xvabsd.bu xr8, xr2, xr5
xvabsd.bu xr9, xr2, xr6
xvabsd.bu xr10, xr3, xr10
xvabsd.bu xr11, xr3, xr11
xvabsd.bu xr12, xr3, xr12
xvhaddw.hu.bu xr7, xr7, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvhaddw.hu.bu xr12, xr12, xr12
xvadd.h xr16, xr16, xr7
xvadd.h xr17, xr17, xr8
xvadd.h xr18, xr18, xr9
xvadd.h xr19, xr19, xr10
xvadd.h xr20, xr20, xr11
xvadd.h xr21, xr21, xr12
// Combine even/odd row-pair accumulators per ref, then reduce each
// 256-bit sum to two 32-bit lane totals and add them together
xvadd.h xr11, xr16, xr19
xvadd.h xr12, xr17, xr20
xvadd.h xr13, xr18, xr21
xvhaddw.wu.hu xr11, xr11, xr11
xvhaddw.wu.hu xr12, xr12, xr12
xvhaddw.wu.hu xr13, xr13, xr13
xvhaddw.du.wu xr11, xr11, xr11
xvhaddw.du.wu xr12, xr12, xr12
xvhaddw.du.wu xr13, xr13, xr13
xvhaddw.qu.du xr11, xr11, xr11
xvhaddw.qu.du xr12, xr12, xr12
xvhaddw.qu.du xr13, xr13, xr13
// Element 4 = the high-128 lane's total; fold it into element 0
xvpickve.w xr17, xr11, 4
xvpickve.w xr18, xr12, 4
xvpickve.w xr19, xr13, 4
xvadd.w xr11, xr11, xr17
xvadd.w xr12, xr12, xr18
xvadd.w xr13, xr13, xr19
// Store data to p_sad_array
vstelm.w vr11, a5, 0, 0
vstelm.w vr12, a5, 4, 0
vstelm.w vr13, a5, 8, 0
endfunc_x264
/* void x264_pixel_sad_x3_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
// SAD of one 16x8 source block against three 16x8 references (LASX).
// Same structure as pixel_sad_x3_16x16_lasx but only two 4-row stages:
// t1 = 2*i_ref_stride, t2 = 3*i_ref_stride, t3 = 4*i_ref_stride;
// xr16/xr17/xr18 and xr19/xr20/xr21 accumulate ref0/ref1/ref2 for the
// even and odd row pairs respectively. Source is packed (stride 16 assumed).
function_x264 pixel_sad_x3_16x8_lasx
// Load data from p_src, p_ref0, p_ref1 and p_ref2
slli.d t1, a4, 1
add.d t2, a4, t1
slli.d t3, a4, 2
xvld xr2, a0, 0
xvld xr3, a0, 32
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
// Merge row pairs: xr4/xr5/xr6 = rows 0-1, xr10/xr11/xr12 = rows 2-3
xvpermi.q xr4, xr7, 0x02
xvpermi.q xr5, xr8, 0x02
xvpermi.q xr6, xr9, 0x02
xvpermi.q xr10, xr13, 0x02
xvpermi.q xr11, xr14, 0x02
xvpermi.q xr12, xr15, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr7, xr2, xr4
xvabsd.bu xr8, xr2, xr5
xvabsd.bu xr9, xr2, xr6
xvabsd.bu xr10, xr3, xr10
xvabsd.bu xr11, xr3, xr11
xvabsd.bu xr12, xr3, xr12
// Widen to 16-bit partial sums; start the six accumulators
xvhaddw.hu.bu xr16, xr7, xr7
xvhaddw.hu.bu xr17, xr8, xr8
xvhaddw.hu.bu xr18, xr9, xr9
xvhaddw.hu.bu xr19, xr10, xr10
xvhaddw.hu.bu xr20, xr11, xr11
xvhaddw.hu.bu xr21, xr12, xr12
// Advance all three reference pointers by 4 rows
add.d a1, a1, t3
add.d a2, a2, t3
add.d a3, a3, t3
xvld xr2, a0, 64
xvld xr3, a0, 96
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
xvpermi.q xr4, xr7, 0x02
xvpermi.q xr5, xr8, 0x02
xvpermi.q xr6, xr9, 0x02
xvpermi.q xr10, xr13, 0x02
xvpermi.q xr11, xr14, 0x02
xvpermi.q xr12, xr15, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr7, xr2, xr4
xvabsd.bu xr8, xr2, xr5
xvabsd.bu xr9, xr2, xr6
xvabsd.bu xr10, xr3, xr10
xvabsd.bu xr11, xr3, xr11
xvabsd.bu xr12, xr3, xr12
xvhaddw.hu.bu xr7, xr7, xr7
xvhaddw.hu.bu xr8, xr8, xr8
xvhaddw.hu.bu xr9, xr9, xr9
xvhaddw.hu.bu xr10, xr10, xr10
xvhaddw.hu.bu xr11, xr11, xr11
xvhaddw.hu.bu xr12, xr12, xr12
xvadd.h xr16, xr16, xr7
xvadd.h xr17, xr17, xr8
xvadd.h xr18, xr18, xr9
xvadd.h xr19, xr19, xr10
xvadd.h xr20, xr20, xr11
xvadd.h xr21, xr21, xr12
// Combine even/odd accumulators per ref and reduce to 32-bit lane totals
xvadd.h xr11, xr16, xr19
xvadd.h xr12, xr17, xr20
xvadd.h xr13, xr18, xr21
xvhaddw.wu.hu xr11, xr11, xr11
xvhaddw.wu.hu xr12, xr12, xr12
xvhaddw.wu.hu xr13, xr13, xr13
xvhaddw.du.wu xr11, xr11, xr11
xvhaddw.du.wu xr12, xr12, xr12
xvhaddw.du.wu xr13, xr13, xr13
xvhaddw.qu.du xr11, xr11, xr11
xvhaddw.qu.du xr12, xr12, xr12
xvhaddw.qu.du xr13, xr13, xr13
// Element 4 = the high-128 lane's total; fold it into element 0
xvpickve.w xr17, xr11, 4
xvpickve.w xr18, xr12, 4
xvpickve.w xr19, xr13, 4
xvadd.w xr11, xr11, xr17
xvadd.w xr12, xr12, xr18
xvadd.w xr13, xr13, xr19
// Store data to p_sad_array
vstelm.w vr11, a5, 0, 0
vstelm.w vr12, a5, 4, 0
vstelm.w vr13, a5, 8, 0
endfunc_x264
/* void x264_pixel_sad_x3_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
// SAD of one 4x4 source block against three 4x4 references (LSX).
// t1 = 2*i_ref_stride, t2 = 3*i_ref_stride. All 16 bytes of the source
// (vr3) and of each ref (vr4/vr5/vr6) are gathered into single 128-bit
// vectors, then each comparison is one vabsd.bu plus a full horizontal
// reduction. Source rows read at offsets 0/16/32/48 (stride 16 assumed).
function_x264 pixel_sad_x3_4x4_lsx
slli.d t1, a4, 1
add.d t2, a4, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
fld.s f3, a0, 0
fld.s f7, a0, 16
fld.s f11, a0, 32
fld.s f15, a0, 48
FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
// Interleave rows 0-1 and rows 2-3, then merge to 4 rows per vector
vilvl.w vr3, vr7, vr3
vilvl.w vr4, vr8, vr4
vilvl.w vr5, vr9, vr5
vilvl.w vr6, vr10, vr6
vilvl.w vr11, vr15, vr11
vilvl.w vr12, vr16, vr12
vilvl.w vr13, vr17, vr13
vilvl.w vr14, vr18, vr14
vilvl.d vr3, vr11, vr3
vilvl.d vr4, vr12, vr4
vilvl.d vr5, vr13, vr5
vilvl.d vr6, vr14, vr6
// Calculate the absolute value of the difference
vabsd.bu vr7, vr3, vr4
vabsd.bu vr8, vr3, vr5
vabsd.bu vr9, vr3, vr6
// Horizontal reduction: bytes -> halfwords -> words -> dwords -> qword
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.du.wu vr7, vr7, vr7
vhaddw.du.wu vr8, vr8, vr8
vhaddw.du.wu vr9, vr9, vr9
vhaddw.qu.du vr7, vr7, vr7
vhaddw.qu.du vr8, vr8, vr8
vhaddw.qu.du vr9, vr9, vr9
// Store data to p_sad_array
vstelm.w vr7, a5, 0, 0
vstelm.w vr8, a5, 4, 0
vstelm.w vr9, a5, 8, 0
endfunc_x264
/* int32_t x264_pixel_sad_8x4_lasx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 8x4 SAD between one source and one reference block (LASX).
// t1/t3 = 2x/3x src stride, t2/t4 = 2x/3x ref stride. All four 8-byte rows
// of each block are packed into one 256-bit register (two rows per 128-bit
// half); the SAD reduces per-half, and the two half-sums are added in
// scalar registers. Result returned in a0.
function_x264 pixel_sad_8x4_lasx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
// Load data from p_src and p_ref
FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.d vr3, vr5, vr3
vilvl.d vr4, vr6, vr4
vilvl.d vr7, vr9, vr7
vilvl.d vr8, vr10, vr8
xvpermi.q xr3, xr7, 0x02
xvpermi.q xr4, xr8, 0x02
// Calculate the absolute value of the difference
xvabsd.bu xr5, xr3, xr4
xvhaddw.hu.bu xr6, xr5, xr5
xvhaddw.wu.hu xr6, xr6, xr6
xvhaddw.du.wu xr6, xr6, xr6
xvhaddw.qu.du xr6, xr6, xr6
// Add the two 128-bit lane totals (elements 0 and 4) for the final SAD
xvpickve2gr.wu t2, xr6, 0
xvpickve2gr.wu t3, xr6, 4
add.d a0, t2, t3
endfunc_x264
/* int32_t x264_pixel_sad_4x4_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 4x4 SAD between one source and one reference block (LSX).
// t1/t3 = 2x/3x src stride, t2/t4 = 2x/3x ref stride. The 16 bytes of each
// block are gathered into a single 128-bit vector; one vabsd.bu plus a
// widening horizontal-add chain produces the SAD, returned in a0.
function_x264 pixel_sad_4x4_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
// Load data from p_src and p_ref
FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.w vr3, vr5, vr3
vilvl.w vr4, vr6, vr4
vilvl.w vr7, vr9, vr7
vilvl.w vr8, vr10, vr8
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
// Calculate the absolute value of the difference
vabsd.bu vr5, vr3, vr4
vhaddw.hu.bu vr6, vr5, vr5
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.du.wu vr6, vr6, vr6
vhaddw.qu.du vr6, vr6, vr6
vpickve2gr.wu a0, vr6, 0
endfunc_x264
/* int32_t x264_pixel_sad_4x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 4x8 SAD between one source and one reference block (LSX).
// Two 4-row passes: each packs 16 bytes per block into one vector,
// takes byte absolute differences, and widens to 16-bit partial sums
// (vr11 = first pass, vr5 = second); the halves are added and reduced
// to the final SAD in a0. alsl.d advances both pointers by 4 rows.
function_x264 pixel_sad_4x8_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
// Load data from p_src and p_ref
FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.w vr3, vr5, vr3
vilvl.w vr4, vr6, vr4
vilvl.w vr7, vr9, vr7
vilvl.w vr8, vr10, vr8
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vabsd.bu vr11, vr3, vr4
vhaddw.hu.bu vr11, vr11, vr11
// Advance src/ref by 4 rows (ptr += 4*stride)
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.w vr3, vr5, vr3
vilvl.w vr4, vr6, vr4
vilvl.w vr7, vr9, vr7
vilvl.w vr8, vr10, vr8
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vabsd.bu vr5, vr3, vr4
vhaddw.hu.bu vr5, vr5, vr5
// Combine both passes and reduce to a single 32-bit SAD
vadd.h vr6, vr11, vr5
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.du.wu vr6, vr6, vr6
vhaddw.qu.du vr6, vr6, vr6
vpickve2gr.wu a0, vr6, 0
endfunc_x264
/* int32_t x264_pixel_sad_4x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 4x16 SAD between one source and one reference block (LSX).
// Same 4-rows-per-pass scheme as pixel_sad_4x8_lsx: the first pass
// initializes the 16-bit accumulator vr11, then .rept unrolls three more
// passes, each advancing both pointers by 4 rows and adding into vr11.
// Final widening reduction leaves the SAD in a0.
function_x264 pixel_sad_4x16_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
// Load data from p_src and p_ref
FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.w vr3, vr5, vr3
vilvl.w vr4, vr6, vr4
vilvl.w vr7, vr9, vr7
vilvl.w vr8, vr10, vr8
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vabsd.bu vr11, vr3, vr4
vhaddw.hu.bu vr11, vr11, vr11
// Three more identical 4-row passes, accumulating into vr11
.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.w vr3, vr5, vr3
vilvl.w vr4, vr6, vr4
vilvl.w vr7, vr9, vr7
vilvl.w vr8, vr10, vr8
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vabsd.bu vr12, vr3, vr4
vhaddw.hu.bu vr12, vr12, vr12
vadd.h vr11, vr11, vr12
.endr
// Reduce accumulator to a single 32-bit SAD
vhaddw.wu.hu vr11, vr11, vr11
vhaddw.du.wu vr11, vr11, vr11
vhaddw.qu.du vr11, vr11, vr11
vpickve2gr.wu a0, vr11, 0
endfunc_x264
/* int32_t x264_pixel_sad_8x4_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 8x4 SAD between one source and one reference block (LSX variant).
// t1/t3 = 2x/3x src stride, t2/t4 = 2x/3x ref stride. Rows are paired
// into 128-bit vectors (vr3 = src rows 0-1, vr7 = src rows 2-3, same for
// ref in vr4/vr8); two vabsd.bu + widening adds are summed and reduced
// to the final SAD in a0.
function_x264 pixel_sad_8x4_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.d vr3, vr5, vr3
vilvl.d vr7, vr9, vr7
vilvl.d vr4, vr6, vr4
vilvl.d vr8, vr10, vr8
vabsd.bu vr11, vr3, vr4
vabsd.bu vr12, vr7, vr8
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
// Combine both row pairs and reduce to a single 32-bit SAD
vadd.h vr6, vr11, vr12
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.du.wu vr6, vr6, vr6
vhaddw.qu.du vr6, vr6, vr6
vpickve2gr.wu a0, vr6, 0
endfunc_x264
/* int32_t x264_pixel_sad_8x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 8x8 SAD between one source and one reference block (LSX).
// Two 4-row passes of the pixel_sad_8x4_lsx scheme: pass one leaves its
// 16-bit partial sums in vr13; both pointers then advance by 4 rows
// (alsl.d ptr, stride, ptr, 2) and pass two accumulates into vr6 before
// the final reduction. SAD returned in a0.
function_x264 pixel_sad_8x8_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.d vr3, vr5, vr3
vilvl.d vr7, vr9, vr7
vilvl.d vr4, vr6, vr4
vilvl.d vr8, vr10, vr8
vabsd.bu vr11, vr3, vr4
vabsd.bu vr12, vr7, vr8
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vadd.h vr13, vr11, vr12
// Advance src/ref by 4 rows
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.d vr3, vr5, vr3
vilvl.d vr7, vr9, vr7
vilvl.d vr4, vr6, vr4
vilvl.d vr8, vr10, vr8
vabsd.bu vr11, vr3, vr4
vabsd.bu vr12, vr7, vr8
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vadd.h vr6, vr11, vr12
vadd.h vr6, vr6, vr13
// Reduce accumulated 16-bit sums to a single 32-bit SAD
vhaddw.wu.hu vr6, vr6, vr6
vhaddw.du.wu vr6, vr6, vr6
vhaddw.qu.du vr6, vr6, vr6
vpickve2gr.wu a0, vr6, 0
endfunc_x264
/* int32_t x264_pixel_sad_8x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 8x16 SAD between one source and one reference block (LSX).
// First 4-row pass initializes the 16-bit accumulator vr13; .rept unrolls
// three more passes, each advancing both pointers by 4 rows and adding
// into vr13. Final widening reduction leaves the SAD in a0.
function_x264 pixel_sad_8x16_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.d vr3, vr5, vr3
vilvl.d vr7, vr9, vr7
vilvl.d vr4, vr6, vr4
vilvl.d vr8, vr10, vr8
vabsd.bu vr11, vr3, vr4
vabsd.bu vr12, vr7, vr8
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vadd.h vr13, vr11, vr12
// Three more identical 4-row passes, accumulating into vr13
.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9
FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10
vilvl.d vr3, vr5, vr3
vilvl.d vr7, vr9, vr7
vilvl.d vr4, vr6, vr4
vilvl.d vr8, vr10, vr8
vabsd.bu vr11, vr3, vr4
vabsd.bu vr12, vr7, vr8
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vadd.h vr14, vr11, vr12
vadd.h vr13, vr13, vr14
.endr
// Reduce accumulator to a single 32-bit SAD
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu a0, vr13, 0
endfunc_x264
/* int32_t x264_pixel_sad_16x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 16x8 SAD between one source and one reference block (LSX).
// Each pass loads 4 full 16-byte rows of src (vr0..vr3) and ref (vr4..vr7),
// takes per-row byte absolute differences, widens to 16 bits, and sums
// into vr14 (pass 1) / vr12 (pass 2). Both passes are combined and reduced
// to the final SAD in a0.
function_x264 pixel_sad_16x8_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7
vabsd.bu vr8, vr0, vr4
vabsd.bu vr9, vr1, vr5
vabsd.bu vr10, vr2, vr6
vabsd.bu vr11, vr3, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vadd.h vr8, vr8, vr9
vadd.h vr9, vr10, vr11
vadd.h vr14, vr8, vr9
// Advance src/ref by 4 rows
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7
vabsd.bu vr8, vr0, vr4
vabsd.bu vr9, vr1, vr5
vabsd.bu vr10, vr2, vr6
vabsd.bu vr11, vr3, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vadd.h vr8, vr8, vr9
vadd.h vr9, vr10, vr11
vadd.h vr12, vr8, vr9
vadd.h vr13, vr12, vr14
// Reduce accumulated 16-bit sums to a single 32-bit SAD
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu a0, vr13, 0
endfunc_x264
/* int32_t x264_pixel_sad_16x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
* uint8_t *p_ref, intptr_t i_ref_stride)
*/
// 16x16 SAD between one source and one reference block (LSX).
// Same 4-rows-per-pass scheme as pixel_sad_16x8_lsx: the first pass
// initializes the 16-bit accumulator vr13, then .rept unrolls three more
// passes, each advancing both pointers by 4 rows and adding into vr13.
// Final widening reduction leaves the SAD in a0.
function_x264 pixel_sad_16x16_lsx
slli.d t1, a1, 1
slli.d t2, a3, 1
add.d t3, a1, t1
add.d t4, a3, t2
LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7
vabsd.bu vr8, vr0, vr4
vabsd.bu vr9, vr1, vr5
vabsd.bu vr10, vr2, vr6
vabsd.bu vr11, vr3, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vadd.h vr8, vr8, vr9
vadd.h vr9, vr10, vr11
vadd.h vr13, vr8, vr9
// Three more identical 4-row passes, accumulating into vr13
.rept 3
alsl.d a0, a1, a0, 2
alsl.d a2, a3, a2, 2
LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3
LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7
vabsd.bu vr8, vr0, vr4
vabsd.bu vr9, vr1, vr5
vabsd.bu vr10, vr2, vr6
vabsd.bu vr11, vr3, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vadd.h vr8, vr8, vr9
vadd.h vr9, vr10, vr11
vadd.h vr12, vr8, vr9
vadd.h vr13, vr12, vr13
.endr
// Reduce accumulator to a single 32-bit SAD
vhaddw.wu.hu vr13, vr13, vr13
vhaddw.du.wu vr13, vr13, vr13
vhaddw.qu.du vr13, vr13, vr13
vpickve2gr.wu a0, vr13, 0
endfunc_x264
/*
* void x264_pixel_sad_x3_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
// SAD of one 4x8 source block against three 4x8 references (LSX).
// t1 = 2*i_ref_stride, t2 = 3*i_ref_stride. Two 4-row passes; each pass
// packs 16 bytes of src (vr3) and of each ref (vr4/vr5/vr6) into single
// vectors and takes byte absolute differences (pass 1 kept in vr0/vr1/vr2).
// Source rows read at offsets stepping by 16 bytes (stride 16 assumed).
function_x264 pixel_sad_x3_4x8_lsx
slli.d t1, a4, 1
add.d t2, a4, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
fld.s f3, a0, 0
fld.s f7, a0, 16
fld.s f11, a0, 32
fld.s f15, a0, 48
FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
// Interleave rows 0-1 and 2-3, then merge to 4 rows per vector
vilvl.w vr3, vr7, vr3
vilvl.w vr4, vr8, vr4
vilvl.w vr5, vr9, vr5
vilvl.w vr6, vr10, vr6
vilvl.w vr11, vr15, vr11
vilvl.w vr12, vr16, vr12
vilvl.w vr13, vr17, vr13
vilvl.w vr14, vr18, vr14
vilvl.d vr3, vr11, vr3
vilvl.d vr4, vr12, vr4
vilvl.d vr5, vr13, vr5
vilvl.d vr6, vr14, vr6
// Pass-1 absolute differences, kept for later accumulation
vabsd.bu vr0, vr3, vr4
vabsd.bu vr1, vr3, vr5
vabsd.bu vr2, vr3, vr6
// Advance all three reference pointers by 4 rows
alsl.d a1, a4, a1, 2
alsl.d a2, a4, a2, 2
alsl.d a3, a4, a3, 2
fld.s f3, a0, 64
fld.s f7, a0, 80
fld.s f11, a0, 96
fld.s f15, a0, 112
FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
vilvl.w vr3, vr7, vr3
vilvl.w vr4, vr8, vr4
vilvl.w vr5, vr9, vr5
vilvl.w vr6, vr10, vr6
vilvl.w vr11, vr15, vr11
vilvl.w vr12, vr16, vr12
vilvl.w vr13, vr17, vr13
vilvl.w vr14, vr18, vr14
vilvl.d vr3, vr11, vr3
vilvl.d vr4, vr12, vr4
vilvl.d vr5, vr13, vr5
vilvl.d vr6, vr14, vr6
vabsd.bu vr7, vr3, vr4
vabsd.bu vr8, vr3, vr5
vabsd.bu vr9, vr3, vr6
// Widen both passes to 16-bit sums, combine, and reduce per ref
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.hu.bu vr1, vr1, vr1
vhaddw.hu.bu vr2, vr2, vr2
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vadd.h vr7, vr7, vr0
vadd.h vr8, vr8, vr1
vadd.h vr9, vr9, vr2
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.du.wu vr7, vr7, vr7
vhaddw.du.wu vr8, vr8, vr8
vhaddw.du.wu vr9, vr9, vr9
vhaddw.qu.du vr7, vr7, vr7
vhaddw.qu.du vr8, vr8, vr8
vhaddw.qu.du vr9, vr9, vr9
// Store data to p_sad_array
vstelm.w vr7, a5, 0, 0
vstelm.w vr8, a5, 4, 0
vstelm.w vr9, a5, 8, 0
endfunc_x264
/*
* void x264_pixel_sad_x3_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
function_x264 pixel_sad_x3_8x4_lsx
// SAD of an 8x4 fenc block against three 8x4 reference blocks.
// In:  a0 = p_src (fenc; rows 16 bytes apart -- offsets 0/16/32/48),
//      a1..a3 = p_ref0..p_ref2, a4 = i_ref_stride, a5 = p_sad_array[3].
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a4, 1
add.d t2, a4, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
// fld.d loads one 8-byte row into the low half of the aliased vector reg.
fld.d f3, a0, 0
fld.d f7, a0, 16
fld.d f11, a0, 32
fld.d f15, a0, 48
FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
// Pack rows pairwise: {row0,row1} and {row2,row3} per source.
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vilvl.d vr5, vr9, vr5
vilvl.d vr6, vr10, vr6
vilvl.d vr11, vr15, vr11
vilvl.d vr12, vr16, vr12
vilvl.d vr13, vr17, vr13
vilvl.d vr14, vr18, vr14
// |src - refN| for rows 0-1 (vr0..vr2) and rows 2-3 (vr3..vr5).
vabsd.bu vr0, vr3, vr4
vabsd.bu vr1, vr3, vr5
vabsd.bu vr2, vr3, vr6
vabsd.bu vr3, vr11, vr12
vabsd.bu vr4, vr11, vr13
vabsd.bu vr5, vr11, vr14
// Widen u8 -> u16 pairwise, then merge the two row pairs per ref.
vhaddw.hu.bu vr0, vr0, vr0
vhaddw.hu.bu vr1, vr1, vr1
vhaddw.hu.bu vr2, vr2, vr2
vhaddw.hu.bu vr3, vr3, vr3
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vadd.h vr7, vr0, vr3
vadd.h vr8, vr1, vr4
vadd.h vr9, vr2, vr5
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.du.wu vr7, vr7, vr7
vhaddw.du.wu vr8, vr8, vr8
vhaddw.du.wu vr9, vr9, vr9
vhaddw.qu.du vr7, vr7, vr7
vhaddw.qu.du vr8, vr8, vr8
vhaddw.qu.du vr9, vr9, vr9
// Store data to p_sad_array
vstelm.w vr7, a5, 0, 0
vstelm.w vr8, a5, 4, 0
vstelm.w vr9, a5, 8, 0
endfunc_x264
/*
* void x264_pixel_sad_x3_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
function_x264 pixel_sad_x3_8x8_lsx
// SAD of an 8x8 fenc block against three 8x8 reference blocks.
// In:  a0 = p_src (fenc; rows 16 bytes apart), a1..a3 = p_ref0..p_ref2,
//      a4 = i_ref_stride, a5 = p_sad_array[3].
// Processes rows 0-3, then rows 4-7, accumulating in vr0..vr2.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a4, 1
add.d t2, a4, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
fld.d f3, a0, 0
fld.d f7, a0, 16
fld.d f11, a0, 32
fld.d f15, a0, 48
FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
// Pack 8-byte rows pairwise into full vectors per source.
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vilvl.d vr5, vr9, vr5
vilvl.d vr6, vr10, vr6
vilvl.d vr11, vr15, vr11
vilvl.d vr12, vr16, vr12
vilvl.d vr13, vr17, vr13
vilvl.d vr14, vr18, vr14
// |src - refN| for rows 0-1 and rows 2-3, per reference.
vabsd.bu vr7, vr3, vr4
vabsd.bu vr8, vr3, vr5
vabsd.bu vr9, vr3, vr6
vabsd.bu vr10, vr11, vr12
vabsd.bu vr15, vr11, vr13
vabsd.bu vr16, vr11, vr14
// Widen u8 -> u16 pairwise; vr0..vr2 hold per-ref partial sums.
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vadd.h vr0, vr7, vr10
vadd.h vr1, vr8, vr15
vadd.h vr2, vr9, vr16
// Advance each ref pointer by 4 rows; src continues at offset 64.
alsl.d a1, a4, a1, 2
alsl.d a2, a4, a2, 2
alsl.d a3, a4, a3, 2
fld.d f3, a0, 64
fld.d f7, a0, 80
fld.d f11, a0, 96
fld.d f15, a0, 112
FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vilvl.d vr5, vr9, vr5
vilvl.d vr6, vr10, vr6
vilvl.d vr11, vr15, vr11
vilvl.d vr12, vr16, vr12
vilvl.d vr13, vr17, vr13
vilvl.d vr14, vr18, vr14
// Rows 4-7 absolute differences and widening.
vabsd.bu vr7, vr3, vr4
vabsd.bu vr8, vr3, vr5
vabsd.bu vr9, vr3, vr6
vabsd.bu vr10, vr11, vr12
vabsd.bu vr15, vr11, vr13
vabsd.bu vr16, vr11, vr14
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vadd.h vr7, vr7, vr10
vadd.h vr8, vr8, vr15
vadd.h vr9, vr9, vr16
// Add the rows 0-3 partial sums accumulated above.
vadd.h vr7, vr7, vr0
vadd.h vr8, vr8, vr1
vadd.h vr9, vr9, vr2
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr7, vr7, vr7
vhaddw.wu.hu vr8, vr8, vr8
vhaddw.wu.hu vr9, vr9, vr9
vhaddw.du.wu vr7, vr7, vr7
vhaddw.du.wu vr8, vr8, vr8
vhaddw.du.wu vr9, vr9, vr9
vhaddw.qu.du vr7, vr7, vr7
vhaddw.qu.du vr8, vr8, vr8
vhaddw.qu.du vr9, vr9, vr9
// Store data to p_sad_array
vstelm.w vr7, a5, 0, 0
vstelm.w vr8, a5, 4, 0
vstelm.w vr9, a5, 8, 0
endfunc_x264
/*
* void x264_pixel_sad_x3_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
function_x264 pixel_sad_x3_8x16_lsx
// SAD of an 8x16 fenc block against three 8x16 reference blocks.
// In:  a0 = p_src (fenc; rows 16 bytes apart), a1..a3 = p_ref0..p_ref2,
//      a4 = i_ref_stride, a5 = p_sad_array[3].
// First group of 4 rows below, then three more groups via .rept;
// vr0..vr2 are the per-reference u16 accumulators.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a4, 1
add.d t2, a4, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
fld.d f3, a0, 0
fld.d f7, a0, 16
fld.d f11, a0, 32
fld.d f15, a0, 48
FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
// Pack 8-byte rows pairwise into full vectors per source.
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vilvl.d vr5, vr9, vr5
vilvl.d vr6, vr10, vr6
vilvl.d vr11, vr15, vr11
vilvl.d vr12, vr16, vr12
vilvl.d vr13, vr17, vr13
vilvl.d vr14, vr18, vr14
// |src - refN| for rows 0-1 and rows 2-3, widened to u16.
vabsd.bu vr7, vr3, vr4
vabsd.bu vr8, vr3, vr5
vabsd.bu vr9, vr3, vr6
vabsd.bu vr10, vr11, vr12
vabsd.bu vr15, vr11, vr13
vabsd.bu vr16, vr11, vr14
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vadd.h vr0, vr7, vr10
vadd.h vr1, vr8, vr15
vadd.h vr2, vr9, vr16
// Remaining 12 rows: three groups of four.
.rept 3
// Advance ref pointers by 4 rows and src by 4 rows (4 * 16 bytes).
alsl.d a1, a4, a1, 2
alsl.d a2, a4, a2, 2
alsl.d a3, a4, a3, 2
addi.d a0, a0, 64
fld.d f3, a0, 0
fld.d f7, a0, 16
fld.d f11, a0, 32
fld.d f15, a0, 48
FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18
vilvl.d vr3, vr7, vr3
vilvl.d vr4, vr8, vr4
vilvl.d vr5, vr9, vr5
vilvl.d vr6, vr10, vr6
vilvl.d vr11, vr15, vr11
vilvl.d vr12, vr16, vr12
vilvl.d vr13, vr17, vr13
vilvl.d vr14, vr18, vr14
vabsd.bu vr7, vr3, vr4
vabsd.bu vr8, vr3, vr5
vabsd.bu vr9, vr3, vr6
vabsd.bu vr10, vr11, vr12
vabsd.bu vr15, vr11, vr13
vabsd.bu vr16, vr11, vr14
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vadd.h vr7, vr7, vr10
vadd.h vr8, vr8, vr15
vadd.h vr9, vr9, vr16
// Fold this group into the running per-ref accumulators.
vadd.h vr0, vr7, vr0
vadd.h vr1, vr8, vr1
vadd.h vr2, vr9, vr2
.endr
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.wu.hu vr1, vr1, vr1
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.du.wu vr0, vr0, vr0
vhaddw.du.wu vr1, vr1, vr1
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr0, vr0, vr0
vhaddw.qu.du vr1, vr1, vr1
vhaddw.qu.du vr2, vr2, vr2
// Store data to p_sad_array
vstelm.w vr0, a5, 0, 0
vstelm.w vr1, a5, 4, 0
vstelm.w vr2, a5, 8, 0
endfunc_x264
/*
* void x264_pixel_sad_x3_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
function_x264 pixel_sad_x3_16x8_lsx
// SAD of a 16x8 fenc block against three 16x8 reference blocks.
// In:  a0 = p_src (fenc; full 16-byte rows at offsets 0/16/32/...),
//      a1..a3 = p_ref0..p_ref2, a4 = i_ref_stride, a5 = p_sad_array[3].
// Rows 0-3 first, accumulators vr16..vr18; then rows 4-7.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a4, 1
add.d t2, a4, t1
// src rows 0-3 -> vr0..vr3; ref rows for refN go to vr{4+N}, vr{7+N}, ...
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
// Per-byte |src - refN| for each row / reference pair.
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr1, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr2, vr10
vabsd.bu vr11, vr2, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr3, vr13
vabsd.bu vr14, vr3, vr14
vabsd.bu vr15, vr3, vr15
// Widen u8 -> u16 pairwise to avoid overflow in the row sums.
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
// Sum the four rows per reference: vr16 = ref0, vr17 = ref1, vr18 = ref2.
vadd.h vr0, vr7, vr4
vadd.h vr1, vr13, vr10
vadd.h vr16, vr1, vr0
vadd.h vr0, vr8, vr5
vadd.h vr1, vr14, vr11
vadd.h vr17, vr1, vr0
vadd.h vr0, vr9, vr6
vadd.h vr1, vr15, vr12
vadd.h vr18, vr1, vr0
// vr16, vr17, vr18
// Advance ref pointers by 4 rows; src rows 4-7 are at offsets 64..112.
alsl.d a1, a4, a1, 2
alsl.d a2, a4, a2, 2
alsl.d a3, a4, a3, 2
vld vr0, a0, 64
vld vr1, a0, 80
vld vr2, a0, 96
vld vr3, a0, 112
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr1, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr2, vr10
vabsd.bu vr11, vr2, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr3, vr13
vabsd.bu vr14, vr3, vr14
vabsd.bu vr15, vr3, vr15
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vadd.h vr0, vr7, vr4
vadd.h vr1, vr13, vr10
vadd.h vr2, vr1, vr0
vadd.h vr0, vr8, vr5
vadd.h vr1, vr14, vr11
vadd.h vr3, vr1, vr0
vadd.h vr0, vr9, vr6
vadd.h vr1, vr15, vr12
vadd.h vr4, vr1, vr0
// Combine rows 0-3 (vr16..vr18) with rows 4-7 (vr2..vr4).
vadd.h vr0, vr16, vr2
vadd.h vr1, vr17, vr3
vadd.h vr2, vr18, vr4
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr0, vr0, vr0
vhaddw.wu.hu vr1, vr1, vr1
vhaddw.wu.hu vr2, vr2, vr2
vhaddw.du.wu vr0, vr0, vr0
vhaddw.du.wu vr1, vr1, vr1
vhaddw.du.wu vr2, vr2, vr2
vhaddw.qu.du vr0, vr0, vr0
vhaddw.qu.du vr1, vr1, vr1
vhaddw.qu.du vr2, vr2, vr2
// Store data to p_sad_array
vstelm.w vr0, a5, 0, 0
vstelm.w vr1, a5, 4, 0
vstelm.w vr2, a5, 8, 0
endfunc_x264
/*
* void x264_pixel_sad_x3_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* intptr_t i_ref_stride,
* int32_t p_sad_array[3])
*/
function_x264 pixel_sad_x3_16x16_lsx
// SAD of a 16x16 fenc block against three 16x16 reference blocks.
// In:  a0 = p_src (fenc; 16-byte rows at offsets 0/16/32/...),
//      a1..a3 = p_ref0..p_ref2, a4 = i_ref_stride, a5 = p_sad_array[3].
// Rows 0-3 below seed accumulators vr16..vr18; three more groups of
// four rows follow in the .rept.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a4, 1
add.d t2, a4, t1
// src rows 0-3 -> vr0..vr3; refN rows interleave into vr4..vr15.
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
// Per-byte |src - refN| per row / reference pair.
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr1, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr2, vr10
vabsd.bu vr11, vr2, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr3, vr13
vabsd.bu vr14, vr3, vr14
vabsd.bu vr15, vr3, vr15
// Widen u8 -> u16 pairwise.
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
// Per-reference accumulators: vr16 = ref0, vr17 = ref1, vr18 = ref2.
vadd.h vr0, vr7, vr4
vadd.h vr1, vr13, vr10
vadd.h vr16, vr1, vr0
vadd.h vr0, vr8, vr5
vadd.h vr1, vr14, vr11
vadd.h vr17, vr1, vr0
vadd.h vr0, vr9, vr6
vadd.h vr1, vr15, vr12
vadd.h vr18, vr1, vr0
// Remaining 12 rows: three groups of four.
.rept 3
// Advance ref pointers by 4 rows, src by 4 rows (4 * 16 bytes).
alsl.d a1, a4, a1, 2
alsl.d a2, a4, a2, 2
alsl.d a3, a4, a3, 2
addi.d a0, a0, 64
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13
LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14
LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr1, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr2, vr10
vabsd.bu vr11, vr2, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr3, vr13
vabsd.bu vr14, vr3, vr14
vabsd.bu vr15, vr3, vr15
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vadd.h vr0, vr7, vr4
vadd.h vr1, vr13, vr10
vadd.h vr2, vr1, vr0
vadd.h vr0, vr8, vr5
vadd.h vr1, vr14, vr11
vadd.h vr3, vr1, vr0
vadd.h vr0, vr9, vr6
vadd.h vr1, vr15, vr12
vadd.h vr4, vr1, vr0
// Fold this group into the running per-ref accumulators.
vadd.h vr16, vr16, vr2
vadd.h vr17, vr17, vr3
vadd.h vr18, vr18, vr4
.endr
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr16, vr16, vr16
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.wu.hu vr18, vr18, vr18
vhaddw.du.wu vr16, vr16, vr16
vhaddw.du.wu vr17, vr17, vr17
vhaddw.du.wu vr18, vr18, vr18
vhaddw.qu.du vr16, vr16, vr16
vhaddw.qu.du vr17, vr17, vr17
vhaddw.qu.du vr18, vr18, vr18
// Store data to p_sad_array
vstelm.w vr16, a5, 0, 0
vstelm.w vr17, a5, 4, 0
vstelm.w vr18, a5, 8, 0
endfunc_x264
/*
* void x264_pixel_sad_x4_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
function_x264 pixel_sad_x4_4x8_lsx
// SAD of a 4x8 fenc block against four 4x8 reference blocks.
// In:  a0 = p_src (fenc; 4-byte rows, 16 bytes apart), a1..a4 = p_ref0..
//      p_ref3, a5 = i_ref_stride, a6 = p_sad_array[4].
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a5, 1
add.d t2, a5, t1
// fld.s loads one 4-byte row into the low word of the aliased vector reg.
fld.s f0, a0, 0
fld.s f1, a0, 16
fld.s f2, a0, 32
fld.s f3, a0, 48
FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16
FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17
FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18
FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19
// Pack four 4-byte rows per source into one vector:
// vr0 = src rows 0-3, vr1..vr4 = ref0..ref3 rows 0-3.
vilvl.w vr0, vr1, vr0
vilvl.w vr2, vr3, vr2
vilvl.d vr0, vr2, vr0
vilvl.w vr4, vr8, vr4
vilvl.w vr12, vr16, vr12
vilvl.d vr1, vr12, vr4
vilvl.w vr5, vr9, vr5
vilvl.w vr13, vr17, vr13
vilvl.d vr2, vr13, vr5
vilvl.w vr6, vr10, vr6
vilvl.w vr14, vr18, vr14
vilvl.d vr3, vr14, vr6
vilvl.w vr7, vr11, vr7
vilvl.w vr15, vr19, vr15
vilvl.d vr4, vr15, vr7
// |src - refN| for rows 0-3, widened into accumulators vr20..vr23.
vabsd.bu vr1, vr0, vr1
vabsd.bu vr2, vr0, vr2
vabsd.bu vr3, vr0, vr3
vabsd.bu vr4, vr0, vr4
vhaddw.hu.bu vr20, vr1, vr1
vhaddw.hu.bu vr21, vr2, vr2
vhaddw.hu.bu vr22, vr3, vr3
vhaddw.hu.bu vr23, vr4, vr4
// Advance ref pointers by 4 rows; src rows 4-7 are at offsets 64..112.
alsl.d a1, a5, a1, 2
alsl.d a2, a5, a2, 2
alsl.d a3, a5, a3, 2
alsl.d a4, a5, a4, 2
fld.s f0, a0, 64
fld.s f1, a0, 80
fld.s f2, a0, 96
fld.s f3, a0, 112
FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16
FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17
FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18
FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19
vilvl.w vr0, vr1, vr0
vilvl.w vr2, vr3, vr2
vilvl.d vr0, vr2, vr0
vilvl.w vr4, vr8, vr4
vilvl.w vr12, vr16, vr12
vilvl.d vr1, vr12, vr4
vilvl.w vr5, vr9, vr5
vilvl.w vr13, vr17, vr13
vilvl.d vr2, vr13, vr5
vilvl.w vr6, vr10, vr6
vilvl.w vr14, vr18, vr14
vilvl.d vr3, vr14, vr6
vilvl.w vr7, vr11, vr7
vilvl.w vr15, vr19, vr15
vilvl.d vr4, vr15, vr7
// Rows 4-7 absolute differences, widened and added to the accumulators.
vabsd.bu vr1, vr0, vr1
vabsd.bu vr2, vr0, vr2
vabsd.bu vr3, vr0, vr3
vabsd.bu vr4, vr0, vr4
vhaddw.hu.bu vr1, vr1, vr1
vhaddw.hu.bu vr2, vr2, vr2
vhaddw.hu.bu vr3, vr3, vr3
vhaddw.hu.bu vr4, vr4, vr4
vadd.h vr16, vr20, vr1
vadd.h vr17, vr21, vr2
vadd.h vr18, vr22, vr3
vadd.h vr19, vr23, vr4
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr16, vr16, vr16
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.wu.hu vr18, vr18, vr18
vhaddw.wu.hu vr19, vr19, vr19
vhaddw.du.wu vr16, vr16, vr16
vhaddw.du.wu vr17, vr17, vr17
vhaddw.du.wu vr18, vr18, vr18
vhaddw.du.wu vr19, vr19, vr19
vhaddw.qu.du vr16, vr16, vr16
vhaddw.qu.du vr17, vr17, vr17
vhaddw.qu.du vr18, vr18, vr18
vhaddw.qu.du vr19, vr19, vr19
// Store data to p_sad_array
vstelm.w vr16, a6, 0, 0
vstelm.w vr17, a6, 4, 0
vstelm.w vr18, a6, 8, 0
vstelm.w vr19, a6, 12, 0
endfunc_x264
/*
* void x264_pixel_sad_x4_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
function_x264 pixel_sad_x4_8x4_lsx
// SAD of an 8x4 fenc block against four 8x4 reference blocks.
// In:  a0 = p_src (fenc; 8-byte rows, 16 bytes apart), a1..a4 = p_ref0..
//      p_ref3, a5 = i_ref_stride, a6 = p_sad_array[4].
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a5, 1
add.d t2, a5, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
fld.d f0, a0, 0
fld.d f1, a0, 16
fld.d f2, a0, 32
fld.d f3, a0, 48
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19
// Pack 8-byte rows pairwise: vr0 = src rows 0-1, vr2 = src rows 2-3;
// vr4..vr7 = refN rows 0-1, vr12..vr15 = refN rows 2-3.
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vilvl.d vr4, vr8, vr4
vilvl.d vr12, vr16, vr12
vilvl.d vr5, vr9, vr5
vilvl.d vr13, vr17, vr13
vilvl.d vr6, vr10, vr6
vilvl.d vr14, vr18, vr14
vilvl.d vr7, vr11, vr7
vilvl.d vr15, vr19, vr15
// Per-byte |src - refN| for both row pairs.
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
// Widen u8 -> u16 pairwise, then combine row pairs per reference.
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vadd.h vr16, vr4, vr12
vadd.h vr17, vr5, vr13
vadd.h vr18, vr6, vr14
vadd.h vr19, vr7, vr15
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr16, vr16, vr16
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.wu.hu vr18, vr18, vr18
vhaddw.wu.hu vr19, vr19, vr19
vhaddw.du.wu vr16, vr16, vr16
vhaddw.du.wu vr17, vr17, vr17
vhaddw.du.wu vr18, vr18, vr18
vhaddw.du.wu vr19, vr19, vr19
vhaddw.qu.du vr16, vr16, vr16
vhaddw.qu.du vr17, vr17, vr17
vhaddw.qu.du vr18, vr18, vr18
vhaddw.qu.du vr19, vr19, vr19
// Store data to p_sad_array
vstelm.w vr16, a6, 0, 0
vstelm.w vr17, a6, 4, 0
vstelm.w vr18, a6, 8, 0
vstelm.w vr19, a6, 12, 0
endfunc_x264
/*
* void x264_pixel_sad_x4_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
function_x264 pixel_sad_x4_8x8_lsx
// SAD of an 8x8 fenc block against four 8x8 reference blocks.
// In:  a0 = p_src (fenc; 8-byte rows, 16 bytes apart), a1..a4 = p_ref0..
//      p_ref3, a5 = i_ref_stride, a6 = p_sad_array[4].
// Rows 0-3 seed accumulators vr20..vr23; rows 4-7 are added afterwards.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a5, 1
add.d t2, a5, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
fld.d f0, a0, 0
fld.d f1, a0, 16
fld.d f2, a0, 32
fld.d f3, a0, 48
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19
// Pack 8-byte rows pairwise per source (rows 0-1 and rows 2-3).
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vilvl.d vr4, vr8, vr4
vilvl.d vr12, vr16, vr12
vilvl.d vr5, vr9, vr5
vilvl.d vr13, vr17, vr13
vilvl.d vr6, vr10, vr6
vilvl.d vr14, vr18, vr14
vilvl.d vr7, vr11, vr7
vilvl.d vr15, vr19, vr15
// |src - refN| for both row pairs, widened to u16 and accumulated.
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vadd.h vr20, vr4, vr12
vadd.h vr21, vr5, vr13
vadd.h vr22, vr6, vr14
vadd.h vr23, vr7, vr15
// Advance ref pointers by 4 rows; src rows 4-7 at offsets 64..112.
alsl.d a1, a5, a1, 2
alsl.d a2, a5, a2, 2
alsl.d a3, a5, a3, 2
alsl.d a4, a5, a4, 2
fld.d f0, a0, 64
fld.d f1, a0, 80
fld.d f2, a0, 96
fld.d f3, a0, 112
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vilvl.d vr4, vr8, vr4
vilvl.d vr12, vr16, vr12
vilvl.d vr5, vr9, vr5
vilvl.d vr13, vr17, vr13
vilvl.d vr6, vr10, vr6
vilvl.d vr14, vr18, vr14
vilvl.d vr7, vr11, vr7
vilvl.d vr15, vr19, vr15
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vadd.h vr16, vr4, vr12
vadd.h vr17, vr5, vr13
vadd.h vr18, vr6, vr14
vadd.h vr19, vr7, vr15
// Fold rows 4-7 into the rows 0-3 accumulators.
vadd.h vr16, vr16, vr20
vadd.h vr17, vr17, vr21
vadd.h vr18, vr18, vr22
vadd.h vr19, vr19, vr23
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr16, vr16, vr16
vhaddw.wu.hu vr17, vr17, vr17
vhaddw.wu.hu vr18, vr18, vr18
vhaddw.wu.hu vr19, vr19, vr19
vhaddw.du.wu vr16, vr16, vr16
vhaddw.du.wu vr17, vr17, vr17
vhaddw.du.wu vr18, vr18, vr18
vhaddw.du.wu vr19, vr19, vr19
vhaddw.qu.du vr16, vr16, vr16
vhaddw.qu.du vr17, vr17, vr17
vhaddw.qu.du vr18, vr18, vr18
vhaddw.qu.du vr19, vr19, vr19
// Store data to p_sad_array
vstelm.w vr16, a6, 0, 0
vstelm.w vr17, a6, 4, 0
vstelm.w vr18, a6, 8, 0
vstelm.w vr19, a6, 12, 0
endfunc_x264
/*
* void x264_pixel_sad_x4_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
function_x264 pixel_sad_x4_8x16_lsx
// SAD of an 8x16 fenc block against four 8x16 reference blocks.
// In:  a0 = p_src (fenc; 8-byte rows, 16 bytes apart), a1..a4 = p_ref0..
//      p_ref3, a5 = i_ref_stride, a6 = p_sad_array[4].
// Rows 0-3 seed accumulators vr20..vr23; three more groups of four
// rows follow in the .rept.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a5, 1
add.d t2, a5, t1
// Load data from p_src, p_ref0, p_ref1 and p_ref2
fld.d f0, a0, 0
fld.d f1, a0, 16
fld.d f2, a0, 32
fld.d f3, a0, 48
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19
// Pack 8-byte rows pairwise per source (rows 0-1 and rows 2-3).
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vilvl.d vr4, vr8, vr4
vilvl.d vr12, vr16, vr12
vilvl.d vr5, vr9, vr5
vilvl.d vr13, vr17, vr13
vilvl.d vr6, vr10, vr6
vilvl.d vr14, vr18, vr14
vilvl.d vr7, vr11, vr7
vilvl.d vr15, vr19, vr15
// |src - refN|, widened to u16, summed into vr20..vr23.
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vadd.h vr20, vr4, vr12
vadd.h vr21, vr5, vr13
vadd.h vr22, vr6, vr14
vadd.h vr23, vr7, vr15
// Remaining 12 rows: three groups of four.
.rept 3
// Advance ref pointers by 4 rows, src by 4 rows (4 * 16 bytes).
alsl.d a1, a5, a1, 2
alsl.d a2, a5, a2, 2
alsl.d a3, a5, a3, 2
alsl.d a4, a5, a4, 2
addi.d a0, a0, 64
fld.d f0, a0, 0
fld.d f1, a0, 16
fld.d f2, a0, 32
fld.d f3, a0, 48
FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16
FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17
FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18
FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vilvl.d vr4, vr8, vr4
vilvl.d vr12, vr16, vr12
vilvl.d vr5, vr9, vr5
vilvl.d vr13, vr17, vr13
vilvl.d vr6, vr10, vr6
vilvl.d vr14, vr18, vr14
vilvl.d vr7, vr11, vr7
vilvl.d vr15, vr19, vr15
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vadd.h vr16, vr4, vr12
vadd.h vr17, vr5, vr13
vadd.h vr18, vr6, vr14
vadd.h vr19, vr7, vr15
// Fold this group into the running per-ref accumulators.
vadd.h vr20, vr16, vr20
vadd.h vr21, vr17, vr21
vadd.h vr22, vr18, vr22
vadd.h vr23, vr19, vr23
.endr
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr20, vr20, vr20
vhaddw.wu.hu vr21, vr21, vr21
vhaddw.wu.hu vr22, vr22, vr22
vhaddw.wu.hu vr23, vr23, vr23
vhaddw.du.wu vr20, vr20, vr20
vhaddw.du.wu vr21, vr21, vr21
vhaddw.du.wu vr22, vr22, vr22
vhaddw.du.wu vr23, vr23, vr23
vhaddw.qu.du vr20, vr20, vr20
vhaddw.qu.du vr21, vr21, vr21
vhaddw.qu.du vr22, vr22, vr22
vhaddw.qu.du vr23, vr23, vr23
// Store data to p_sad_array
vstelm.w vr20, a6, 0, 0
vstelm.w vr21, a6, 4, 0
vstelm.w vr22, a6, 8, 0
vstelm.w vr23, a6, 12, 0
endfunc_x264
/*
* void x264_pixel_sad_x4_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
function_x264 pixel_sad_x4_16x8_lsx
// SAD of a 16x8 fenc block against four 16x8 reference blocks.
// In:  a0 = p_src (fenc; full 16-byte rows at offsets 0/16/32/...),
//      a1..a4 = p_ref0..p_ref3, a5 = i_ref_stride, a6 = p_sad_array[4].
// Rows 0-3 seed accumulators vr20..vr23; rows 4-7 are added afterwards.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a5, 1
add.d t2, a5, t1
// src rows 0-3 -> vr0..vr3; refN rows 0-3 -> vr{4+N}, vr{8+N}, ...
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16
LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17
LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18
LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19
// Per-byte |src - refN| per row / reference pair.
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr1, vr10
vabsd.bu vr11, vr1, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vabsd.bu vr16, vr3, vr16
vabsd.bu vr17, vr3, vr17
vabsd.bu vr18, vr3, vr18
vabsd.bu vr19, vr3, vr19
// Widen u8 -> u16 pairwise.
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vhaddw.hu.bu vr17, vr17, vr17
vhaddw.hu.bu vr18, vr18, vr18
vhaddw.hu.bu vr19, vr19, vr19
// Sum the four rows per reference: vr20 = ref0 ... vr23 = ref3.
vadd.h vr0, vr4, vr8
vadd.h vr1, vr12, vr16
vadd.h vr20, vr0, vr1
vadd.h vr0, vr5, vr9
vadd.h vr1, vr13, vr17
vadd.h vr21, vr0, vr1
vadd.h vr0, vr6, vr10
vadd.h vr1, vr14, vr18
vadd.h vr22, vr0, vr1
vadd.h vr0, vr7, vr11
vadd.h vr1, vr15, vr19
vadd.h vr23, vr0, vr1
// Advance ref pointers by 4 rows; src rows 4-7 at offsets 64..112.
alsl.d a1, a5, a1, 2
alsl.d a2, a5, a2, 2
alsl.d a3, a5, a3, 2
alsl.d a4, a5, a4, 2
vld vr0, a0, 64
vld vr1, a0, 80
vld vr2, a0, 96
vld vr3, a0, 112
LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16
LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17
LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18
LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr1, vr10
vabsd.bu vr11, vr1, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vabsd.bu vr16, vr3, vr16
vabsd.bu vr17, vr3, vr17
vabsd.bu vr18, vr3, vr18
vabsd.bu vr19, vr3, vr19
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vhaddw.hu.bu vr17, vr17, vr17
vhaddw.hu.bu vr18, vr18, vr18
vhaddw.hu.bu vr19, vr19, vr19
vadd.h vr0, vr4, vr8
vadd.h vr1, vr12, vr16
vadd.h vr16, vr0, vr1
vadd.h vr0, vr5, vr9
vadd.h vr1, vr13, vr17
vadd.h vr17, vr0, vr1
vadd.h vr0, vr6, vr10
vadd.h vr1, vr14, vr18
vadd.h vr18, vr0, vr1
vadd.h vr0, vr7, vr11
vadd.h vr1, vr15, vr19
vadd.h vr19, vr0, vr1
// Fold rows 4-7 (vr16..vr19) into the rows 0-3 accumulators.
vadd.h vr20, vr16, vr20
vadd.h vr21, vr17, vr21
vadd.h vr22, vr18, vr22
vadd.h vr23, vr19, vr23
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr20, vr20, vr20
vhaddw.wu.hu vr21, vr21, vr21
vhaddw.wu.hu vr22, vr22, vr22
vhaddw.wu.hu vr23, vr23, vr23
vhaddw.du.wu vr20, vr20, vr20
vhaddw.du.wu vr21, vr21, vr21
vhaddw.du.wu vr22, vr22, vr22
vhaddw.du.wu vr23, vr23, vr23
vhaddw.qu.du vr20, vr20, vr20
vhaddw.qu.du vr21, vr21, vr21
vhaddw.qu.du vr22, vr22, vr22
vhaddw.qu.du vr23, vr23, vr23
// Store data to p_sad_array
vstelm.w vr20, a6, 0, 0
vstelm.w vr21, a6, 4, 0
vstelm.w vr22, a6, 8, 0
vstelm.w vr23, a6, 12, 0
endfunc_x264
/*
* void x264_pixel_sad_x4_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
* uint8_t *p_ref1, uint8_t *p_ref2,
* uint8_t *p_ref3, intptr_t i_ref_stride,
* int32_t p_sad_array[4])
*/
function_x264 pixel_sad_x4_16x16_lsx
// SAD of a 16x16 fenc block against four 16x16 reference blocks.
// In:  a0 = p_src (fenc; full 16-byte rows at offsets 0/16/32/...),
//      a1..a4 = p_ref0..p_ref3, a5 = i_ref_stride, a6 = p_sad_array[4].
// Rows 0-3 seed accumulators vr20..vr23; three more groups of four
// rows follow in the .rept.
// t1 = 2*ref_stride, t2 = 3*ref_stride.
slli.d t1, a5, 1
add.d t2, a5, t1
// src rows 0-3 -> vr0..vr3; refN rows 0-3 -> vr{4+N}, vr{8+N}, ...
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16
LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17
LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18
LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19
// Per-byte |src - refN| per row / reference pair.
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr1, vr10
vabsd.bu vr11, vr1, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vabsd.bu vr16, vr3, vr16
vabsd.bu vr17, vr3, vr17
vabsd.bu vr18, vr3, vr18
vabsd.bu vr19, vr3, vr19
// Widen u8 -> u16 pairwise.
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vhaddw.hu.bu vr17, vr17, vr17
vhaddw.hu.bu vr18, vr18, vr18
vhaddw.hu.bu vr19, vr19, vr19
// Sum the four rows per reference: vr20 = ref0 ... vr23 = ref3.
vadd.h vr0, vr4, vr8
vadd.h vr1, vr12, vr16
vadd.h vr20, vr0, vr1
vadd.h vr0, vr5, vr9
vadd.h vr1, vr13, vr17
vadd.h vr21, vr0, vr1
vadd.h vr0, vr6, vr10
vadd.h vr1, vr14, vr18
vadd.h vr22, vr0, vr1
vadd.h vr0, vr7, vr11
vadd.h vr1, vr15, vr19
vadd.h vr23, vr0, vr1
// Remaining 12 rows: three groups of four.
.rept 3
// Advance ref pointers by 4 rows, src by 4 rows (4 * 16 bytes).
alsl.d a1, a5, a1, 2
alsl.d a2, a5, a2, 2
alsl.d a3, a5, a3, 2
alsl.d a4, a5, a4, 2
addi.d a0, a0, 64
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16
LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17
LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18
LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19
vabsd.bu vr4, vr0, vr4
vabsd.bu vr5, vr0, vr5
vabsd.bu vr6, vr0, vr6
vabsd.bu vr7, vr0, vr7
vabsd.bu vr8, vr1, vr8
vabsd.bu vr9, vr1, vr9
vabsd.bu vr10, vr1, vr10
vabsd.bu vr11, vr1, vr11
vabsd.bu vr12, vr2, vr12
vabsd.bu vr13, vr2, vr13
vabsd.bu vr14, vr2, vr14
vabsd.bu vr15, vr2, vr15
vabsd.bu vr16, vr3, vr16
vabsd.bu vr17, vr3, vr17
vabsd.bu vr18, vr3, vr18
vabsd.bu vr19, vr3, vr19
vhaddw.hu.bu vr4, vr4, vr4
vhaddw.hu.bu vr5, vr5, vr5
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vhaddw.hu.bu vr17, vr17, vr17
vhaddw.hu.bu vr18, vr18, vr18
vhaddw.hu.bu vr19, vr19, vr19
vadd.h vr0, vr4, vr8
vadd.h vr1, vr12, vr16
vadd.h vr16, vr0, vr1
vadd.h vr0, vr5, vr9
vadd.h vr1, vr13, vr17
vadd.h vr17, vr0, vr1
vadd.h vr0, vr6, vr10
vadd.h vr1, vr14, vr18
vadd.h vr18, vr0, vr1
vadd.h vr0, vr7, vr11
vadd.h vr1, vr15, vr19
vadd.h vr19, vr0, vr1
// Fold this group (vr16..vr19) into the running accumulators.
vadd.h vr20, vr16, vr20
vadd.h vr21, vr17, vr21
vadd.h vr22, vr18, vr22
vadd.h vr23, vr19, vr23
.endr
// Reduce each per-ref accumulator u16 -> u32 -> u64 -> u128.
vhaddw.wu.hu vr20, vr20, vr20
vhaddw.wu.hu vr21, vr21, vr21
vhaddw.wu.hu vr22, vr22, vr22
vhaddw.wu.hu vr23, vr23, vr23
vhaddw.du.wu vr20, vr20, vr20
vhaddw.du.wu vr21, vr21, vr21
vhaddw.du.wu vr22, vr22, vr22
vhaddw.du.wu vr23, vr23, vr23
vhaddw.qu.du vr20, vr20, vr20
vhaddw.qu.du vr21, vr21, vr21
vhaddw.qu.du vr22, vr22, vr22
vhaddw.qu.du vr23, vr23, vr23
// Store data to p_sad_array
vstelm.w vr20, a6, 0, 0
vstelm.w vr21, a6, 4, 0
vstelm.w vr22, a6, 8, 0
vstelm.w vr23, a6, 12, 0
endfunc_x264
#endif /* !HIGH_BIT_DEPTH */