/* (viewer metadata from extraction, kept as a comment: 2586 lines, 96 KiB,
 * LoongArch assembly — the original scrape mislabeled it "ArmAsm") */
/*****************************************************************************
|
|
* sad-a.S: loongarch sad functions
|
|
*****************************************************************************
|
|
* Copyright (C) 2023-2025 x264 project
|
|
*
|
|
* Authors: Lu Wang <wanglu@loongson.cn>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
|
*
|
|
* This program is also available under a commercial proprietary license.
|
|
* For more information, contact us at licensing@x264.com.
|
|
*****************************************************************************/
|
|
|
|
#include "loongson_asm.S"
|
|
#include "loongson_util.S"
|
|
|
|
#if !HIGH_BIT_DEPTH
|
|
|
|
|
|
/* void x264_pixel_sad_x4_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                   uint8_t *p_ref1, uint8_t *p_ref2,
 *                                   uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                   int32_t p_sad_array[4])
 *
 * Four 16x16 SADs of one source block against four reference blocks.
 * a0 = p_src: the source offsets used (0,32,64,...,224) show rows are
 *      packed 16 bytes apart, so one 32-byte xvld fetches two rows.
 * a1..a4 = p_ref0..p_ref3, each strided by a5 = i_ref_stride.
 * a6 = p_sad_array (four int32 results).
 * Strategy: process 4 reference rows per round (8 rounds of 2 rows each),
 * keeping one 16-lane uint16 accumulator per reference in xr12..xr15,
 * then reduce each accumulator to a scalar at the end.
 */
function_x264 pixel_sad_x4_16x16_lasx
    slli.d          t1, a5, 1                  // t1 = 2 * i_ref_stride
    add.d           t2, a5, t1                 // t2 = 3 * i_ref_stride
    slli.d          t3, a5, 2                  // t3 = 4 * i_ref_stride

    // Rows 0-1: load src (two rows per xvld) and row pairs of each ref
    xvld            xr3, a0, 0                 // src rows 0-1
    xvld            xr16, a0, 32               // src rows 2-3 (used below)
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    // Merge each ref's two 128-bit rows into one 256-bit register
    // (0x02: low lane = dst low, high lane = src low)
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Absolute differences, then widen byte pairs to uint16 partial sums;
    // these seed the four per-reference accumulators xr12..xr15
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr12, xr8, xr8
    xvhaddw.hu.bu   xr13, xr9, xr9
    xvhaddw.hu.bu   xr14, xr10, xr10
    xvhaddw.hu.bu   xr15, xr11, xr11

    // Rows 2-3 of each reference (src rows 2-3 already in xr16)
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8            // accumulate into per-ref sums
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Advance all four reference pointers by 4 rows
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Rows 4-5
    xvld            xr3, a0, 64                // src rows 4-5
    xvld            xr16, a0, 96               // src rows 6-7
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Rows 6-7
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Advance refs by another 4 rows
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Rows 8-9
    xvld            xr3, a0, 128               // src rows 8-9
    xvld            xr16, a0, 160              // src rows 10-11
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Rows 10-11
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Advance refs to the final 4-row group
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Rows 12-13
    xvld            xr3, a0, 192               // src rows 12-13
    xvld            xr16, a0, 224              // src rows 14-15
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Rows 14-15
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Final reduction: pair up accumulators (ref0 with ref2, ref1 with
    // ref3) by swapping 128-bit lanes, so two scalar sums can be produced
    // per horizontal-add chain.
    xvori.b         xr17, xr12, 0              // save copies before lane swaps
    xvori.b         xr18, xr13, 0
    xvpermi.q       xr12, xr14, 0x02
    xvpermi.q       xr14, xr17, 0x31
    xvpermi.q       xr13, xr15, 0x02
    xvpermi.q       xr15, xr18, 0x31
    xvadd.h         xr12, xr12, xr14
    xvadd.h         xr13, xr13, xr15
    // Widen h -> w -> d -> q, leaving one 128-bit sum per lane
    xvhaddw.w.h     xr12, xr12, xr12
    xvhaddw.w.h     xr13, xr13, xr13
    xvhaddw.d.w     xr12, xr12, xr12
    xvhaddw.d.w     xr13, xr13, xr13
    xvhaddw.q.d     xr12, xr12, xr12
    xvhaddw.q.d     xr13, xr13, xr13
    xvpackev.w      xr13, xr13, xr12           // gather the four 32-bit SADs

    // Store data to p_sad_array
    xvstelm.d       xr13, a6, 0, 0             // sad[0], sad[1]
    xvstelm.d       xr13, a6, 8, 2             // sad[2], sad[3]
endfunc_x264
|
|
|
|
/* void x264_pixel_sad_x4_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                  uint8_t *p_ref1, uint8_t *p_ref2,
 *                                  uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                  int32_t p_sad_array[4])
 *
 * Four 16x8 SADs of one source block against four reference blocks.
 * Same scheme as the 16x16 variant but only two 4-row groups:
 * a0 = p_src (rows packed 16 bytes apart; one xvld = two rows),
 * a1..a4 = refs strided by a5, a6 = result array.
 * Per-reference uint16 accumulators live in xr12..xr15.
 */
function_x264 pixel_sad_x4_16x8_lasx
    slli.d          t1, a5, 1                  // t1 = 2 * i_ref_stride
    add.d           t2, a5, t1                 // t2 = 3 * i_ref_stride
    slli.d          t3, a5, 2                  // t3 = 4 * i_ref_stride

    // Rows 0-1
    xvld            xr3, a0, 0                 // src rows 0-1
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    // Merge each ref's two 128-bit rows into one 256-bit register
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Absolute differences; widen to uint16 to seed accumulators
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr12, xr8, xr8
    xvhaddw.hu.bu   xr13, xr9, xr9
    xvhaddw.hu.bu   xr14, xr10, xr10
    xvhaddw.hu.bu   xr15, xr11, xr11

    // Rows 2-3
    xvld            xr3, a0, 32                // src rows 2-3
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Advance all four reference pointers by 4 rows
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Rows 4-5
    xvld            xr3, a0, 64                // src rows 4-5
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Rows 6-7
    xvld            xr3, a0, 96                // src rows 6-7
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11

    // Final reduction (identical to the 16x16 variant): pair accumulators
    // via lane swaps, then widen h -> w -> d -> q to scalar sums.
    xvori.b         xr17, xr12, 0
    xvori.b         xr18, xr13, 0
    xvpermi.q       xr12, xr14, 0x02
    xvpermi.q       xr14, xr17, 0x31
    xvpermi.q       xr13, xr15, 0x02
    xvpermi.q       xr15, xr18, 0x31
    xvadd.h         xr12, xr12, xr14
    xvadd.h         xr13, xr13, xr15
    xvhaddw.w.h     xr12, xr12, xr12
    xvhaddw.w.h     xr13, xr13, xr13
    xvhaddw.d.w     xr12, xr12, xr12
    xvhaddw.d.w     xr13, xr13, xr13
    xvhaddw.q.d     xr12, xr12, xr12
    xvhaddw.q.d     xr13, xr13, xr13
    xvpackev.w      xr13, xr13, xr12
    // Store data to p_sad_array
    xvstelm.d       xr13, a6, 0, 0             // sad[0], sad[1]
    xvstelm.d       xr13, a6, 8, 2             // sad[2], sad[3]
endfunc_x264
|
|
|
|
/* void x264_pixel_sad_x4_8x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[4])
 *
 * Four 8x8 SADs of one source block against four reference blocks.
 * a0 = p_src (8-byte rows 16 bytes apart: offsets 0,16,...,112),
 * a1..a4 = refs strided by a5, a6 = result array.
 * Each 256-bit register holds the same row of all four references
 * (one 64-bit row per half-lane); xvldrepl.d broadcasts the matching
 * source row so one xvabsd covers all four refs at once.
 */
function_x264 pixel_sad_x4_8x8_lasx
    slli.d          t1, a5, 1                  // t1 = 2 * i_ref_stride
    add.d           t2, t1, a5                 // t2 = 3 * i_ref_stride
    slli.d          t3, a5, 2                  // t3 = 4 * i_ref_stride

    // Rows 0-3 of each reference (4 doubles per ref via FLDD_LOADX_4)
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f14, f18
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f15, f19
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f16, f20
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f17, f21
    // Interleave 64-bit rows pairwise, then merge the pairs into
    // 256-bit registers: xr4/xr8/xr14/xr18 = row 0/1/2/3 of all refs
    vilvl.d         vr4, vr5, vr4
    vilvl.d         vr6, vr7, vr6
    vilvl.d         vr8, vr9, vr8
    vilvl.d         vr10, vr11, vr10
    vilvl.d         vr14, vr15, vr14
    vilvl.d         vr16, vr17, vr16
    vilvl.d         vr18, vr19, vr18
    vilvl.d         vr20, vr21, vr20
    xvpermi.q       xr4, xr6, 0x02
    xvpermi.q       xr8, xr10, 0x02
    xvpermi.q       xr14, xr16, 0x02
    xvpermi.q       xr18, xr20, 0x02
    // Broadcast each source row, compute |src - ref| per row
    xvldrepl.d      xr3, a0, 0                 // src row 0 x4
    xvabsd.bu       xr5, xr3, xr4
    xvldrepl.d      xr3, a0, 16                // src row 1 x4
    xvabsd.bu       xr9, xr3, xr8
    xvldrepl.d      xr3, a0, 32                // src row 2 x4
    xvabsd.bu       xr10, xr3, xr14
    xvldrepl.d      xr3, a0, 48                // src row 3 x4
    xvabsd.bu       xr11, xr3, xr18
    // Widening even/odd adds fold the four rows into uint16 partial sums
    xvaddwev.h.bu   xr0, xr5, xr9
    xvaddwod.h.bu   xr1, xr5, xr9
    xvaddwev.h.bu   xr2, xr10, xr11
    xvaddwod.h.bu   xr22, xr10, xr11

    // Advance references by 4 rows
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Rows 4-7 of each reference, same layout as above
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f14, f18
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f15, f19
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f16, f20
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f17, f21
    vilvl.d         vr4, vr5, vr4
    vilvl.d         vr6, vr7, vr6
    vilvl.d         vr8, vr9, vr8
    vilvl.d         vr10, vr11, vr10
    vilvl.d         vr14, vr15, vr14
    vilvl.d         vr16, vr17, vr16
    vilvl.d         vr18, vr19, vr18
    vilvl.d         vr20, vr21, vr20
    xvpermi.q       xr4, xr6, 0x02
    xvpermi.q       xr8, xr10, 0x02
    xvpermi.q       xr14, xr16, 0x02
    xvpermi.q       xr18, xr20, 0x02
    // Source rows 4-7 (offsets 64..112), |src - ref| per row
    xvldrepl.d      xr3, a0, 64
    xvabsd.bu       xr5, xr3, xr4
    xvldrepl.d      xr3, a0, 80
    xvabsd.bu       xr9, xr3, xr8
    xvldrepl.d      xr3, a0, 96
    xvabsd.bu       xr10, xr3, xr14
    xvldrepl.d      xr3, a0, 112
    xvabsd.bu       xr11, xr3, xr18
    xvaddwev.h.bu   xr12, xr5, xr9
    xvaddwod.h.bu   xr13, xr5, xr9
    xvaddwev.h.bu   xr14, xr10, xr11
    xvaddwod.h.bu   xr15, xr10, xr11
    // Combine the two 4-row halves, then collapse everything
    xvadd.h         xr5, xr0, xr12
    xvadd.h         xr9, xr1, xr13
    xvadd.h         xr10, xr2, xr14
    xvadd.h         xr11, xr22, xr15
    xvadd.h         xr5, xr5, xr9
    xvadd.h         xr10, xr10, xr11
    xvadd.h         xr10, xr10, xr5
    // Horizontal widen h -> w -> d: one 64-bit sum per 64-bit ref slot
    xvhaddw.wu.hu   xr10, xr10, xr10
    xvhaddw.du.wu   xr10, xr10, xr10
    // Gather the four per-ref sums from both 128-bit lanes into vr10
    xvpermi.q       xr5, xr10, 0x01
    xvpickev.w      xr10, xr5, xr10
    // Store data to p_sad_array
    vst             vr10, a6, 0
endfunc_x264
|
|
|
|
/* void x264_pixel_sad_x4_8x4_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[4])
 *
 * Four 8x4 SADs of one source block against four reference blocks.
 * a0 = p_src (8-byte rows at offsets 0,16,32,48), a1..a4 = refs strided
 * by a5, a6 = result array. All 4 rows of all inputs fit in a handful of
 * 256-bit registers, so the whole block is computed branch-free.
 */
function_x264 pixel_sad_x4_8x4_lasx
    slli.d          t1, a5, 1                  // t1 = 2 * i_ref_stride
    add.d           t2, t1, a5                 // t2 = 3 * i_ref_stride

    // Load the 4 source rows and the 4 rows of each reference
    fld.d           f2, a0, 0                  // src row 0
    fld.d           f3, a0, 16                 // src row 1
    fld.d           f12, a0, 32                // src row 2
    fld.d           f13, a0, 48                // src row 3
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f14, f18
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f15, f19
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f16, f20
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f17, f21

    // Interleave 64-bit rows into 128-bit pairs, then merge pairs into
    // 256-bit registers so xr3 holds all 4 source rows and xr4..xr7 hold
    // the matching row layout for each reference
    vilvl.d         vr3, vr3, vr2              // src rows 0,1
    vilvl.d         vr4, vr8, vr4
    vilvl.d         vr5, vr9, vr5
    vilvl.d         vr6, vr10, vr6
    vilvl.d         vr7, vr11, vr7
    vilvl.d         vr13, vr13, vr12           // src rows 2,3
    vilvl.d         vr14, vr18, vr14
    vilvl.d         vr15, vr19, vr15
    vilvl.d         vr16, vr20, vr16
    vilvl.d         vr17, vr21, vr17
    xvpermi.q       xr3, xr13, 0x02            // src rows 0-3
    xvpermi.q       xr4, xr16, 0x02
    xvpermi.q       xr5, xr17, 0x02
    xvpermi.q       xr6, xr14, 0x02
    xvpermi.q       xr7, xr15, 0x02
    // Absolute differences against each reference combination
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8              // widen to uint16 partial sums
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    // Swap 128-bit halves so matching per-reference partial sums line up
    xvpermi.d       xr10, xr10, 0x4e
    xvpermi.d       xr11, xr11, 0x4e
    xvadd.h         xr8, xr8, xr10
    xvadd.h         xr9, xr9, xr11
    // Horizontal widen h -> w -> d -> q: one scalar sum per 128-bit lane
    xvhaddw.w.h     xr8, xr8, xr8
    xvhaddw.w.h     xr9, xr9, xr9
    xvhaddw.d.w     xr8, xr8, xr8
    xvhaddw.d.w     xr9, xr9, xr9
    xvhaddw.q.d     xr8, xr8, xr8
    xvhaddw.q.d     xr9, xr9, xr9
    xvpackev.w      xr9, xr9, xr8              // gather the four 32-bit SADs

    // Store data to p_sad_array
    xvstelm.d       xr9, a6, 0, 0              // sad[0], sad[1]
    xvstelm.d       xr9, a6, 8, 2              // sad[2], sad[3]
endfunc_x264
|
|
|
|
/* void x264_pixel_sad_x4_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                int32_t p_sad_array[4])
 *
 * Four 4x4 SADs of one source block against four reference blocks,
 * 128-bit LSX only. a0 = p_src (4-byte rows at offsets 0,16,32,48),
 * a1..a4 = refs strided by a5, a6 = result array. A whole 4x4 block
 * (16 bytes) fits one vector register, so each reference needs exactly
 * one vabsd + reduction chain.
 */
function_x264 pixel_sad_x4_4x4_lsx
    slli.d          t0, a5, 1                  // t0 = 2 * i_ref_stride
    add.d           t1, a5, t0                 // t1 = 3 * i_ref_stride
    slli.d          t2, a5, 2                  // t2 = 4 * i_ref_stride (unused below)

    // Rows 0-1 of src and of each reference (one 32-bit word per row)
    fld.s           f2, a0, 0                  // src row 0
    fld.s           f3, a0, 16                 // src row 1
    fld.s           f4, a1, 0
    fldx.s          f8, a1, a5
    fld.s           f5, a2, 0
    fldx.s          f9, a2, a5
    fld.s           f6, a3, 0
    fldx.s          f10, a3, a5
    fld.s           f7, a4, 0
    fldx.s          f11, a4, a5
    // Interleave rows 0 and 1 into the low 64 bits of each register
    vilvl.w         vr3, vr3, vr2
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr7, vr11, vr7

    // Rows 2-3
    fld.s           f2, a0, 32                 // src row 2
    fld.s           f0, a0, 48                 // src row 3
    fldx.s          f8, a1, t0
    fldx.s          f12, a1, t1
    fldx.s          f9, a2, t0
    fldx.s          f13, a2, t1
    fldx.s          f10, a3, t0
    fldx.s          f14, a3, t1
    fldx.s          f11, a4, t0
    fldx.s          f15, a4, t1
    vilvl.w         vr2, vr0, vr2
    vilvl.w         vr8, vr12, vr8
    vilvl.w         vr9, vr13, vr9
    vilvl.w         vr10, vr14, vr10
    vilvl.w         vr11, vr15, vr11
    // Pack rows 0-3 into one 128-bit register per block
    vilvl.d         vr3, vr2, vr3              // src, all 16 bytes
    vilvl.d         vr4, vr8, vr4
    vilvl.d         vr5, vr9, vr5
    vilvl.d         vr6, vr10, vr6
    vilvl.d         vr7, vr11, vr7

    // Absolute differences, one register per reference
    vabsd.bu        vr8, vr3, vr4
    vabsd.bu        vr9, vr3, vr5
    vabsd.bu        vr10, vr3, vr6
    vabsd.bu        vr11, vr3, vr7
    // Horizontal widen bu -> hu -> wu -> du -> qu: scalar sum in lane 0
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.wu.hu    vr10, vr10, vr10
    vhaddw.wu.hu    vr11, vr11, vr11
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.du.wu    vr10, vr10, vr10
    vhaddw.du.wu    vr11, vr11, vr11
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9
    vhaddw.qu.du    vr10, vr10, vr10
    vhaddw.qu.du    vr11, vr11, vr11

    // Store data to p_sad_array
    vstelm.w        vr8, a6, 0, 0              // sad[0]
    vstelm.w        vr9, a6, 4, 0              // sad[1]
    vstelm.w        vr10, a6, 8, 0             // sad[2]
    vstelm.w        vr11, a6, 12, 0            // sad[3]
endfunc_x264
|
|
|
|
/* void x264_pixel_sad_x3_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                   uint8_t *p_ref1, uint8_t *p_ref2,
 *                                   intptr_t i_ref_stride,
 *                                   int32_t p_sad_array[3])
 *
 * Three 16x16 SADs of one source block against three reference blocks.
 * NOTE the shifted argument registers versus the x4 variants:
 * a0 = p_src (rows 16 bytes apart), a1..a3 = refs, a4 = i_ref_stride,
 * a5 = p_sad_array. Four rows are processed per round (two per 256-bit
 * register), with six uint16 accumulators xr16..xr21: one per reference
 * for the even row pairs (xr16..xr18) and one per reference for the odd
 * row pairs (xr19..xr21).
 */
function_x264 pixel_sad_x3_16x16_lasx
    // Precompute stride multiples
    slli.d          t1, a4, 1                  // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1                 // t2 = 3 * i_ref_stride
    slli.d          t3, a4, 2                  // t3 = 4 * i_ref_stride

    // Rows 0-3
    xvld            xr2, a0, 0                 // src rows 0-1
    xvld            xr3, a0, 32                // src rows 2-3
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    // Merge row pairs into 256-bit registers (rows 0-1, rows 2-3)
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    // Absolute differences; seed the six accumulators
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr16, xr7, xr7
    xvhaddw.hu.bu   xr17, xr8, xr8
    xvhaddw.hu.bu   xr18, xr9, xr9
    xvhaddw.hu.bu   xr19, xr10, xr10
    xvhaddw.hu.bu   xr20, xr11, xr11
    xvhaddw.hu.bu   xr21, xr12, xr12

    // Rows 4-7
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    xvld            xr2, a0, 64                // src rows 4-5
    xvld            xr3, a0, 96                // src rows 6-7
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12

    // Rows 8-11
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    xvld            xr2, a0, 128               // src rows 8-9
    xvld            xr3, a0, 160               // src rows 10-11
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12

    // Rows 12-15
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    xvld            xr2, a0, 192               // src rows 12-13
    xvld            xr3, a0, 224               // src rows 14-15
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12
    // Merge even/odd row-pair accumulators per reference
    xvadd.h         xr11, xr16, xr19
    xvadd.h         xr12, xr17, xr20
    xvadd.h         xr13, xr18, xr21

    // Horizontal widen hu -> wu -> du -> qu: one sum per 128-bit lane
    xvhaddw.wu.hu   xr11, xr11, xr11
    xvhaddw.wu.hu   xr12, xr12, xr12
    xvhaddw.wu.hu   xr13, xr13, xr13
    xvhaddw.du.wu   xr11, xr11, xr11
    xvhaddw.du.wu   xr12, xr12, xr12
    xvhaddw.du.wu   xr13, xr13, xr13
    xvhaddw.qu.du   xr11, xr11, xr11
    xvhaddw.qu.du   xr12, xr12, xr12
    xvhaddw.qu.du   xr13, xr13, xr13
    // Add the upper-lane sum (word element 4) into the lower-lane sum
    xvpickve.w      xr17, xr11, 4
    xvpickve.w      xr18, xr12, 4
    xvpickve.w      xr19, xr13, 4
    xvadd.w         xr11, xr11, xr17
    xvadd.w         xr12, xr12, xr18
    xvadd.w         xr13, xr13, xr19

    // Store data to p_sad_array
    vstelm.w        vr11, a5, 0, 0             // sad[0]
    vstelm.w        vr12, a5, 4, 0             // sad[1]
    vstelm.w        vr13, a5, 8, 0             // sad[2]
endfunc_x264
|
|
|
|
/* void x264_pixel_sad_x3_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                  uint8_t *p_ref1, uint8_t *p_ref2,
 *                                  intptr_t i_ref_stride,
 *                                  int32_t p_sad_array[3])
 *
 * Three 16x8 SADs — same scheme as pixel_sad_x3_16x16_lasx but only two
 * 4-row rounds. a0 = p_src (rows 16 bytes apart), a1..a3 = refs,
 * a4 = i_ref_stride, a5 = p_sad_array. Six uint16 accumulators in
 * xr16..xr21 (per-reference, split by even/odd row pair).
 */
function_x264 pixel_sad_x3_16x8_lasx
    // Precompute stride multiples
    slli.d          t1, a4, 1                  // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1                 // t2 = 3 * i_ref_stride
    slli.d          t3, a4, 2                  // t3 = 4 * i_ref_stride

    // Rows 0-3
    xvld            xr2, a0, 0                 // src rows 0-1
    xvld            xr3, a0, 32                // src rows 2-3
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02

    // Absolute differences; seed the six accumulators
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr16, xr7, xr7
    xvhaddw.hu.bu   xr17, xr8, xr8
    xvhaddw.hu.bu   xr18, xr9, xr9
    xvhaddw.hu.bu   xr19, xr10, xr10
    xvhaddw.hu.bu   xr20, xr11, xr11
    xvhaddw.hu.bu   xr21, xr12, xr12

    // Rows 4-7
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    xvld            xr2, a0, 64                // src rows 4-5
    xvld            xr3, a0, 96                // src rows 6-7
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02

    // Absolute differences; accumulate
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12
    // Merge even/odd row-pair accumulators per reference
    xvadd.h         xr11, xr16, xr19
    xvadd.h         xr12, xr17, xr20
    xvadd.h         xr13, xr18, xr21

    // Horizontal widen hu -> wu -> du -> qu: one sum per 128-bit lane
    xvhaddw.wu.hu   xr11, xr11, xr11
    xvhaddw.wu.hu   xr12, xr12, xr12
    xvhaddw.wu.hu   xr13, xr13, xr13
    xvhaddw.du.wu   xr11, xr11, xr11
    xvhaddw.du.wu   xr12, xr12, xr12
    xvhaddw.du.wu   xr13, xr13, xr13
    xvhaddw.qu.du   xr11, xr11, xr11
    xvhaddw.qu.du   xr12, xr12, xr12
    xvhaddw.qu.du   xr13, xr13, xr13
    // Add the upper-lane sum (word element 4) into the lower-lane sum
    xvpickve.w      xr17, xr11, 4
    xvpickve.w      xr18, xr12, 4
    xvpickve.w      xr19, xr13, 4
    xvadd.w         xr11, xr11, xr17
    xvadd.w         xr12, xr12, xr18
    xvadd.w         xr13, xr13, xr19

    // Store data to p_sad_array
    vstelm.w        vr11, a5, 0, 0             // sad[0]
    vstelm.w        vr12, a5, 4, 0             // sad[1]
    vstelm.w        vr13, a5, 8, 0             // sad[2]
endfunc_x264
|
|
|
|
/* void x264_pixel_sad_x3_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t i_ref_stride,
 *                                int32_t p_sad_array[3])
 *
 * Three 4x4 SADs, 128-bit LSX only. a0 = p_src (4-byte rows at offsets
 * 0,16,32,48), a1..a3 = refs strided by a4, a5 = p_sad_array. A 4x4
 * block (16 bytes) fits one vector register.
 */
function_x264 pixel_sad_x3_4x4_lsx
    slli.d          t1, a4, 1                  // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1                 // t2 = 3 * i_ref_stride

    // Load 4 rows of src and of each reference (one word per row)
    fld.s           f3, a0, 0
    fld.s           f7, a0, 16
    fld.s           f11, a0, 32
    fld.s           f15, a0, 48
    FLDS_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18

    // Interleave words (rows 0+1, rows 2+3), then doubles, so each block
    // ends up contiguous in one register (vr3 = src, vr4..vr6 = refs)
    vilvl.w         vr3, vr7, vr3
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr11, vr15, vr11
    vilvl.w         vr12, vr16, vr12
    vilvl.w         vr13, vr17, vr13
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr11, vr3
    vilvl.d         vr4, vr12, vr4
    vilvl.d         vr5, vr13, vr5
    vilvl.d         vr6, vr14, vr6

    // Absolute differences, one register per reference
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6
    // Horizontal widen bu -> hu -> wu -> du -> qu: scalar sum in lane 0
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.wu.hu    vr7, vr7, vr7
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.du.wu    vr7, vr7, vr7
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.qu.du    vr7, vr7, vr7
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9

    // Store data to p_sad_array
    vstelm.w        vr7, a5, 0, 0              // sad[0]
    vstelm.w        vr8, a5, 4, 0              // sad[1]
    vstelm.w        vr9, a5, 8, 0              // sad[2]
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_8x4_lasx(uint8_t *p_src, intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Single 8x4 SAD with independent source/reference strides.
 * a0 = p_src (stride a1), a2 = p_ref (stride a3); result returned in a0.
 */
function_x264 pixel_sad_8x4_lasx
    slli.d          t1, a1, 1                  // t1 = 2 * i_src_stride
    slli.d          t2, a3, 1                  // t2 = 2 * i_ref_stride
    add.d           t3, a1, t1                 // t3 = 3 * i_src_stride
    add.d           t4, a3, t2                 // t4 = 3 * i_ref_stride

    // Load 4 rows (8 bytes each) of src and ref
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    // Pack rows pairwise, then into one 256-bit register per block
    vilvl.d         vr3, vr5, vr3              // src rows 0-1
    vilvl.d         vr4, vr6, vr4              // ref rows 0-1
    vilvl.d         vr7, vr9, vr7              // src rows 2-3
    vilvl.d         vr8, vr10, vr8             // ref rows 2-3
    xvpermi.q       xr3, xr7, 0x02             // all 4 src rows
    xvpermi.q       xr4, xr8, 0x02             // all 4 ref rows
    // Absolute differences, then widen bu -> hu -> wu -> du -> qu
    // to one 128-bit partial sum per lane
    xvabsd.bu       xr5, xr3, xr4
    xvhaddw.hu.bu   xr6, xr5, xr5
    xvhaddw.wu.hu   xr6, xr6, xr6
    xvhaddw.du.wu   xr6, xr6, xr6
    xvhaddw.qu.du   xr6, xr6, xr6

    // Combine the low-lane (element 0) and high-lane (element 4) sums
    xvpickve2gr.wu  t2, xr6, 0
    xvpickve2gr.wu  t3, xr6, 4
    add.d           a0, t2, t3                 // return total SAD
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_4x4_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Single 4x4 SAD with independent strides. a0 = p_src (stride a1),
 * a2 = p_ref (stride a3); result returned in a0. A 4x4 block fits one
 * 128-bit register, so one vabsd + reduction chain suffices.
 */
function_x264 pixel_sad_4x4_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * i_src_stride
    slli.d          t2, a3, 1                  // t2 = 2 * i_ref_stride
    add.d           t3, a1, t1                 // t3 = 3 * i_src_stride
    add.d           t4, a3, t2                 // t4 = 3 * i_ref_stride

    // Load 4 rows (4 bytes each) of src and ref
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    // Pack each block contiguously: words pairwise, then doubles
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3              // all 16 src bytes
    vilvl.d         vr4, vr8, vr4              // all 16 ref bytes

    // Absolute differences, then widen bu -> hu -> wu -> du -> qu
    vabsd.bu        vr5, vr3, vr4
    vhaddw.hu.bu    vr6, vr5, vr5
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0                 // return total SAD
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_4x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Single 4x8 SAD with independent strides: two 4x4 halves, each packed
 * into one 128-bit register, with uint16 partial sums combined at the
 * end. a0 = p_src (stride a1), a2 = p_ref (stride a3); result in a0.
 */
function_x264 pixel_sad_4x8_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * i_src_stride
    slli.d          t2, a3, 1                  // t2 = 2 * i_ref_stride
    add.d           t3, a1, t1                 // t3 = 3 * i_src_stride
    add.d           t4, a3, t2                 // t4 = 3 * i_ref_stride

    // Rows 0-3 of src and ref
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr11, vr3, vr4
    vhaddw.hu.bu    vr11, vr11, vr11           // uint16 partial sums, rows 0-3

    // Advance both pointers by 4 rows (alsl: base + stride * 4)
    alsl.d          a0, a1, a0, 2
    alsl.d          a2, a3, a2, 2
    // Rows 4-7
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr5, vr3, vr4
    vhaddw.hu.bu    vr5, vr5, vr5              // uint16 partial sums, rows 4-7

    // Combine halves, then widen hu -> wu -> du -> qu to a scalar
    vadd.h          vr6, vr11, vr5
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0                 // return total SAD
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_4x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Single 4x16 SAD with independent strides: four 4x4 groups, the first
 * unrolled explicitly and the remaining three via .rept, accumulating
 * uint16 partial sums in vr11. a0 = p_src (stride a1), a2 = p_ref
 * (stride a3); result returned in a0.
 */
function_x264 pixel_sad_4x16_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * i_src_stride
    slli.d          t2, a3, 1                  // t2 = 2 * i_ref_stride
    add.d           t3, a1, t1                 // t3 = 3 * i_src_stride
    add.d           t4, a3, t2                 // t4 = 3 * i_ref_stride

    // Rows 0-3: pack each 4x4 group into one 128-bit register
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr11, vr3, vr4
    vhaddw.hu.bu    vr11, vr11, vr11           // seed the uint16 accumulator

    // Three more 4-row groups (rows 4-7, 8-11, 12-15)
.rept 3
    alsl.d          a0, a1, a0, 2              // advance src by 4 rows
    alsl.d          a2, a3, a2, 2              // advance ref by 4 rows
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr12, vr3, vr4
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr11, vr11, vr12           // accumulate group sum
.endr

    // Widen hu -> wu -> du -> qu to a scalar and return it
    vhaddw.wu.hu    vr11, vr11, vr11
    vhaddw.du.wu    vr11, vr11, vr11
    vhaddw.qu.du    vr11, vr11, vr11
    vpickve2gr.wu   a0, vr11, 0
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_8x4_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Sum of absolute differences of an 8x4 pixel block (LSX).
 * In:    a0 = p_src, a1 = i_src_stride, a2 = p_ref, a3 = i_ref_stride
 * Out:   a0 = SAD
 * Clobbers: t1-t4, vr3-vr12 (and their f-register views)
 * NOTE(review): FLDD_LOADX_4 is presumably four fld.d loads from
 * base+{0, s, 2s, 3s} (macro in loongson_util.S — confirm there).
 */
function_x264 pixel_sad_8x4_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * src stride
    slli.d          t2, a3, 1                  // t2 = 2 * ref stride
    add.d           t3, a1, t1                 // t3 = 3 * src stride
    add.d           t4, a3, t2                 // t4 = 3 * ref stride

    // Load four 8-byte rows per side; pair rows into full vectors
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3              // src rows 0+1
    vilvl.d         vr7, vr9, vr7              // src rows 2+3
    vilvl.d         vr4, vr6, vr4              // ref rows 0+1
    vilvl.d         vr8, vr10, vr8             // ref rows 2+3
    // Per-byte |diff|, widen to 16-bit, sum the two row pairs
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr6, vr11, vr12
    // Reduce h -> w -> d -> q and extract the scalar SAD
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0                 // return SAD in a0
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_8x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Sum of absolute differences of an 8x8 pixel block (LSX):
 * two unrolled 8x4 passes, partial sums accumulated in vr13.
 * In:    a0 = p_src, a1 = i_src_stride, a2 = p_ref, a3 = i_ref_stride
 * Out:   a0 = SAD
 * Clobbers: t1-t4, vr3-vr13 (and their f-register views)
 */
function_x264 pixel_sad_8x8_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * src stride
    slli.d          t2, a3, 1                  // t2 = 2 * ref stride
    add.d           t3, a1, t1                 // t3 = 3 * src stride
    add.d           t4, a3, t2                 // t4 = 3 * ref stride

    // Rows 0-3
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3              // src rows 0+1
    vilvl.d         vr7, vr9, vr7              // src rows 2+3
    vilvl.d         vr4, vr6, vr4              // ref rows 0+1
    vilvl.d         vr8, vr10, vr8             // ref rows 2+3
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr13, vr11, vr12           // vr13 = partial sums, rows 0-3

    // Rows 4-7
    alsl.d          a0, a1, a0, 2              // a0 += 4 * src stride
    alsl.d          a2, a3, a2, 2              // a2 += 4 * ref stride
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3
    vilvl.d         vr7, vr9, vr7
    vilvl.d         vr4, vr6, vr4
    vilvl.d         vr8, vr10, vr8
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr6, vr11, vr12

    // Combine both halves, reduce h -> w -> d -> q, extract lane 0
    vadd.h          vr6, vr6, vr13
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0                 // return SAD in a0
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_8x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Sum of absolute differences of an 8x16 pixel block (LSX):
 * one unrolled 8x4 pass plus three more via .rept, accumulating in vr13.
 * In:    a0 = p_src, a1 = i_src_stride, a2 = p_ref, a3 = i_ref_stride
 * Out:   a0 = SAD
 * Clobbers: t1-t4, vr3-vr14 (and their f-register views)
 */
function_x264 pixel_sad_8x16_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * src stride
    slli.d          t2, a3, 1                  // t2 = 2 * ref stride
    add.d           t3, a1, t1                 // t3 = 3 * src stride
    add.d           t4, a3, t2                 // t4 = 3 * ref stride

    // Rows 0-3
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3              // src rows 0+1
    vilvl.d         vr7, vr9, vr7              // src rows 2+3
    vilvl.d         vr4, vr6, vr4              // ref rows 0+1
    vilvl.d         vr8, vr10, vr8             // ref rows 2+3
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr13, vr11, vr12           // running 16-bit partial sums

    // Rows 4-7, 8-11, 12-15
.rept 3
    alsl.d          a0, a1, a0, 2              // advance src by 4 rows
    alsl.d          a2, a3, a2, 2              // advance ref by 4 rows
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3
    vilvl.d         vr7, vr9, vr7
    vilvl.d         vr4, vr6, vr4
    vilvl.d         vr8, vr10, vr8
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr14, vr11, vr12
    vadd.h          vr13, vr13, vr14
.endr
    // Reduce h -> w -> d -> q; scalar SAD in lane 0
    vhaddw.wu.hu    vr13, vr13, vr13
    vhaddw.du.wu    vr13, vr13, vr13
    vhaddw.qu.du    vr13, vr13, vr13
    vpickve2gr.wu   a0, vr13, 0                // return SAD in a0
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_16x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Sum of absolute differences of a 16x8 pixel block (LSX):
 * full 16-byte rows, two unrolled 4-row passes.
 * In:    a0 = p_src, a1 = i_src_stride, a2 = p_ref, a3 = i_ref_stride
 * Out:   a0 = SAD
 * Clobbers: t1-t4, vr0-vr14
 * NOTE(review): LSX_LOADX_4 is presumably four vld/vldx loads from
 * base+{0, s, 2s, 3s} (macro in loongson_asm.S/loongson_util.S — confirm).
 */
function_x264 pixel_sad_16x8_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * src stride
    slli.d          t2, a3, 1                  // t2 = 2 * ref stride
    add.d           t3, a1, t1                 // t3 = 3 * src stride
    add.d           t4, a3, t2                 // t4 = 3 * ref stride

    // Rows 0-3: one full 16-byte vector per row
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    // Widen byte |diff|s to 16-bit lane sums and fold the four rows
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr14, vr8, vr9             // vr14 = partial sums, rows 0-3

    // Rows 4-7
    alsl.d          a0, a1, a0, 2              // a0 += 4 * src stride
    alsl.d          a2, a3, a2, 2              // a2 += 4 * ref stride
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr12, vr8, vr9

    // Combine halves; reduce h -> w -> d -> q; extract lane 0
    vadd.h          vr13, vr12, vr14
    vhaddw.wu.hu    vr13, vr13, vr13
    vhaddw.du.wu    vr13, vr13, vr13
    vhaddw.qu.du    vr13, vr13, vr13
    vpickve2gr.wu   a0, vr13, 0                // return SAD in a0
endfunc_x264
|
|
|
|
/* int32_t x264_pixel_sad_16x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                  uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Sum of absolute differences of a 16x16 pixel block (LSX):
 * one unrolled 4-row pass plus three more via .rept, accumulating in vr13.
 * Each 16-bit accumulator lane sums at most 16 rows * 2 bytes * 255 = 8160,
 * so the h-lane accumulation cannot overflow before the final widening.
 * In:    a0 = p_src, a1 = i_src_stride, a2 = p_ref, a3 = i_ref_stride
 * Out:   a0 = SAD
 * Clobbers: t1-t4, vr0-vr13
 */
function_x264 pixel_sad_16x16_lsx
    slli.d          t1, a1, 1                  // t1 = 2 * src stride
    slli.d          t2, a3, 1                  // t2 = 2 * ref stride
    add.d           t3, a1, t1                 // t3 = 3 * src stride
    add.d           t4, a3, t2                 // t4 = 3 * ref stride

    // Rows 0-3
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr13, vr8, vr9             // running 16-bit partial sums

    // Rows 4-7, 8-11, 12-15
.rept 3
    alsl.d          a0, a1, a0, 2              // advance src by 4 rows
    alsl.d          a2, a3, a2, 2              // advance ref by 4 rows
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr12, vr8, vr9
    vadd.h          vr13, vr12, vr13
.endr

    // Reduce h -> w -> d -> q; scalar SAD in lane 0
    vhaddw.wu.hu    vr13, vr13, vr13
    vhaddw.du.wu    vr13, vr13, vr13
    vhaddw.qu.du    vr13, vr13, vr13
    vpickve2gr.wu   a0, vr13, 0                // return SAD in a0
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x3_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t i_ref_stride,
 *                                int32_t p_sad_array[3])
 *
 * Three SADs of the same 4x8 source block against three references.
 * The source is read at fixed offsets 0,16,32,... (stride-16 fenc
 * buffer — x264's FENC_STRIDE); the refs use the runtime stride a4.
 * Out: p_sad_array[0..2] = SAD vs p_ref0/1/2.
 * Clobbers: t1-t2, vr0-vr18 (and their f-register views)
 */
function_x264 pixel_sad_x3_4x8_lsx
    slli.d          t1, a4, 1                  // t1 = 2 * ref stride
    add.d           t2, a4, t1                 // t2 = 3 * ref stride

    // Rows 0-3: load src (fixed stride 16) and the three refs
    fld.s           f3, a0, 0
    fld.s           f7, a0, 16
    fld.s           f11, a0, 32
    fld.s           f15, a0, 48
    FLDS_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    // Pack each stream's four 4-byte rows into one 16-byte vector:
    // vr3 = src, vr4/vr5/vr6 = ref0/ref1/ref2
    vilvl.w         vr3, vr7, vr3
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr11, vr15, vr11
    vilvl.w         vr12, vr16, vr12
    vilvl.w         vr13, vr17, vr13
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr11, vr3
    vilvl.d         vr4, vr12, vr4
    vilvl.d         vr5, vr13, vr5
    vilvl.d         vr6, vr14, vr6
    // Per-byte |diff| for rows 0-3, one result per reference
    vabsd.bu        vr0, vr3, vr4
    vabsd.bu        vr1, vr3, vr5
    vabsd.bu        vr2, vr3, vr6

    // Rows 4-7: advance refs by 4 rows; src continues at offset 64
    alsl.d          a1, a4, a1, 2
    alsl.d          a2, a4, a2, 2
    alsl.d          a3, a4, a3, 2
    fld.s           f3, a0, 64
    fld.s           f7, a0, 80
    fld.s           f11, a0, 96
    fld.s           f15, a0, 112
    FLDS_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.w         vr3, vr7, vr3
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr11, vr15, vr11
    vilvl.w         vr12, vr16, vr12
    vilvl.w         vr13, vr17, vr13
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr11, vr3
    vilvl.d         vr4, vr12, vr4
    vilvl.d         vr5, vr13, vr5
    vilvl.d         vr6, vr14, vr6
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6

    // Widen all six |diff| vectors to 16-bit lane sums, combine halves
    vhaddw.hu.bu    vr0, vr0, vr0
    vhaddw.hu.bu    vr1, vr1, vr1
    vhaddw.hu.bu    vr2, vr2, vr2
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vadd.h          vr7, vr7, vr0
    vadd.h          vr8, vr8, vr1
    vadd.h          vr9, vr9, vr2
    // Reduce each accumulator h -> w -> d -> q (scalar in lane 0)
    vhaddw.wu.hu    vr7, vr7, vr7
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.du.wu    vr7, vr7, vr7
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.qu.du    vr7, vr7, vr7
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9

    // Store data to p_sad_array
    vstelm.w        vr7, a5, 0, 0              // p_sad_array[0]
    vstelm.w        vr8, a5, 4, 0              // p_sad_array[1]
    vstelm.w        vr9, a5, 8, 0              // p_sad_array[2]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x3_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t i_ref_stride,
 *                                int32_t p_sad_array[3])
 *
 * Three SADs of the same 8x4 source block against three references.
 * Source rows sit at fixed offsets 0/16/32/48 (stride-16 fenc buffer);
 * refs use the runtime stride a4.
 * Out: p_sad_array[0..2] = SAD vs p_ref0/1/2.
 * Clobbers: t1-t2, vr0-vr18 (and their f-register views)
 */
function_x264 pixel_sad_x3_8x4_lsx
    slli.d          t1, a4, 1                  // t1 = 2 * ref stride
    add.d           t2, a4, t1                 // t2 = 3 * ref stride

    // Load data from p_src, p_ref0, p_ref1 and p_ref2 (8-byte rows)
    fld.d           f3, a0, 0
    fld.d           f7, a0, 16
    fld.d           f11, a0, 32
    fld.d           f15, a0, 48
    FLDD_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    // Pair rows 0+1 (vr3..vr6) and rows 2+3 (vr11..vr14) per stream
    vilvl.d         vr3, vr7, vr3              // src rows 0+1
    vilvl.d         vr4, vr8, vr4              // ref0 rows 0+1
    vilvl.d         vr5, vr9, vr5              // ref1 rows 0+1
    vilvl.d         vr6, vr10, vr6             // ref2 rows 0+1
    vilvl.d         vr11, vr15, vr11           // src rows 2+3
    vilvl.d         vr12, vr16, vr12           // ref0 rows 2+3
    vilvl.d         vr13, vr17, vr13           // ref1 rows 2+3
    vilvl.d         vr14, vr18, vr14           // ref2 rows 2+3
    // Per-byte |diff| per reference, both row pairs
    vabsd.bu        vr0, vr3, vr4
    vabsd.bu        vr1, vr3, vr5
    vabsd.bu        vr2, vr3, vr6
    vabsd.bu        vr3, vr11, vr12
    vabsd.bu        vr4, vr11, vr13
    vabsd.bu        vr5, vr11, vr14
    // Widen to 16-bit lane sums and fold the two row pairs
    vhaddw.hu.bu    vr0, vr0, vr0
    vhaddw.hu.bu    vr1, vr1, vr1
    vhaddw.hu.bu    vr2, vr2, vr2
    vhaddw.hu.bu    vr3, vr3, vr3
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vadd.h          vr7, vr0, vr3
    vadd.h          vr8, vr1, vr4
    vadd.h          vr9, vr2, vr5
    // Reduce each accumulator h -> w -> d -> q
    vhaddw.wu.hu    vr7, vr7, vr7
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.du.wu    vr7, vr7, vr7
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.qu.du    vr7, vr7, vr7
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9

    // Store data to p_sad_array
    vstelm.w        vr7, a5, 0, 0              // p_sad_array[0]
    vstelm.w        vr8, a5, 4, 0              // p_sad_array[1]
    vstelm.w        vr9, a5, 8, 0              // p_sad_array[2]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x3_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t i_ref_stride,
 *                                int32_t p_sad_array[3])
 *
 * Three SADs of the same 8x8 source block against three references:
 * two unrolled 8x4 passes (src offsets 0..48 then 64..112, i.e. a
 * stride-16 fenc buffer), 16-bit accumulators vr0/vr1/vr2.
 * Out: p_sad_array[0..2] = SAD vs p_ref0/1/2.
 * Clobbers: t1-t2, vr0-vr18 (and their f-register views)
 */
function_x264 pixel_sad_x3_8x8_lsx
    slli.d          t1, a4, 1                  // t1 = 2 * ref stride
    add.d           t2, a4, t1                 // t2 = 3 * ref stride

    // Rows 0-3: load src and the three refs
    fld.d           f3, a0, 0
    fld.d           f7, a0, 16
    fld.d           f11, a0, 32
    fld.d           f15, a0, 48
    FLDD_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    // Pair rows 0+1 and rows 2+3 of each stream into full vectors
    vilvl.d         vr3, vr7, vr3              // src rows 0+1
    vilvl.d         vr4, vr8, vr4              // ref0 rows 0+1
    vilvl.d         vr5, vr9, vr5              // ref1 rows 0+1
    vilvl.d         vr6, vr10, vr6             // ref2 rows 0+1
    vilvl.d         vr11, vr15, vr11           // src rows 2+3
    vilvl.d         vr12, vr16, vr12           // ref0 rows 2+3
    vilvl.d         vr13, vr17, vr13           // ref1 rows 2+3
    vilvl.d         vr14, vr18, vr14           // ref2 rows 2+3
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6
    vabsd.bu        vr10, vr11, vr12
    vabsd.bu        vr15, vr11, vr13
    vabsd.bu        vr16, vr11, vr14
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr15, vr15, vr15
    vhaddw.hu.bu    vr16, vr16, vr16
    vadd.h          vr0, vr7, vr10             // accumulator, ref0
    vadd.h          vr1, vr8, vr15             // accumulator, ref1
    vadd.h          vr2, vr9, vr16             // accumulator, ref2

    // Rows 4-7: advance refs; src continues at offset 64
    alsl.d          a1, a4, a1, 2
    alsl.d          a2, a4, a2, 2
    alsl.d          a3, a4, a3, 2
    fld.d           f3, a0, 64
    fld.d           f7, a0, 80
    fld.d           f11, a0, 96
    fld.d           f15, a0, 112
    FLDD_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vilvl.d         vr5, vr9, vr5
    vilvl.d         vr6, vr10, vr6
    vilvl.d         vr11, vr15, vr11
    vilvl.d         vr12, vr16, vr12
    vilvl.d         vr13, vr17, vr13
    vilvl.d         vr14, vr18, vr14
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6
    vabsd.bu        vr10, vr11, vr12
    vabsd.bu        vr15, vr11, vr13
    vabsd.bu        vr16, vr11, vr14
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr15, vr15, vr15
    vhaddw.hu.bu    vr16, vr16, vr16
    vadd.h          vr7, vr7, vr10
    vadd.h          vr8, vr8, vr15
    vadd.h          vr9, vr9, vr16

    // Merge the two passes and reduce each accumulator h -> w -> d -> q
    vadd.h          vr7, vr7, vr0
    vadd.h          vr8, vr8, vr1
    vadd.h          vr9, vr9, vr2
    vhaddw.wu.hu    vr7, vr7, vr7
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.du.wu    vr7, vr7, vr7
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.qu.du    vr7, vr7, vr7
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9

    // Store data to p_sad_array
    vstelm.w        vr7, a5, 0, 0              // p_sad_array[0]
    vstelm.w        vr8, a5, 4, 0              // p_sad_array[1]
    vstelm.w        vr9, a5, 8, 0              // p_sad_array[2]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x3_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[3])
 *
 * Three SADs of the same 8x16 source block against three references:
 * one unrolled 8x4 pass plus three more via .rept. The source pointer
 * itself is bumped by 64 each iteration (4 rows of a stride-16 fenc
 * buffer); refs advance by 4 * i_ref_stride.
 * Out: p_sad_array[0..2] = SAD vs p_ref0/1/2.
 * Clobbers: a0 (advanced), t1-t2, vr0-vr18 (and their f-register views)
 */
function_x264 pixel_sad_x3_8x16_lsx
    slli.d          t1, a4, 1                  // t1 = 2 * ref stride
    add.d           t2, a4, t1                 // t2 = 3 * ref stride

    // Rows 0-3: load src and the three refs
    fld.d           f3, a0, 0
    fld.d           f7, a0, 16
    fld.d           f11, a0, 32
    fld.d           f15, a0, 48
    FLDD_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.d         vr3, vr7, vr3              // src rows 0+1
    vilvl.d         vr4, vr8, vr4              // ref0 rows 0+1
    vilvl.d         vr5, vr9, vr5              // ref1 rows 0+1
    vilvl.d         vr6, vr10, vr6             // ref2 rows 0+1
    vilvl.d         vr11, vr15, vr11           // src rows 2+3
    vilvl.d         vr12, vr16, vr12           // ref0 rows 2+3
    vilvl.d         vr13, vr17, vr13           // ref1 rows 2+3
    vilvl.d         vr14, vr18, vr14           // ref2 rows 2+3
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6
    vabsd.bu        vr10, vr11, vr12
    vabsd.bu        vr15, vr11, vr13
    vabsd.bu        vr16, vr11, vr14
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr15, vr15, vr15
    vhaddw.hu.bu    vr16, vr16, vr16
    vadd.h          vr0, vr7, vr10             // accumulator, ref0
    vadd.h          vr1, vr8, vr15             // accumulator, ref1
    vadd.h          vr2, vr9, vr16             // accumulator, ref2

    // Three more 4-row groups (rows 4-7, 8-11, 12-15)
.rept 3
    alsl.d          a1, a4, a1, 2              // refs += 4 * stride
    alsl.d          a2, a4, a2, 2
    alsl.d          a3, a4, a3, 2
    addi.d          a0, a0, 64                 // src += 4 rows (stride 16)
    fld.d           f3, a0, 0
    fld.d           f7, a0, 16
    fld.d           f11, a0, 32
    fld.d           f15, a0, 48
    FLDD_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vilvl.d         vr5, vr9, vr5
    vilvl.d         vr6, vr10, vr6
    vilvl.d         vr11, vr15, vr11
    vilvl.d         vr12, vr16, vr12
    vilvl.d         vr13, vr17, vr13
    vilvl.d         vr14, vr18, vr14
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6
    vabsd.bu        vr10, vr11, vr12
    vabsd.bu        vr15, vr11, vr13
    vabsd.bu        vr16, vr11, vr14
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr15, vr15, vr15
    vhaddw.hu.bu    vr16, vr16, vr16
    vadd.h          vr7, vr7, vr10
    vadd.h          vr8, vr8, vr15
    vadd.h          vr9, vr9, vr16
    vadd.h          vr0, vr7, vr0
    vadd.h          vr1, vr8, vr1
    vadd.h          vr2, vr9, vr2
.endr

    // Reduce each accumulator h -> w -> d -> q (scalar in lane 0)
    vhaddw.wu.hu    vr0, vr0, vr0
    vhaddw.wu.hu    vr1, vr1, vr1
    vhaddw.wu.hu    vr2, vr2, vr2
    vhaddw.du.wu    vr0, vr0, vr0
    vhaddw.du.wu    vr1, vr1, vr1
    vhaddw.du.wu    vr2, vr2, vr2
    vhaddw.qu.du    vr0, vr0, vr0
    vhaddw.qu.du    vr1, vr1, vr1
    vhaddw.qu.du    vr2, vr2, vr2

    // Store data to p_sad_array
    vstelm.w        vr0, a5, 0, 0              // p_sad_array[0]
    vstelm.w        vr1, a5, 4, 0              // p_sad_array[1]
    vstelm.w        vr2, a5, 8, 0              // p_sad_array[2]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x3_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[3])
 *
 * Three SADs of the same 16x8 source block against three references:
 * full 16-byte rows, two unrolled 4-row passes. Source rows come from
 * fixed offsets 0,16,...,112 (stride-16 fenc buffer); refs use a4.
 * Out: p_sad_array[0..2] = SAD vs p_ref0/1/2.
 * Clobbers: t1-t2, vr0-vr18
 */
function_x264 pixel_sad_x3_16x8_lsx
    slli.d          t1, a4, 1                  // t1 = 2 * ref stride
    add.d           t2, a4, t1                 // t2 = 3 * ref stride

    // Rows 0-3: vr0-vr3 = src; per row n, vr(3n+4..6) = ref0/1/2
    vld             vr0, a0, 0
    vld             vr1, a0, 16
    vld             vr2, a0, 32
    vld             vr3, a0, 48
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    vabsd.bu        vr4, vr0, vr4
    vabsd.bu        vr5, vr0, vr5
    vabsd.bu        vr6, vr0, vr6
    vabsd.bu        vr7, vr1, vr7
    vabsd.bu        vr8, vr1, vr8
    vabsd.bu        vr9, vr1, vr9
    vabsd.bu        vr10, vr2, vr10
    vabsd.bu        vr11, vr2, vr11
    vabsd.bu        vr12, vr2, vr12
    vabsd.bu        vr13, vr3, vr13
    vabsd.bu        vr14, vr3, vr14
    vabsd.bu        vr15, vr3, vr15
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vhaddw.hu.bu    vr6, vr6, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vhaddw.hu.bu    vr13, vr13, vr13
    vhaddw.hu.bu    vr14, vr14, vr14
    vhaddw.hu.bu    vr15, vr15, vr15
    // Fold the four rows per reference: vr16/vr17/vr18 = ref0/1/2 sums
    vadd.h          vr0, vr7, vr4
    vadd.h          vr1, vr13, vr10
    vadd.h          vr16, vr1, vr0
    vadd.h          vr0, vr8, vr5
    vadd.h          vr1, vr14, vr11
    vadd.h          vr17, vr1, vr0
    vadd.h          vr0, vr9, vr6
    vadd.h          vr1, vr15, vr12
    vadd.h          vr18, vr1, vr0

    // vr16, vr17, vr18
    // Rows 4-7: advance refs; src continues at offset 64
    alsl.d          a1, a4, a1, 2
    alsl.d          a2, a4, a2, 2
    alsl.d          a3, a4, a3, 2
    vld             vr0, a0, 64
    vld             vr1, a0, 80
    vld             vr2, a0, 96
    vld             vr3, a0, 112
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    vabsd.bu        vr4, vr0, vr4
    vabsd.bu        vr5, vr0, vr5
    vabsd.bu        vr6, vr0, vr6
    vabsd.bu        vr7, vr1, vr7
    vabsd.bu        vr8, vr1, vr8
    vabsd.bu        vr9, vr1, vr9
    vabsd.bu        vr10, vr2, vr10
    vabsd.bu        vr11, vr2, vr11
    vabsd.bu        vr12, vr2, vr12
    vabsd.bu        vr13, vr3, vr13
    vabsd.bu        vr14, vr3, vr14
    vabsd.bu        vr15, vr3, vr15
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vhaddw.hu.bu    vr6, vr6, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vhaddw.hu.bu    vr13, vr13, vr13
    vhaddw.hu.bu    vr14, vr14, vr14
    vhaddw.hu.bu    vr15, vr15, vr15
    vadd.h          vr0, vr7, vr4
    vadd.h          vr1, vr13, vr10
    vadd.h          vr2, vr1, vr0              // ref0 sums, rows 4-7
    vadd.h          vr0, vr8, vr5
    vadd.h          vr1, vr14, vr11
    vadd.h          vr3, vr1, vr0              // ref1 sums, rows 4-7
    vadd.h          vr0, vr9, vr6
    vadd.h          vr1, vr15, vr12
    vadd.h          vr4, vr1, vr0              // ref2 sums, rows 4-7

    // Merge passes and reduce each accumulator h -> w -> d -> q
    vadd.h          vr0, vr16, vr2
    vadd.h          vr1, vr17, vr3
    vadd.h          vr2, vr18, vr4
    vhaddw.wu.hu    vr0, vr0, vr0
    vhaddw.wu.hu    vr1, vr1, vr1
    vhaddw.wu.hu    vr2, vr2, vr2
    vhaddw.du.wu    vr0, vr0, vr0
    vhaddw.du.wu    vr1, vr1, vr1
    vhaddw.du.wu    vr2, vr2, vr2
    vhaddw.qu.du    vr0, vr0, vr0
    vhaddw.qu.du    vr1, vr1, vr1
    vhaddw.qu.du    vr2, vr2, vr2

    // Store data to p_sad_array
    vstelm.w        vr0, a5, 0, 0              // p_sad_array[0]
    vstelm.w        vr1, a5, 4, 0              // p_sad_array[1]
    vstelm.w        vr2, a5, 8, 0              // p_sad_array[2]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x3_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                  uint8_t *p_ref1, uint8_t *p_ref2,
 *                                  intptr_t i_ref_stride,
 *                                  int32_t p_sad_array[3])
 *
 * Three SADs of the same 16x16 source block against three references:
 * one unrolled 4-row pass plus three via .rept; accumulators
 * vr16/vr17/vr18 (16-bit lanes; max 16 rows * 2 * 255 = 8160, no
 * overflow). Source advances 64 bytes per group (stride-16 fenc buffer).
 * Out: p_sad_array[0..2] = SAD vs p_ref0/1/2.
 * Clobbers: a0 (advanced), t1-t2, vr0-vr18
 */
function_x264 pixel_sad_x3_16x16_lsx
    slli.d          t1, a4, 1                  // t1 = 2 * ref stride
    add.d           t2, a4, t1                 // t2 = 3 * ref stride

    // Rows 0-3
    vld             vr0, a0, 0
    vld             vr1, a0, 16
    vld             vr2, a0, 32
    vld             vr3, a0, 48
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    vabsd.bu        vr4, vr0, vr4
    vabsd.bu        vr5, vr0, vr5
    vabsd.bu        vr6, vr0, vr6
    vabsd.bu        vr7, vr1, vr7
    vabsd.bu        vr8, vr1, vr8
    vabsd.bu        vr9, vr1, vr9
    vabsd.bu        vr10, vr2, vr10
    vabsd.bu        vr11, vr2, vr11
    vabsd.bu        vr12, vr2, vr12
    vabsd.bu        vr13, vr3, vr13
    vabsd.bu        vr14, vr3, vr14
    vabsd.bu        vr15, vr3, vr15
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vhaddw.hu.bu    vr6, vr6, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vhaddw.hu.bu    vr13, vr13, vr13
    vhaddw.hu.bu    vr14, vr14, vr14
    vhaddw.hu.bu    vr15, vr15, vr15
    // Fold rows per reference: vr16/vr17/vr18 = ref0/1/2 accumulators
    vadd.h          vr0, vr7, vr4
    vadd.h          vr1, vr13, vr10
    vadd.h          vr16, vr1, vr0
    vadd.h          vr0, vr8, vr5
    vadd.h          vr1, vr14, vr11
    vadd.h          vr17, vr1, vr0
    vadd.h          vr0, vr9, vr6
    vadd.h          vr1, vr15, vr12
    vadd.h          vr18, vr1, vr0

    // Three more 4-row groups (rows 4-7, 8-11, 12-15)
.rept 3
    alsl.d          a1, a4, a1, 2              // refs += 4 * stride
    alsl.d          a2, a4, a2, 2
    alsl.d          a3, a4, a3, 2
    addi.d          a0, a0, 64                 // src += 4 rows (stride 16)
    vld             vr0, a0, 0
    vld             vr1, a0, 16
    vld             vr2, a0, 32
    vld             vr3, a0, 48
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    vabsd.bu        vr4, vr0, vr4
    vabsd.bu        vr5, vr0, vr5
    vabsd.bu        vr6, vr0, vr6
    vabsd.bu        vr7, vr1, vr7
    vabsd.bu        vr8, vr1, vr8
    vabsd.bu        vr9, vr1, vr9
    vabsd.bu        vr10, vr2, vr10
    vabsd.bu        vr11, vr2, vr11
    vabsd.bu        vr12, vr2, vr12
    vabsd.bu        vr13, vr3, vr13
    vabsd.bu        vr14, vr3, vr14
    vabsd.bu        vr15, vr3, vr15
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vhaddw.hu.bu    vr6, vr6, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vhaddw.hu.bu    vr13, vr13, vr13
    vhaddw.hu.bu    vr14, vr14, vr14
    vhaddw.hu.bu    vr15, vr15, vr15
    vadd.h          vr0, vr7, vr4
    vadd.h          vr1, vr13, vr10
    vadd.h          vr2, vr1, vr0
    vadd.h          vr0, vr8, vr5
    vadd.h          vr1, vr14, vr11
    vadd.h          vr3, vr1, vr0
    vadd.h          vr0, vr9, vr6
    vadd.h          vr1, vr15, vr12
    vadd.h          vr4, vr1, vr0

    vadd.h          vr16, vr16, vr2
    vadd.h          vr17, vr17, vr3
    vadd.h          vr18, vr18, vr4
.endr

    // Reduce each accumulator h -> w -> d -> q (scalar in lane 0)
    vhaddw.wu.hu    vr16, vr16, vr16
    vhaddw.wu.hu    vr17, vr17, vr17
    vhaddw.wu.hu    vr18, vr18, vr18
    vhaddw.du.wu    vr16, vr16, vr16
    vhaddw.du.wu    vr17, vr17, vr17
    vhaddw.du.wu    vr18, vr18, vr18
    vhaddw.qu.du    vr16, vr16, vr16
    vhaddw.qu.du    vr17, vr17, vr17
    vhaddw.qu.du    vr18, vr18, vr18

    // Store data to p_sad_array
    vstelm.w        vr16, a5, 0, 0             // p_sad_array[0]
    vstelm.w        vr17, a5, 4, 0             // p_sad_array[1]
    vstelm.w        vr18, a5, 8, 0             // p_sad_array[2]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x4_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                int32_t p_sad_array[4])
 *
 * Four SADs of the same 4x8 source block against four references.
 * Source rows come from fixed offsets 0,16,... (stride-16 fenc buffer);
 * refs use the runtime stride a5. Two unrolled 4-row passes.
 * Out: p_sad_array[0..3] = SAD vs p_ref0/1/2/3 (stored via a6).
 * Clobbers: t1-t2, vr0-vr23 (and their f-register views)
 */
function_x264 pixel_sad_x4_4x8_lsx
    slli.d          t1, a5, 1                  // t1 = 2 * ref stride
    add.d           t2, a5, t1                 // t2 = 3 * ref stride

    // Rows 0-3: src (4-byte rows) and four refs
    fld.s           f0, a0, 0
    fld.s           f1, a0, 16
    fld.s           f2, a0, 32
    fld.s           f3, a0, 48
    FLDS_LOADX_4    a1, a5, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a5, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a5, t1, t2, f6, f10, f14, f18
    FLDS_LOADX_4    a4, a5, t1, t2, f7, f11, f15, f19

    // Pack each stream's four rows: vr0 = src, vr1..vr4 = ref0..ref3
    vilvl.w         vr0, vr1, vr0
    vilvl.w         vr2, vr3, vr2
    vilvl.d         vr0, vr2, vr0
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr12, vr16, vr12
    vilvl.d         vr1, vr12, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr13, vr17, vr13
    vilvl.d         vr2, vr13, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr14, vr6
    vilvl.w         vr7, vr11, vr7
    vilvl.w         vr15, vr19, vr15
    vilvl.d         vr4, vr15, vr7
    // |src - refN| per byte; widen to 16-bit sums in vr20..vr23
    vabsd.bu        vr1, vr0, vr1
    vabsd.bu        vr2, vr0, vr2
    vabsd.bu        vr3, vr0, vr3
    vabsd.bu        vr4, vr0, vr4
    vhaddw.hu.bu    vr20, vr1, vr1
    vhaddw.hu.bu    vr21, vr2, vr2
    vhaddw.hu.bu    vr22, vr3, vr3
    vhaddw.hu.bu    vr23, vr4, vr4

    // Rows 4-7: advance refs; src continues at offset 64
    alsl.d          a1, a5, a1, 2
    alsl.d          a2, a5, a2, 2
    alsl.d          a3, a5, a3, 2
    alsl.d          a4, a5, a4, 2
    fld.s           f0, a0, 64
    fld.s           f1, a0, 80
    fld.s           f2, a0, 96
    fld.s           f3, a0, 112
    FLDS_LOADX_4    a1, a5, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a5, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a5, t1, t2, f6, f10, f14, f18
    FLDS_LOADX_4    a4, a5, t1, t2, f7, f11, f15, f19

    vilvl.w         vr0, vr1, vr0
    vilvl.w         vr2, vr3, vr2
    vilvl.d         vr0, vr2, vr0
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr12, vr16, vr12
    vilvl.d         vr1, vr12, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr13, vr17, vr13
    vilvl.d         vr2, vr13, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr14, vr6
    vilvl.w         vr7, vr11, vr7
    vilvl.w         vr15, vr19, vr15
    vilvl.d         vr4, vr15, vr7
    vabsd.bu        vr1, vr0, vr1
    vabsd.bu        vr2, vr0, vr2
    vabsd.bu        vr3, vr0, vr3
    vabsd.bu        vr4, vr0, vr4
    vhaddw.hu.bu    vr1, vr1, vr1
    vhaddw.hu.bu    vr2, vr2, vr2
    vhaddw.hu.bu    vr3, vr3, vr3
    vhaddw.hu.bu    vr4, vr4, vr4
    // Merge passes into vr16..vr19
    vadd.h          vr16, vr20, vr1
    vadd.h          vr17, vr21, vr2
    vadd.h          vr18, vr22, vr3
    vadd.h          vr19, vr23, vr4

    // Reduce each accumulator h -> w -> d -> q (scalar in lane 0)
    vhaddw.wu.hu    vr16, vr16, vr16
    vhaddw.wu.hu    vr17, vr17, vr17
    vhaddw.wu.hu    vr18, vr18, vr18
    vhaddw.wu.hu    vr19, vr19, vr19
    vhaddw.du.wu    vr16, vr16, vr16
    vhaddw.du.wu    vr17, vr17, vr17
    vhaddw.du.wu    vr18, vr18, vr18
    vhaddw.du.wu    vr19, vr19, vr19
    vhaddw.qu.du    vr16, vr16, vr16
    vhaddw.qu.du    vr17, vr17, vr17
    vhaddw.qu.du    vr18, vr18, vr18
    vhaddw.qu.du    vr19, vr19, vr19

    // Store data to p_sad_array
    vstelm.w        vr16, a6, 0, 0             // p_sad_array[0]
    vstelm.w        vr17, a6, 4, 0             // p_sad_array[1]
    vstelm.w        vr18, a6, 8, 0             // p_sad_array[2]
    vstelm.w        vr19, a6, 12, 0            // p_sad_array[3]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x4_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                int32_t p_sad_array[4])
 *
 * Four SADs of the same 8x4 source block against four references.
 * Source rows at fixed offsets 0/16/32/48 (stride-16 fenc buffer);
 * refs use the runtime stride a5.
 * Out: p_sad_array[0..3] = SAD vs p_ref0/1/2/3 (stored via a6).
 * Clobbers: t1-t2, vr0-vr19 (and their f-register views)
 */
function_x264 pixel_sad_x4_8x4_lsx
    slli.d          t1, a5, 1                  // t1 = 2 * ref stride
    add.d           t2, a5, t1                 // t2 = 3 * ref stride

    // Load data from p_src, p_ref0, p_ref1 and p_ref2
    fld.d           f0, a0, 0
    fld.d           f1, a0, 16
    fld.d           f2, a0, 32
    fld.d           f3, a0, 48
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f14, f18
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f15, f19
    // Pair rows 0+1 (vr0, vr4..vr7) and rows 2+3 (vr2, vr12..vr15)
    vilvl.d         vr0, vr1, vr0              // src rows 0+1
    vilvl.d         vr2, vr3, vr2              // src rows 2+3
    vilvl.d         vr4, vr8, vr4              // ref0 rows 0+1
    vilvl.d         vr12, vr16, vr12           // ref0 rows 2+3
    vilvl.d         vr5, vr9, vr5              // ref1 rows 0+1
    vilvl.d         vr13, vr17, vr13           // ref1 rows 2+3
    vilvl.d         vr6, vr10, vr6             // ref2 rows 0+1
    vilvl.d         vr14, vr18, vr14           // ref2 rows 2+3
    vilvl.d         vr7, vr11, vr7             // ref3 rows 0+1
    vilvl.d         vr15, vr19, vr15           // ref3 rows 2+3
    vabsd.bu        vr4, vr0, vr4
    vabsd.bu        vr5, vr0, vr5
    vabsd.bu        vr6, vr0, vr6
    vabsd.bu        vr7, vr0, vr7
    vabsd.bu        vr12, vr2, vr12
    vabsd.bu        vr13, vr2, vr13
    vabsd.bu        vr14, vr2, vr14
    vabsd.bu        vr15, vr2, vr15
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vhaddw.hu.bu    vr6, vr6, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr12, vr12, vr12
    vhaddw.hu.bu    vr13, vr13, vr13
    vhaddw.hu.bu    vr14, vr14, vr14
    vhaddw.hu.bu    vr15, vr15, vr15
    // Fold row pairs per reference, then reduce h -> w -> d -> q
    vadd.h          vr16, vr4, vr12
    vadd.h          vr17, vr5, vr13
    vadd.h          vr18, vr6, vr14
    vadd.h          vr19, vr7, vr15
    vhaddw.wu.hu    vr16, vr16, vr16
    vhaddw.wu.hu    vr17, vr17, vr17
    vhaddw.wu.hu    vr18, vr18, vr18
    vhaddw.wu.hu    vr19, vr19, vr19
    vhaddw.du.wu    vr16, vr16, vr16
    vhaddw.du.wu    vr17, vr17, vr17
    vhaddw.du.wu    vr18, vr18, vr18
    vhaddw.du.wu    vr19, vr19, vr19
    vhaddw.qu.du    vr16, vr16, vr16
    vhaddw.qu.du    vr17, vr17, vr17
    vhaddw.qu.du    vr18, vr18, vr18
    vhaddw.qu.du    vr19, vr19, vr19

    // Store data to p_sad_array
    vstelm.w        vr16, a6, 0, 0             // p_sad_array[0]
    vstelm.w        vr17, a6, 4, 0             // p_sad_array[1]
    vstelm.w        vr18, a6, 8, 0             // p_sad_array[2]
    vstelm.w        vr19, a6, 12, 0            // p_sad_array[3]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x4_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                int32_t p_sad_array[4])
 *
 * Four SADs of the same 8x8 source block against four references:
 * two unrolled 8x4 passes (src offsets 0..48, then 64..112 — a
 * stride-16 fenc buffer); pass-1 partials parked in vr20..vr23.
 * Out: p_sad_array[0..3] = SAD vs p_ref0/1/2/3 (stored via a6).
 * Clobbers: t1-t2, vr0-vr23 (and their f-register views)
 */
function_x264 pixel_sad_x4_8x8_lsx
    slli.d          t1, a5, 1                  // t1 = 2 * ref stride
    add.d           t2, a5, t1                 // t2 = 3 * ref stride

    // Load data from p_src, p_ref0, p_ref1 and p_ref2 (rows 0-3)
    fld.d           f0, a0, 0
    fld.d           f1, a0, 16
    fld.d           f2, a0, 32
    fld.d           f3, a0, 48
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f14, f18
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f15, f19
    // Pair rows 0+1 / rows 2+3 for src and each reference
    vilvl.d         vr0, vr1, vr0              // src rows 0+1
    vilvl.d         vr2, vr3, vr2              // src rows 2+3
    vilvl.d         vr4, vr8, vr4
    vilvl.d         vr12, vr16, vr12
    vilvl.d         vr5, vr9, vr5
    vilvl.d         vr13, vr17, vr13
    vilvl.d         vr6, vr10, vr6
    vilvl.d         vr14, vr18, vr14
    vilvl.d         vr7, vr11, vr7
    vilvl.d         vr15, vr19, vr15
    vabsd.bu        vr4, vr0, vr4
    vabsd.bu        vr5, vr0, vr5
    vabsd.bu        vr6, vr0, vr6
    vabsd.bu        vr7, vr0, vr7
    vabsd.bu        vr12, vr2, vr12
    vabsd.bu        vr13, vr2, vr13
    vabsd.bu        vr14, vr2, vr14
    vabsd.bu        vr15, vr2, vr15
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vhaddw.hu.bu    vr6, vr6, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr12, vr12, vr12
    vhaddw.hu.bu    vr13, vr13, vr13
    vhaddw.hu.bu    vr14, vr14, vr14
    vhaddw.hu.bu    vr15, vr15, vr15
    // Pass-1 partial sums per reference
    vadd.h          vr20, vr4, vr12
    vadd.h          vr21, vr5, vr13
    vadd.h          vr22, vr6, vr14
    vadd.h          vr23, vr7, vr15

    // Rows 4-7: advance refs; src continues at offset 64
    alsl.d          a1, a5, a1, 2
    alsl.d          a2, a5, a2, 2
    alsl.d          a3, a5, a3, 2
    alsl.d          a4, a5, a4, 2
    fld.d           f0, a0, 64
    fld.d           f1, a0, 80
    fld.d           f2, a0, 96
    fld.d           f3, a0, 112
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f14, f18
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f15, f19
    vilvl.d         vr0, vr1, vr0
    vilvl.d         vr2, vr3, vr2
    vilvl.d         vr4, vr8, vr4
    vilvl.d         vr12, vr16, vr12
    vilvl.d         vr5, vr9, vr5
    vilvl.d         vr13, vr17, vr13
    vilvl.d         vr6, vr10, vr6
    vilvl.d         vr14, vr18, vr14
    vilvl.d         vr7, vr11, vr7
    vilvl.d         vr15, vr19, vr15
    vabsd.bu        vr4, vr0, vr4
    vabsd.bu        vr5, vr0, vr5
    vabsd.bu        vr6, vr0, vr6
    vabsd.bu        vr7, vr0, vr7
    vabsd.bu        vr12, vr2, vr12
    vabsd.bu        vr13, vr2, vr13
    vabsd.bu        vr14, vr2, vr14
    vabsd.bu        vr15, vr2, vr15
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vhaddw.hu.bu    vr6, vr6, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr12, vr12, vr12
    vhaddw.hu.bu    vr13, vr13, vr13
    vhaddw.hu.bu    vr14, vr14, vr14
    vhaddw.hu.bu    vr15, vr15, vr15
    vadd.h          vr16, vr4, vr12
    vadd.h          vr17, vr5, vr13
    vadd.h          vr18, vr6, vr14
    vadd.h          vr19, vr7, vr15

    // Merge both passes, then reduce each accumulator h -> w -> d -> q
    vadd.h          vr16, vr16, vr20
    vadd.h          vr17, vr17, vr21
    vadd.h          vr18, vr18, vr22
    vadd.h          vr19, vr19, vr23
    vhaddw.wu.hu    vr16, vr16, vr16
    vhaddw.wu.hu    vr17, vr17, vr17
    vhaddw.wu.hu    vr18, vr18, vr18
    vhaddw.wu.hu    vr19, vr19, vr19
    vhaddw.du.wu    vr16, vr16, vr16
    vhaddw.du.wu    vr17, vr17, vr17
    vhaddw.du.wu    vr18, vr18, vr18
    vhaddw.du.wu    vr19, vr19, vr19
    vhaddw.qu.du    vr16, vr16, vr16
    vhaddw.qu.du    vr17, vr17, vr17
    vhaddw.qu.du    vr18, vr18, vr18
    vhaddw.qu.du    vr19, vr19, vr19
    // Store data to p_sad_array
    vstelm.w        vr16, a6, 0, 0             // p_sad_array[0]
    vstelm.w        vr17, a6, 4, 0             // p_sad_array[1]
    vstelm.w        vr18, a6, 8, 0             // p_sad_array[2]
    vstelm.w        vr19, a6, 12, 0            // p_sad_array[3]
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x4_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[4])
 *
 * Computes four 8x16 SADs at once: p_src (row stride 16 bytes, i.e. the
 * encoder's fenc buffer layout implied by the 0/16/32/48 load offsets)
 * against each of the four reference blocks, which share i_ref_stride.
 * Results go to p_sad_array[0..3].
 *
 * Register roles: vr0/vr2 = two packed pairs of src rows; vr4-vr7 and
 * vr12-vr15 = packed reference rows for ref0..ref3; vr20-vr23 = per-ref
 * 16-bit SAD accumulators (8 rows of 8 u8 diffs per iteration fit with
 * ample headroom in u16 lanes).
 */
function_x264 pixel_sad_x4_8x16_lsx
    // t1 = 2*stride, t2 = 3*stride: row offsets for the 4-row load macro
    slli.d          t1,    a5,    1
    add.d           t2,    a5,    t1

    // Load 4 rows (8 bytes each) from p_src and from p_ref0..p_ref3.
    // FLDD_LOADX_4 presumably loads base, base+s, base+2s, base+3s into the
    // four FP regs (macro defined in loongson_asm.S/loongson_util.S -- not
    // visible here; TODO confirm against the macro definition).
    fld.d           f0,    a0,    0
    fld.d           f1,    a0,    16
    fld.d           f2,    a0,    32
    fld.d           f3,    a0,    48
    FLDD_LOADX_4    a1,    a5,    t1,    t2,    f4,  f8,  f12, f16
    FLDD_LOADX_4    a2,    a5,    t1,    t2,    f5,  f9,  f13, f17
    FLDD_LOADX_4    a3,    a5,    t1,    t2,    f6,  f10, f14, f18
    FLDD_LOADX_4    a4,    a5,    t1,    t2,    f7,  f11, f15, f19
    // Interleave low doublewords: pack two 8-byte rows into one 128-bit
    // vector (vr0 = src rows 0|1, vr2 = src rows 2|3, same per reference).
    vilvl.d         vr0,   vr1,   vr0
    vilvl.d         vr2,   vr3,   vr2
    vilvl.d         vr4,   vr8,   vr4
    vilvl.d         vr12,  vr16,  vr12
    vilvl.d         vr5,   vr9,   vr5
    vilvl.d         vr13,  vr17,  vr13
    vilvl.d         vr6,   vr10,  vr6
    vilvl.d         vr14,  vr18,  vr14
    vilvl.d         vr7,   vr11,  vr7
    vilvl.d         vr15,  vr19,  vr15
    // Unsigned byte absolute differences: src rows 0-1 vs each ref (vr4-7),
    // src rows 2-3 vs each ref (vr12-15)
    vabsd.bu        vr4,   vr0,   vr4
    vabsd.bu        vr5,   vr0,   vr5
    vabsd.bu        vr6,   vr0,   vr6
    vabsd.bu        vr7,   vr0,   vr7
    vabsd.bu        vr12,  vr2,   vr12
    vabsd.bu        vr13,  vr2,   vr13
    vabsd.bu        vr14,  vr2,   vr14
    vabsd.bu        vr15,  vr2,   vr15
    // Pairwise widen u8 diffs to u16 partial sums
    vhaddw.hu.bu    vr4,   vr4,   vr4
    vhaddw.hu.bu    vr5,   vr5,   vr5
    vhaddw.hu.bu    vr6,   vr6,   vr6
    vhaddw.hu.bu    vr7,   vr7,   vr7
    vhaddw.hu.bu    vr12,  vr12,  vr12
    vhaddw.hu.bu    vr13,  vr13,  vr13
    vhaddw.hu.bu    vr14,  vr14,  vr14
    vhaddw.hu.bu    vr15,  vr15,  vr15
    // Initialize the four per-reference accumulators (rows 0-3)
    vadd.h          vr20,  vr4,   vr12
    vadd.h          vr21,  vr5,   vr13
    vadd.h          vr22,  vr6,   vr14
    vadd.h          vr23,  vr7,   vr15

    // Three more 4-row batches => 16 rows total
.rept 3
    // Advance each reference pointer by 4*stride, src by 4 rows (4*16)
    alsl.d          a1,    a5,    a1,    2
    alsl.d          a2,    a5,    a2,    2
    alsl.d          a3,    a5,    a3,    2
    alsl.d          a4,    a5,    a4,    2
    addi.d          a0,    a0,    64
    fld.d           f0,    a0,    0
    fld.d           f1,    a0,    16
    fld.d           f2,    a0,    32
    fld.d           f3,    a0,    48
    FLDD_LOADX_4    a1,    a5,    t1,    t2,    f4,  f8,  f12, f16
    FLDD_LOADX_4    a2,    a5,    t1,    t2,    f5,  f9,  f13, f17
    FLDD_LOADX_4    a3,    a5,    t1,    t2,    f6,  f10, f14, f18
    FLDD_LOADX_4    a4,    a5,    t1,    t2,    f7,  f11, f15, f19
    // Same pack / absdiff / widen sequence as the first batch
    vilvl.d         vr0,   vr1,   vr0
    vilvl.d         vr2,   vr3,   vr2
    vilvl.d         vr4,   vr8,   vr4
    vilvl.d         vr12,  vr16,  vr12
    vilvl.d         vr5,   vr9,   vr5
    vilvl.d         vr13,  vr17,  vr13
    vilvl.d         vr6,   vr10,  vr6
    vilvl.d         vr14,  vr18,  vr14
    vilvl.d         vr7,   vr11,  vr7
    vilvl.d         vr15,  vr19,  vr15
    vabsd.bu        vr4,   vr0,   vr4
    vabsd.bu        vr5,   vr0,   vr5
    vabsd.bu        vr6,   vr0,   vr6
    vabsd.bu        vr7,   vr0,   vr7
    vabsd.bu        vr12,  vr2,   vr12
    vabsd.bu        vr13,  vr2,   vr13
    vabsd.bu        vr14,  vr2,   vr14
    vabsd.bu        vr15,  vr2,   vr15
    vhaddw.hu.bu    vr4,   vr4,   vr4
    vhaddw.hu.bu    vr5,   vr5,   vr5
    vhaddw.hu.bu    vr6,   vr6,   vr6
    vhaddw.hu.bu    vr7,   vr7,   vr7
    vhaddw.hu.bu    vr12,  vr12,  vr12
    vhaddw.hu.bu    vr13,  vr13,  vr13
    vhaddw.hu.bu    vr14,  vr14,  vr14
    vhaddw.hu.bu    vr15,  vr15,  vr15
    // Fold this batch into the accumulators
    vadd.h          vr16,  vr4,   vr12
    vadd.h          vr17,  vr5,   vr13
    vadd.h          vr18,  vr6,   vr14
    vadd.h          vr19,  vr7,   vr15
    vadd.h          vr20,  vr16,  vr20
    vadd.h          vr21,  vr17,  vr21
    vadd.h          vr22,  vr18,  vr22
    vadd.h          vr23,  vr19,  vr23
.endr
    // Horizontal reduction of each accumulator: u16 -> u32 -> u64 -> u128,
    // leaving the full SAD in the low 32 bits of each vector
    vhaddw.wu.hu    vr20,  vr20,  vr20
    vhaddw.wu.hu    vr21,  vr21,  vr21
    vhaddw.wu.hu    vr22,  vr22,  vr22
    vhaddw.wu.hu    vr23,  vr23,  vr23
    vhaddw.du.wu    vr20,  vr20,  vr20
    vhaddw.du.wu    vr21,  vr21,  vr21
    vhaddw.du.wu    vr22,  vr22,  vr22
    vhaddw.du.wu    vr23,  vr23,  vr23
    vhaddw.qu.du    vr20,  vr20,  vr20
    vhaddw.qu.du    vr21,  vr21,  vr21
    vhaddw.qu.du    vr22,  vr22,  vr22
    vhaddw.qu.du    vr23,  vr23,  vr23
    // Store the four 32-bit SADs to p_sad_array
    vstelm.w        vr20,  a6,    0,     0
    vstelm.w        vr21,  a6,    4,     0
    vstelm.w        vr22,  a6,    8,     0
    vstelm.w        vr23,  a6,    12,    0
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x4_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[4])
 *
 * Computes four 16x8 SADs at once: p_src (row stride 16 bytes, per the
 * 0/16/32/48 load offsets) against p_ref0..p_ref3 (shared i_ref_stride),
 * storing the four results in p_sad_array[0..3].
 *
 * Two 4-row batches cover the 8 rows. Per batch: vr0-vr3 = src rows,
 * vr4/8/12/16 = ref0 rows, vr5/9/13/17 = ref1, vr6/10/14/18 = ref2,
 * vr7/11/15/19 = ref3; vr20-vr23 accumulate per-ref 16-bit sums.
 */
function_x264 pixel_sad_x4_16x8_lsx
    // t1 = 2*stride, t2 = 3*stride: row offsets for the 4-row load macro
    slli.d          t1,    a5,    1
    add.d           t2,    a5,    t1

    // Load src rows 0-3 and 4 rows from each reference.
    // LSX_LOADX_4 presumably loads base, base+s, base+2s, base+3s (macro
    // from loongson_asm.S/loongson_util.S -- not visible here; TODO confirm).
    vld             vr0,   a0,    0
    vld             vr1,   a0,    16
    vld             vr2,   a0,    32
    vld             vr3,   a0,    48
    LSX_LOADX_4     a1,    a5,    t1,    t2,    vr4,  vr8,  vr12, vr16
    LSX_LOADX_4     a2,    a5,    t1,    t2,    vr5,  vr9,  vr13, vr17
    LSX_LOADX_4     a3,    a5,    t1,    t2,    vr6,  vr10, vr14, vr18
    LSX_LOADX_4     a4,    a5,    t1,    t2,    vr7,  vr11, vr15, vr19
    // Unsigned byte absolute differences, one src row vs the same row of
    // each of the four references
    vabsd.bu        vr4,   vr0,   vr4
    vabsd.bu        vr5,   vr0,   vr5
    vabsd.bu        vr6,   vr0,   vr6
    vabsd.bu        vr7,   vr0,   vr7
    vabsd.bu        vr8,   vr1,   vr8
    vabsd.bu        vr9,   vr1,   vr9
    vabsd.bu        vr10,  vr1,   vr10
    vabsd.bu        vr11,  vr1,   vr11
    vabsd.bu        vr12,  vr2,   vr12
    vabsd.bu        vr13,  vr2,   vr13
    vabsd.bu        vr14,  vr2,   vr14
    vabsd.bu        vr15,  vr2,   vr15
    vabsd.bu        vr16,  vr3,   vr16
    vabsd.bu        vr17,  vr3,   vr17
    vabsd.bu        vr18,  vr3,   vr18
    vabsd.bu        vr19,  vr3,   vr19
    // Pairwise widen u8 diffs to u16 partial sums
    vhaddw.hu.bu    vr4,   vr4,   vr4
    vhaddw.hu.bu    vr5,   vr5,   vr5
    vhaddw.hu.bu    vr6,   vr6,   vr6
    vhaddw.hu.bu    vr7,   vr7,   vr7
    vhaddw.hu.bu    vr8,   vr8,   vr8
    vhaddw.hu.bu    vr9,   vr9,   vr9
    vhaddw.hu.bu    vr10,  vr10,  vr10
    vhaddw.hu.bu    vr11,  vr11,  vr11
    vhaddw.hu.bu    vr12,  vr12,  vr12
    vhaddw.hu.bu    vr13,  vr13,  vr13
    vhaddw.hu.bu    vr14,  vr14,  vr14
    vhaddw.hu.bu    vr15,  vr15,  vr15
    vhaddw.hu.bu    vr16,  vr16,  vr16
    vhaddw.hu.bu    vr17,  vr17,  vr17
    vhaddw.hu.bu    vr18,  vr18,  vr18
    vhaddw.hu.bu    vr19,  vr19,  vr19
    // Sum the 4 row-sums of each reference into its accumulator
    vadd.h          vr0,   vr4,   vr8
    vadd.h          vr1,   vr12,  vr16
    vadd.h          vr20,  vr0,   vr1            // ref0 rows 0-3
    vadd.h          vr0,   vr5,   vr9
    vadd.h          vr1,   vr13,  vr17
    vadd.h          vr21,  vr0,   vr1            // ref1 rows 0-3
    vadd.h          vr0,   vr6,   vr10
    vadd.h          vr1,   vr14,  vr18
    vadd.h          vr22,  vr0,   vr1            // ref2 rows 0-3
    vadd.h          vr0,   vr7,   vr11
    vadd.h          vr1,   vr15,  vr19
    vadd.h          vr23,  vr0,   vr1            // ref3 rows 0-3

    // Second batch: rows 4-7. Advance refs by 4*stride; src offsets 64..112
    alsl.d          a1,    a5,    a1,    2
    alsl.d          a2,    a5,    a2,    2
    alsl.d          a3,    a5,    a3,    2
    alsl.d          a4,    a5,    a4,    2
    vld             vr0,   a0,    64
    vld             vr1,   a0,    80
    vld             vr2,   a0,    96
    vld             vr3,   a0,    112
    LSX_LOADX_4     a1,    a5,    t1,    t2,    vr4,  vr8,  vr12, vr16
    LSX_LOADX_4     a2,    a5,    t1,    t2,    vr5,  vr9,  vr13, vr17
    LSX_LOADX_4     a3,    a5,    t1,    t2,    vr6,  vr10, vr14, vr18
    LSX_LOADX_4     a4,    a5,    t1,    t2,    vr7,  vr11, vr15, vr19
    vabsd.bu        vr4,   vr0,   vr4
    vabsd.bu        vr5,   vr0,   vr5
    vabsd.bu        vr6,   vr0,   vr6
    vabsd.bu        vr7,   vr0,   vr7
    vabsd.bu        vr8,   vr1,   vr8
    vabsd.bu        vr9,   vr1,   vr9
    vabsd.bu        vr10,  vr1,   vr10
    vabsd.bu        vr11,  vr1,   vr11
    vabsd.bu        vr12,  vr2,   vr12
    vabsd.bu        vr13,  vr2,   vr13
    vabsd.bu        vr14,  vr2,   vr14
    vabsd.bu        vr15,  vr2,   vr15
    vabsd.bu        vr16,  vr3,   vr16
    vabsd.bu        vr17,  vr3,   vr17
    vabsd.bu        vr18,  vr3,   vr18
    vabsd.bu        vr19,  vr3,   vr19
    vhaddw.hu.bu    vr4,   vr4,   vr4
    vhaddw.hu.bu    vr5,   vr5,   vr5
    vhaddw.hu.bu    vr6,   vr6,   vr6
    vhaddw.hu.bu    vr7,   vr7,   vr7
    vhaddw.hu.bu    vr8,   vr8,   vr8
    vhaddw.hu.bu    vr9,   vr9,   vr9
    vhaddw.hu.bu    vr10,  vr10,  vr10
    vhaddw.hu.bu    vr11,  vr11,  vr11
    vhaddw.hu.bu    vr12,  vr12,  vr12
    vhaddw.hu.bu    vr13,  vr13,  vr13
    vhaddw.hu.bu    vr14,  vr14,  vr14
    vhaddw.hu.bu    vr15,  vr15,  vr15
    vhaddw.hu.bu    vr16,  vr16,  vr16
    vhaddw.hu.bu    vr17,  vr17,  vr17
    vhaddw.hu.bu    vr18,  vr18,  vr18
    vhaddw.hu.bu    vr19,  vr19,  vr19
    // Batch sums for rows 4-7 (vr16-vr19 are dead as inputs past each use,
    // so they are reused as the batch totals)
    vadd.h          vr0,   vr4,   vr8
    vadd.h          vr1,   vr12,  vr16
    vadd.h          vr16,  vr0,   vr1
    vadd.h          vr0,   vr5,   vr9
    vadd.h          vr1,   vr13,  vr17
    vadd.h          vr17,  vr0,   vr1
    vadd.h          vr0,   vr6,   vr10
    vadd.h          vr1,   vr14,  vr18
    vadd.h          vr18,  vr0,   vr1
    vadd.h          vr0,   vr7,   vr11
    vadd.h          vr1,   vr15,  vr19
    vadd.h          vr19,  vr0,   vr1

    // Fold second batch into the accumulators
    vadd.h          vr20,  vr16,  vr20
    vadd.h          vr21,  vr17,  vr21
    vadd.h          vr22,  vr18,  vr22
    vadd.h          vr23,  vr19,  vr23
    // Horizontal reduction: u16 -> u32 -> u64 -> u128; SAD lands in the
    // low 32 bits of each accumulator
    vhaddw.wu.hu    vr20,  vr20,  vr20
    vhaddw.wu.hu    vr21,  vr21,  vr21
    vhaddw.wu.hu    vr22,  vr22,  vr22
    vhaddw.wu.hu    vr23,  vr23,  vr23
    vhaddw.du.wu    vr20,  vr20,  vr20
    vhaddw.du.wu    vr21,  vr21,  vr21
    vhaddw.du.wu    vr22,  vr22,  vr22
    vhaddw.du.wu    vr23,  vr23,  vr23
    vhaddw.qu.du    vr20,  vr20,  vr20
    vhaddw.qu.du    vr21,  vr21,  vr21
    vhaddw.qu.du    vr22,  vr22,  vr22
    vhaddw.qu.du    vr23,  vr23,  vr23
    // Store the four 32-bit SADs to p_sad_array
    vstelm.w        vr20,  a6,    0,     0
    vstelm.w        vr21,  a6,    4,     0
    vstelm.w        vr22,  a6,    8,     0
    vstelm.w        vr23,  a6,    12,    0
endfunc_x264
|
|
|
|
/*
 * void x264_pixel_sad_x4_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                  uint8_t *p_ref1, uint8_t *p_ref2,
 *                                  uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                  int32_t p_sad_array[4])
 *
 * Computes four 16x16 SADs at once: p_src (row stride 16 bytes, per the
 * 0/16/32/48 load offsets) against p_ref0..p_ref3 (shared i_ref_stride),
 * storing the four results in p_sad_array[0..3].
 *
 * One explicit 4-row batch followed by a .rept of three more covers all
 * 16 rows. vr20-vr23 are the per-reference 16-bit accumulators; 16 rows
 * of 16 u8 diffs (pairwise-widened) stay well inside u16 range.
 */
function_x264 pixel_sad_x4_16x16_lsx
    // t1 = 2*stride, t2 = 3*stride: row offsets for the 4-row load macro
    slli.d          t1,    a5,    1
    add.d           t2,    a5,    t1

    // Load src rows 0-3 and 4 rows from each reference.
    // LSX_LOADX_4 presumably loads base, base+s, base+2s, base+3s (macro
    // from loongson_asm.S/loongson_util.S -- not visible here; TODO confirm).
    vld             vr0,   a0,    0
    vld             vr1,   a0,    16
    vld             vr2,   a0,    32
    vld             vr3,   a0,    48
    LSX_LOADX_4     a1,    a5,    t1,    t2,    vr4,  vr8,  vr12, vr16
    LSX_LOADX_4     a2,    a5,    t1,    t2,    vr5,  vr9,  vr13, vr17
    LSX_LOADX_4     a3,    a5,    t1,    t2,    vr6,  vr10, vr14, vr18
    LSX_LOADX_4     a4,    a5,    t1,    t2,    vr7,  vr11, vr15, vr19
    // Unsigned byte absolute differences, one src row vs the same row of
    // each of the four references
    vabsd.bu        vr4,   vr0,   vr4
    vabsd.bu        vr5,   vr0,   vr5
    vabsd.bu        vr6,   vr0,   vr6
    vabsd.bu        vr7,   vr0,   vr7
    vabsd.bu        vr8,   vr1,   vr8
    vabsd.bu        vr9,   vr1,   vr9
    vabsd.bu        vr10,  vr1,   vr10
    vabsd.bu        vr11,  vr1,   vr11
    vabsd.bu        vr12,  vr2,   vr12
    vabsd.bu        vr13,  vr2,   vr13
    vabsd.bu        vr14,  vr2,   vr14
    vabsd.bu        vr15,  vr2,   vr15
    vabsd.bu        vr16,  vr3,   vr16
    vabsd.bu        vr17,  vr3,   vr17
    vabsd.bu        vr18,  vr3,   vr18
    vabsd.bu        vr19,  vr3,   vr19
    // Pairwise widen u8 diffs to u16 partial sums
    vhaddw.hu.bu    vr4,   vr4,   vr4
    vhaddw.hu.bu    vr5,   vr5,   vr5
    vhaddw.hu.bu    vr6,   vr6,   vr6
    vhaddw.hu.bu    vr7,   vr7,   vr7
    vhaddw.hu.bu    vr8,   vr8,   vr8
    vhaddw.hu.bu    vr9,   vr9,   vr9
    vhaddw.hu.bu    vr10,  vr10,  vr10
    vhaddw.hu.bu    vr11,  vr11,  vr11
    vhaddw.hu.bu    vr12,  vr12,  vr12
    vhaddw.hu.bu    vr13,  vr13,  vr13
    vhaddw.hu.bu    vr14,  vr14,  vr14
    vhaddw.hu.bu    vr15,  vr15,  vr15
    vhaddw.hu.bu    vr16,  vr16,  vr16
    vhaddw.hu.bu    vr17,  vr17,  vr17
    vhaddw.hu.bu    vr18,  vr18,  vr18
    vhaddw.hu.bu    vr19,  vr19,  vr19
    // Initialize the four per-reference accumulators (rows 0-3)
    vadd.h          vr0,   vr4,   vr8
    vadd.h          vr1,   vr12,  vr16
    vadd.h          vr20,  vr0,   vr1            // ref0
    vadd.h          vr0,   vr5,   vr9
    vadd.h          vr1,   vr13,  vr17
    vadd.h          vr21,  vr0,   vr1            // ref1
    vadd.h          vr0,   vr6,   vr10
    vadd.h          vr1,   vr14,  vr18
    vadd.h          vr22,  vr0,   vr1            // ref2
    vadd.h          vr0,   vr7,   vr11
    vadd.h          vr1,   vr15,  vr19
    vadd.h          vr23,  vr0,   vr1            // ref3

    // Three more 4-row batches => 16 rows total
.rept 3
    // Advance each reference by 4*stride, src by 4 rows (4*16)
    alsl.d          a1,    a5,    a1,    2
    alsl.d          a2,    a5,    a2,    2
    alsl.d          a3,    a5,    a3,    2
    alsl.d          a4,    a5,    a4,    2
    addi.d          a0,    a0,    64
    vld             vr0,   a0,    0
    vld             vr1,   a0,    16
    vld             vr2,   a0,    32
    vld             vr3,   a0,    48
    LSX_LOADX_4     a1,    a5,    t1,    t2,    vr4,  vr8,  vr12, vr16
    LSX_LOADX_4     a2,    a5,    t1,    t2,    vr5,  vr9,  vr13, vr17
    LSX_LOADX_4     a3,    a5,    t1,    t2,    vr6,  vr10, vr14, vr18
    LSX_LOADX_4     a4,    a5,    t1,    t2,    vr7,  vr11, vr15, vr19
    // Same absdiff / widen sequence as the first batch
    vabsd.bu        vr4,   vr0,   vr4
    vabsd.bu        vr5,   vr0,   vr5
    vabsd.bu        vr6,   vr0,   vr6
    vabsd.bu        vr7,   vr0,   vr7
    vabsd.bu        vr8,   vr1,   vr8
    vabsd.bu        vr9,   vr1,   vr9
    vabsd.bu        vr10,  vr1,   vr10
    vabsd.bu        vr11,  vr1,   vr11
    vabsd.bu        vr12,  vr2,   vr12
    vabsd.bu        vr13,  vr2,   vr13
    vabsd.bu        vr14,  vr2,   vr14
    vabsd.bu        vr15,  vr2,   vr15
    vabsd.bu        vr16,  vr3,   vr16
    vabsd.bu        vr17,  vr3,   vr17
    vabsd.bu        vr18,  vr3,   vr18
    vabsd.bu        vr19,  vr3,   vr19
    vhaddw.hu.bu    vr4,   vr4,   vr4
    vhaddw.hu.bu    vr5,   vr5,   vr5
    vhaddw.hu.bu    vr6,   vr6,   vr6
    vhaddw.hu.bu    vr7,   vr7,   vr7
    vhaddw.hu.bu    vr8,   vr8,   vr8
    vhaddw.hu.bu    vr9,   vr9,   vr9
    vhaddw.hu.bu    vr10,  vr10,  vr10
    vhaddw.hu.bu    vr11,  vr11,  vr11
    vhaddw.hu.bu    vr12,  vr12,  vr12
    vhaddw.hu.bu    vr13,  vr13,  vr13
    vhaddw.hu.bu    vr14,  vr14,  vr14
    vhaddw.hu.bu    vr15,  vr15,  vr15
    vhaddw.hu.bu    vr16,  vr16,  vr16
    vhaddw.hu.bu    vr17,  vr17,  vr17
    vhaddw.hu.bu    vr18,  vr18,  vr18
    vhaddw.hu.bu    vr19,  vr19,  vr19
    // Batch totals (vr16-vr19 are dead as inputs past each use, so they
    // are reused to hold this batch's per-reference sums)
    vadd.h          vr0,   vr4,   vr8
    vadd.h          vr1,   vr12,  vr16
    vadd.h          vr16,  vr0,   vr1
    vadd.h          vr0,   vr5,   vr9
    vadd.h          vr1,   vr13,  vr17
    vadd.h          vr17,  vr0,   vr1
    vadd.h          vr0,   vr6,   vr10
    vadd.h          vr1,   vr14,  vr18
    vadd.h          vr18,  vr0,   vr1
    vadd.h          vr0,   vr7,   vr11
    vadd.h          vr1,   vr15,  vr19
    vadd.h          vr19,  vr0,   vr1
    // Fold this batch into the accumulators
    vadd.h          vr20,  vr16,  vr20
    vadd.h          vr21,  vr17,  vr21
    vadd.h          vr22,  vr18,  vr22
    vadd.h          vr23,  vr19,  vr23
.endr
    // Horizontal reduction: u16 -> u32 -> u64 -> u128; SAD lands in the
    // low 32 bits of each accumulator
    vhaddw.wu.hu    vr20,  vr20,  vr20
    vhaddw.wu.hu    vr21,  vr21,  vr21
    vhaddw.wu.hu    vr22,  vr22,  vr22
    vhaddw.wu.hu    vr23,  vr23,  vr23
    vhaddw.du.wu    vr20,  vr20,  vr20
    vhaddw.du.wu    vr21,  vr21,  vr21
    vhaddw.du.wu    vr22,  vr22,  vr22
    vhaddw.du.wu    vr23,  vr23,  vr23
    vhaddw.qu.du    vr20,  vr20,  vr20
    vhaddw.qu.du    vr21,  vr21,  vr21
    vhaddw.qu.du    vr22,  vr22,  vr22
    vhaddw.qu.du    vr23,  vr23,  vr23
    // Store the four 32-bit SADs to p_sad_array
    vstelm.w        vr20,  a6,    0,     0
    vstelm.w        vr21,  a6,    4,     0
    vstelm.w        vr22,  a6,    8,     0
    vstelm.w        vr23,  a6,    12,    0
endfunc_x264
|
|
#endif /* !HIGH_BIT_DEPTH */
|