/***************************************************************************** * sad-a.S: loongarch sad functions ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Lu Wang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
 *****************************************************************************/

#include "loongson_asm.S"
#include "loongson_util.S"

#if !HIGH_BIT_DEPTH

/* void x264_pixel_sad_x4_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                   uint8_t *p_ref1, uint8_t *p_ref2,
 *                                   uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                   int32_t p_sad_array[4])
 *
 * 16x16 SAD of p_src against four reference blocks in a single pass.
 * p_src rows are packed 16 bytes apart (each 32-byte xvld covers two rows),
 * while the references use i_ref_stride.  Two reference rows are packed per
 * 256-bit register via xvpermi.q so one xvabsd.bu handles two rows at once.
 */
function_x264 pixel_sad_x4_16x16_lasx
    slli.d          t1, a5, 1               // t1 = 2 * i_ref_stride
    add.d           t2, a5, t1              // t2 = 3 * i_ref_stride
    slli.d          t3, a5, 2               // t3 = 4 * i_ref_stride
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 0-3)
    xvld            xr3, a0, 0
    xvld            xr16, a0, 32
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02          // pack rows 0|1 of each ref
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    // xr12-xr15: running 16-bit SAD accumulators, one per reference
    xvhaddw.hu.bu   xr12, xr8, xr8
    xvhaddw.hu.bu   xr13, xr9, xr9
    xvhaddw.hu.bu   xr14, xr10, xr10
    xvhaddw.hu.bu   xr15, xr11, xr11
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 2-3)
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Advance the four reference pointers by 4 rows
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 4-7)
    xvld            xr3, a0, 64
    xvld            xr16, a0, 96
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 6-7)
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 8-11)
    xvld            xr3, a0, 128
    xvld            xr16, a0, 160
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 10-11)
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 12-15)
    xvld            xr3, a0, 192
    xvld            xr16, a0, 224
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 14-15)
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr16, xr4
    xvabsd.bu       xr9, xr16, xr5
    xvabsd.bu       xr10, xr16, xr6
    xvabsd.bu       xr11, xr16, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Pair the accumulators (ref0|ref2 and ref1|ref3 per 128-bit lane),
    // then reduce each lane to one 32-bit sum
    xvori.b         xr17, xr12, 0
    xvori.b         xr18, xr13, 0
    xvpermi.q       xr12, xr14, 0x02
    xvpermi.q       xr14, xr17, 0x31
    xvpermi.q       xr13, xr15, 0x02
    xvpermi.q       xr15, xr18, 0x31
    xvadd.h         xr12, xr12, xr14
    xvadd.h         xr13, xr13, xr15
    xvhaddw.w.h     xr12, xr12, xr12
    xvhaddw.w.h     xr13, xr13, xr13
    xvhaddw.d.w     xr12, xr12, xr12
    xvhaddw.d.w     xr13, xr13, xr13
    xvhaddw.q.d     xr12, xr12, xr12
    xvhaddw.q.d     xr13, xr13, xr13
    xvpackev.w      xr13, xr13, xr12        // interleave: {s0,s1,..,s2,s3,..}
    // Store data to p_sad_array
    xvstelm.d       xr13, a6, 0, 0          // p_sad_array[0..1]
    xvstelm.d       xr13, a6, 8, 2          // p_sad_array[2..3]
endfunc_x264

/* void x264_pixel_sad_x4_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                  uint8_t *p_ref1, uint8_t *p_ref2,
 *                                  uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                  int32_t p_sad_array[4])
 *
 * 16x8 variant of the x4 SAD above: same 2-rows-per-register scheme,
 * but only 8 rows are accumulated before the final reduction.
 */
function_x264 pixel_sad_x4_16x8_lasx
    slli.d          t1, a5, 1               // t1 = 2 * i_ref_stride
    add.d           t2, a5, t1              // t2 = 3 * i_ref_stride
    slli.d          t3, a5, 2               // t3 = 4 * i_ref_stride
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 0-1)
    xvld            xr3, a0, 0
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    // xr12-xr15: running 16-bit SAD accumulators, one per reference
    xvhaddw.hu.bu   xr12, xr8, xr8
    xvhaddw.hu.bu   xr13, xr9, xr9
    xvhaddw.hu.bu   xr14, xr10, xr10
    xvhaddw.hu.bu   xr15, xr11, xr11
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 2-3)
    xvld            xr3, a0, 32
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Advance the four reference pointers by 4 rows
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 4-5)
    xvld            xr3, a0, 64
    vld             vr4, a1, 0
    vldx            vr8, a1, a5
    vld             vr5, a2, 0
    vldx            vr9, a2, a5
    vld             vr6, a3, 0
    vldx            vr10, a3, a5
    vld             vr7, a4, 0
    vldx            vr11, a4, a5
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 6-7)
    xvld            xr3, a0, 96
    vldx            vr4, a1, t1
    vldx            vr8, a1, t2
    vldx            vr5, a2, t1
    vldx            vr9, a2, t2
    vldx            vr6, a3, t1
    vldx            vr10, a3, t2
    vldx            vr7, a4, t1
    vldx            vr11, a4, t2
    xvpermi.q       xr4, xr8, 0x02
    xvpermi.q       xr5, xr9, 0x02
    xvpermi.q       xr6, xr10, 0x02
    xvpermi.q       xr7, xr11, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvadd.h         xr12, xr12, xr8
    xvadd.h         xr13, xr13, xr9
    xvadd.h         xr14, xr14, xr10
    xvadd.h         xr15, xr15, xr11
    // Pair the accumulators and reduce each 128-bit lane to a 32-bit sum
    xvori.b         xr17, xr12, 0
    xvori.b         xr18, xr13, 0
    xvpermi.q       xr12, xr14, 0x02
    xvpermi.q       xr14, xr17, 0x31
    xvpermi.q       xr13, xr15, 0x02
    xvpermi.q       xr15, xr18, 0x31
    xvadd.h         xr12, xr12, xr14
    xvadd.h         xr13, xr13, xr15
    xvhaddw.w.h     xr12, xr12, xr12
    xvhaddw.w.h     xr13, xr13, xr13
    xvhaddw.d.w     xr12, xr12, xr12
    xvhaddw.d.w     xr13, xr13, xr13
    xvhaddw.q.d     xr12, xr12, xr12
    xvhaddw.q.d     xr13, xr13, xr13
    xvpackev.w      xr13, xr13, xr12
    // Store data to p_sad_array
    xvstelm.d       xr13, a6, 0, 0          // p_sad_array[0..1]
    xvstelm.d       xr13, a6, 8, 2          // p_sad_array[2..3]
endfunc_x264

/* void x264_pixel_sad_x4_8x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[4])
 *
 * 8x8 x4 SAD.  Four 8-byte ref rows are interleaved into one 256-bit
 * register per reference pair; the broadcast source row pair (xvldrepl.d
 * loads 16 bytes-worth as repeated doublewords) is compared against all
 * of them.  Even/odd widening adds keep per-reference sums separable.
 */
function_x264 pixel_sad_x4_8x8_lasx
    slli.d          t1, a5, 1               // t1 = 2 * i_ref_stride
    add.d           t2, t1, a5              // t2 = 3 * i_ref_stride
    slli.d          t3, a5, 2               // t3 = 4 * i_ref_stride
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 0-3)
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f14, f18
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f15, f19
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f16, f20
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f17, f21
    vilvl.d         vr4, vr5, vr4           // ref0 r0 | ref1 r0
    vilvl.d         vr6, vr7, vr6           // ref2 r0 | ref3 r0
    vilvl.d         vr8, vr9, vr8           // ref0 r1 | ref1 r1
    vilvl.d         vr10, vr11, vr10        // ref2 r1 | ref3 r1
    vilvl.d         vr14, vr15, vr14        // ref0 r2 | ref1 r2
    vilvl.d         vr16, vr17, vr16        // ref2 r2 | ref3 r2
    vilvl.d         vr18, vr19, vr18        // ref0 r3 | ref1 r3
    vilvl.d         vr20, vr21, vr20        // ref2 r3 | ref3 r3
    xvpermi.q       xr4, xr6, 0x02          // row 0 of all four refs
    xvpermi.q       xr8, xr10, 0x02         // row 1 of all four refs
    xvpermi.q       xr14, xr16, 0x02        // row 2
    xvpermi.q       xr18, xr20, 0x02        // row 3
    // Calculate the absolute value of the difference
    xvldrepl.d      xr3, a0, 0              // broadcast src row 0
    xvabsd.bu       xr5, xr3, xr4
    xvldrepl.d      xr3, a0, 16             // src row 1
    xvabsd.bu       xr9, xr3, xr8
    xvldrepl.d      xr3, a0, 32             // src row 2
    xvabsd.bu       xr10, xr3, xr14
    xvldrepl.d      xr3, a0, 48             // src row 3
    xvabsd.bu       xr11, xr3, xr18
    // Widen to 16 bits; even/odd split keeps lanes from overflowing
    xvaddwev.h.bu   xr0, xr5, xr9
    xvaddwod.h.bu   xr1, xr5, xr9
    xvaddwev.h.bu   xr2, xr10, xr11
    xvaddwod.h.bu   xr22, xr10, xr11
    // Advance the four reference pointers by 4 rows
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    add.d           a4, a4, t3
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 4-7)
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f14, f18
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f15, f19
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f16, f20
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f17, f21
    vilvl.d         vr4, vr5, vr4
    vilvl.d         vr6, vr7, vr6
    vilvl.d         vr8, vr9, vr8
    vilvl.d         vr10, vr11, vr10
    vilvl.d         vr14, vr15, vr14
    vilvl.d         vr16, vr17, vr16
    vilvl.d         vr18, vr19, vr18
    vilvl.d         vr20, vr21, vr20
    xvpermi.q       xr4, xr6, 0x02
    xvpermi.q       xr8, xr10, 0x02
    xvpermi.q       xr14, xr16, 0x02
    xvpermi.q       xr18, xr20, 0x02
    // Calculate the absolute value of the difference
    xvldrepl.d      xr3, a0, 64             // src row 4
    xvabsd.bu       xr5, xr3, xr4
    xvldrepl.d      xr3, a0, 80             // src row 5
    xvabsd.bu       xr9, xr3, xr8
    xvldrepl.d      xr3, a0, 96             // src row 6
    xvabsd.bu       xr10, xr3, xr14
    xvldrepl.d      xr3, a0, 112            // src row 7
    xvabsd.bu       xr11, xr3, xr18
    xvaddwev.h.bu   xr12, xr5, xr9
    xvaddwod.h.bu   xr13, xr5, xr9
    xvaddwev.h.bu   xr14, xr10, xr11
    xvaddwod.h.bu   xr15, xr10, xr11
    // Merge both half-block accumulators and reduce per 64-bit group
    xvadd.h         xr5, xr0, xr12
    xvadd.h         xr9, xr1, xr13
    xvadd.h         xr10, xr2, xr14
    xvadd.h         xr11, xr22, xr15
    xvadd.h         xr5, xr5, xr9
    xvadd.h         xr10, xr10, xr11
    xvadd.h         xr10, xr10, xr5
    xvhaddw.wu.hu   xr10, xr10, xr10
    xvhaddw.du.wu   xr10, xr10, xr10        // one 64-bit SAD per reference
    xvpermi.q       xr5, xr10, 0x01
    xvpickev.w      xr10, xr5, xr10         // compact the four 32-bit sums
    // Store data to p_sad_array
    vst             vr10, a6, 0
endfunc_x264

/* void x264_pixel_sad_x4_8x4_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                 uint8_t *p_ref1, uint8_t *p_ref2,
 *                                 uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                 int32_t p_sad_array[4])
 *
 * 8x4 x4 SAD.  Each 256-bit register deliberately mixes two references
 * (e.g. xr4 = ref0 rows 0-1 | ref2 rows 2-3); the xvpermi.d 0x4e lane
 * swap below realigns the halves so each lane accumulates one reference.
 */
function_x264 pixel_sad_x4_8x4_lasx
    slli.d          t1, a5, 1               // t1 = 2 * i_ref_stride
    add.d           t2, t1, a5              // t2 = 3 * i_ref_stride
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3
    fld.d           f2, a0, 0
    fld.d           f3, a0, 16
    fld.d           f12, a0, 32
    fld.d           f13, a0, 48
    FLDD_LOADX_4    a1, a5, t1, t2, f4, f8, f14, f18
    FLDD_LOADX_4    a2, a5, t1, t2, f5, f9, f15, f19
    FLDD_LOADX_4    a3, a5, t1, t2, f6, f10, f16, f20
    FLDD_LOADX_4    a4, a5, t1, t2, f7, f11, f17, f21
    vilvl.d         vr3, vr3, vr2           // src rows 0-1
    vilvl.d         vr4, vr8, vr4           // ref0 rows 0-1
    vilvl.d         vr5, vr9, vr5           // ref1 rows 0-1
    vilvl.d         vr6, vr10, vr6          // ref2 rows 0-1
    vilvl.d         vr7, vr11, vr7          // ref3 rows 0-1
    vilvl.d         vr13, vr13, vr12        // src rows 2-3
    vilvl.d         vr14, vr18, vr14        // ref0 rows 2-3
    vilvl.d         vr15, vr19, vr15        // ref1 rows 2-3
    vilvl.d         vr16, vr20, vr16        // ref2 rows 2-3
    vilvl.d         vr17, vr21, vr17        // ref3 rows 2-3
    xvpermi.q       xr3, xr13, 0x02         // src r0-1 | src r2-3
    xvpermi.q       xr4, xr16, 0x02         // ref0 r0-1 | ref2 r2-3
    xvpermi.q       xr5, xr17, 0x02         // ref1 r0-1 | ref3 r2-3
    xvpermi.q       xr6, xr14, 0x02         // ref2 r0-1 | ref0 r2-3
    xvpermi.q       xr7, xr15, 0x02         // ref3 r0-1 | ref1 r2-3
    // Calculate the absolute value of the difference
    xvabsd.bu       xr8, xr3, xr4
    xvabsd.bu       xr9, xr3, xr5
    xvabsd.bu       xr10, xr3, xr6
    xvabsd.bu       xr11, xr3, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    // Swap 128-bit halves so lanes line up per reference, then combine:
    // xr8 lane0 = ref0, lane1 = ref2; xr9 lane0 = ref1, lane1 = ref3
    xvpermi.d       xr10, xr10, 0x4e
    xvpermi.d       xr11, xr11, 0x4e
    xvadd.h         xr8, xr8, xr10
    xvadd.h         xr9, xr9, xr11
    xvhaddw.w.h     xr8, xr8, xr8
    xvhaddw.w.h     xr9, xr9, xr9
    xvhaddw.d.w     xr8, xr8, xr8
    xvhaddw.d.w     xr9, xr9, xr9
    xvhaddw.q.d     xr8, xr8, xr8
    xvhaddw.q.d     xr9, xr9, xr9
    xvpackev.w      xr9, xr9, xr8
    // Store data to p_sad_array
    xvstelm.d       xr9, a6, 0, 0           // p_sad_array[0..1]
    xvstelm.d       xr9, a6, 8, 2           // p_sad_array[2..3]
endfunc_x264

/* void x264_pixel_sad_x4_4x4_lsx(uint8_t *p_src,
 *                                uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                uint8_t *p_ref3, intptr_t i_ref_stride,
 *                                int32_t p_sad_array[4])
 *
 * 4x4 x4 SAD using 128-bit LSX only: each whole 4x4 block (16 bytes) is
 * gathered into one vector register, so one vabsd.bu per reference covers
 * the full block.
 */
function_x264 pixel_sad_x4_4x4_lsx
    slli.d          t0, a5, 1               // t0 = 2 * i_ref_stride
    add.d           t1, a5, t0              // t1 = 3 * i_ref_stride
    slli.d          t2, a5, 2               // t2 = 4 * i_ref_stride (unused below)
    // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 (rows 0-1)
    fld.s           f2, a0, 0
    fld.s           f3, a0, 16
    fld.s           f4, a1, 0
    fldx.s          f8, a1, a5
    fld.s           f5, a2, 0
    fldx.s          f9, a2, a5
    fld.s           f6, a3, 0
    fldx.s          f10, a3, a5
    fld.s           f7, a4, 0
    fldx.s          f11, a4, a5
    vilvl.w         vr3, vr3, vr2           // src r0 | r1
    vilvl.w         vr4, vr8, vr4           // ref0 r0 | r1
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr7, vr11, vr7
    // Rows 2-3
    fld.s           f2, a0, 32
    fld.s           f0, a0, 48
    fldx.s          f8, a1, t0
    fldx.s          f12, a1, t1
    fldx.s          f9, a2, t0
    fldx.s          f13, a2, t1
    fldx.s          f10, a3, t0
    fldx.s          f14, a3, t1
    fldx.s          f11, a4, t0
    fldx.s          f15, a4, t1
    vilvl.w         vr2, vr0, vr2           // src r2 | r3
    vilvl.w         vr8, vr12, vr8
    vilvl.w         vr9, vr13, vr9
    vilvl.w         vr10, vr14, vr10
    vilvl.w         vr11, vr15, vr11
    vilvl.d         vr3, vr2, vr3           // full 4x4 src block
    vilvl.d         vr4, vr8, vr4           // full 4x4 ref0 block
    vilvl.d         vr5, vr9, vr5
    vilvl.d         vr6, vr10, vr6
    vilvl.d         vr7, vr11, vr7
    // Calculate the absolute value of the difference
    vabsd.bu        vr8, vr3, vr4
    vabsd.bu        vr9, vr3, vr5
    vabsd.bu        vr10, vr3, vr6
    vabsd.bu        vr11, vr3, vr7
    // Horizontal reduction to a single 32-bit SAD per reference
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.wu.hu    vr10, vr10, vr10
    vhaddw.wu.hu    vr11, vr11, vr11
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.du.wu    vr10, vr10, vr10
    vhaddw.du.wu    vr11, vr11, vr11
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9
    vhaddw.qu.du    vr10, vr10, vr10
    vhaddw.qu.du    vr11, vr11, vr11
    // Store data to p_sad_array
    vstelm.w        vr8, a6, 0, 0
    vstelm.w        vr9, a6, 4, 0
    vstelm.w        vr10, a6, 8, 0
    vstelm.w        vr11, a6, 12, 0
endfunc_x264

/* void x264_pixel_sad_x3_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                   uint8_t *p_ref1, uint8_t *p_ref2,
 *                                   intptr_t i_ref_stride,
 *                                   int32_t p_sad_array[3])
 *
 * 16x16 SAD of p_src against three references.  Two source row pairs
 * (xr2/xr3) are processed per iteration; xr16-xr21 hold the six running
 * 16-bit accumulators (3 refs x 2 row groups).
 */
function_x264 pixel_sad_x3_16x16_lasx
    // Load data from p_src, p_ref0, p_ref1 and p_ref2
    slli.d          t1, a4, 1               // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1              // t2 = 3 * i_ref_stride
    slli.d          t3, a4, 2               // t3 = 4 * i_ref_stride
    xvld            xr2, a0, 0
    xvld            xr3, a0, 32
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02          // ref0 rows 0-1
    xvpermi.q       xr5, xr8, 0x02          // ref1 rows 0-1
    xvpermi.q       xr6, xr9, 0x02          // ref2 rows 0-1
    xvpermi.q       xr10, xr13, 0x02        // ref0 rows 2-3
    xvpermi.q       xr11, xr14, 0x02        // ref1 rows 2-3
    xvpermi.q       xr12, xr15, 0x02        // ref2 rows 2-3
    // Calculate the absolute value of the difference
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr16, xr7, xr7
    xvhaddw.hu.bu   xr17, xr8, xr8
    xvhaddw.hu.bu   xr18, xr9, xr9
    xvhaddw.hu.bu   xr19, xr10, xr10
    xvhaddw.hu.bu   xr20, xr11, xr11
    xvhaddw.hu.bu   xr21, xr12, xr12
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    // Rows 4-7
    xvld            xr2, a0, 64
    xvld            xr3, a0, 96
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    // Rows 8-11
    xvld            xr2, a0, 128
    xvld            xr3, a0, 160
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    // Rows 12-15
    xvld            xr2, a0, 192
    xvld            xr3, a0, 224
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12
    // Fold the two row-group accumulators per reference, reduce to 32 bits
    xvadd.h         xr11, xr16, xr19
    xvadd.h         xr12, xr17, xr20
    xvadd.h         xr13, xr18, xr21
    xvhaddw.wu.hu   xr11, xr11, xr11
    xvhaddw.wu.hu   xr12, xr12, xr12
    xvhaddw.wu.hu   xr13, xr13, xr13
    xvhaddw.du.wu   xr11, xr11, xr11
    xvhaddw.du.wu   xr12, xr12, xr12
    xvhaddw.du.wu   xr13, xr13, xr13
    xvhaddw.qu.du   xr11, xr11, xr11
    xvhaddw.qu.du   xr12, xr12, xr12
    xvhaddw.qu.du   xr13, xr13, xr13
    // Add the upper 128-bit lane's partial sum into the lower one
    xvpickve.w      xr17, xr11, 4
    xvpickve.w      xr18, xr12, 4
    xvpickve.w      xr19, xr13, 4
    xvadd.w         xr11, xr11, xr17
    xvadd.w         xr12, xr12, xr18
    xvadd.w         xr13, xr13, xr19
    // Store data to p_sad_array
    vstelm.w        vr11, a5, 0, 0
    vstelm.w        vr12, a5, 4, 0
    vstelm.w        vr13, a5, 8, 0
endfunc_x264

/* void x264_pixel_sad_x3_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0,
 *                                  uint8_t *p_ref1, uint8_t *p_ref2,
 *                                  intptr_t i_ref_stride,
 *                                  int32_t p_sad_array[3])
 *
 * 16x8 variant of the x3 SAD above: two 4-row iterations instead of four.
 */
function_x264 pixel_sad_x3_16x8_lasx
    // Load data from p_src, p_ref0, p_ref1 and p_ref2
    slli.d          t1, a4, 1               // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1              // t2 = 3 * i_ref_stride
    slli.d          t3, a4, 2               // t3 = 4 * i_ref_stride
    xvld            xr2, a0, 0
    xvld            xr3, a0, 32
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr16, xr7, xr7
    xvhaddw.hu.bu   xr17, xr8, xr8
    xvhaddw.hu.bu   xr18, xr9, xr9
    xvhaddw.hu.bu   xr19, xr10, xr10
    xvhaddw.hu.bu   xr20, xr11, xr11
    xvhaddw.hu.bu   xr21, xr12, xr12
    add.d           a1, a1, t3
    add.d           a2, a2, t3
    add.d           a3, a3, t3
    // Rows 4-7
    xvld            xr2, a0, 64
    xvld            xr3, a0, 96
    LSX_LOADX_4     a1, a4, t1, t2, vr4, vr7, vr10, vr13
    LSX_LOADX_4     a2, a4, t1, t2, vr5, vr8, vr11, vr14
    LSX_LOADX_4     a3, a4, t1, t2, vr6, vr9, vr12, vr15
    xvpermi.q       xr4, xr7, 0x02
    xvpermi.q       xr5, xr8, 0x02
    xvpermi.q       xr6, xr9, 0x02
    xvpermi.q       xr10, xr13, 0x02
    xvpermi.q       xr11, xr14, 0x02
    xvpermi.q       xr12, xr15, 0x02
    // Calculate the absolute value of the difference
    xvabsd.bu       xr7, xr2, xr4
    xvabsd.bu       xr8, xr2, xr5
    xvabsd.bu       xr9, xr2, xr6
    xvabsd.bu       xr10, xr3, xr10
    xvabsd.bu       xr11, xr3, xr11
    xvabsd.bu       xr12, xr3, xr12
    xvhaddw.hu.bu   xr7, xr7, xr7
    xvhaddw.hu.bu   xr8, xr8, xr8
    xvhaddw.hu.bu   xr9, xr9, xr9
    xvhaddw.hu.bu   xr10, xr10, xr10
    xvhaddw.hu.bu   xr11, xr11, xr11
    xvhaddw.hu.bu   xr12, xr12, xr12
    xvadd.h         xr16, xr16, xr7
    xvadd.h         xr17, xr17, xr8
    xvadd.h         xr18, xr18, xr9
    xvadd.h         xr19, xr19, xr10
    xvadd.h         xr20, xr20, xr11
    xvadd.h         xr21, xr21, xr12
    // Fold accumulators and reduce to one 32-bit sum per reference
    xvadd.h         xr11, xr16, xr19
    xvadd.h         xr12, xr17, xr20
    xvadd.h         xr13, xr18, xr21
    xvhaddw.wu.hu   xr11, xr11, xr11
    xvhaddw.wu.hu   xr12, xr12, xr12
    xvhaddw.wu.hu   xr13, xr13, xr13
    xvhaddw.du.wu   xr11, xr11, xr11
    xvhaddw.du.wu   xr12, xr12, xr12
    xvhaddw.du.wu   xr13, xr13, xr13
    xvhaddw.qu.du   xr11, xr11, xr11
    xvhaddw.qu.du   xr12, xr12, xr12
    xvhaddw.qu.du   xr13, xr13, xr13
    xvpickve.w      xr17, xr11, 4
    xvpickve.w      xr18, xr12, 4
    xvpickve.w      xr19, xr13, 4
    xvadd.w         xr11, xr11, xr17
    xvadd.w         xr12, xr12, xr18
    xvadd.w         xr13, xr13, xr19
    // Store data to p_sad_array
    vstelm.w        vr11, a5, 0, 0
    vstelm.w        vr12, a5, 4, 0
    vstelm.w        vr13, a5, 8, 0
endfunc_x264

/* void x264_pixel_sad_x3_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t i_ref_stride,
 *                                int32_t p_sad_array[3])
 *
 * 4x4 x3 SAD: each 4x4 block is gathered into one 128-bit register.
 */
function_x264 pixel_sad_x3_4x4_lsx
    slli.d          t1, a4, 1               // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1              // t2 = 3 * i_ref_stride
    // Load data from p_src, p_ref0, p_ref1 and p_ref2
    fld.s           f3, a0, 0
    fld.s           f7, a0, 16
    fld.s           f11, a0, 32
    fld.s           f15, a0, 48
    FLDS_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.w         vr3, vr7, vr3           // src r0 | r1
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr11, vr15, vr11        // src r2 | r3
    vilvl.w         vr12, vr16, vr12
    vilvl.w         vr13, vr17, vr13
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr11, vr3          // full 4x4 src block
    vilvl.d         vr4, vr12, vr4
    vilvl.d         vr5, vr13, vr5
    vilvl.d         vr6, vr14, vr6
    // Calculate the absolute value of the difference
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.wu.hu    vr7, vr7, vr7
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.du.wu    vr7, vr7, vr7
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.qu.du    vr7, vr7, vr7
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9
    // Store data to p_sad_array
    vstelm.w        vr7, a5, 0, 0
    vstelm.w        vr8, a5, 4, 0
    vstelm.w        vr9, a5, 8, 0
endfunc_x264

/* int32_t x264_pixel_sad_8x4_lasx(uint8_t *p_src,
 *                                 intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Single-reference 8x4 SAD (LASX); both strides are parameters here,
 * unlike the x3/x4 kernels above.  Returns the SAD in a0.
 */
function_x264 pixel_sad_8x4_lasx
    slli.d          t1, a1, 1               // t1 = 2 * i_src_stride
    slli.d          t2, a3, 1               // t2 = 2 * i_ref_stride
    add.d           t3, a1, t1              // t3 = 3 * i_src_stride
    add.d           t4, a3, t2              // t4 = 3 * i_ref_stride
    // Load data from p_src and p_ref
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3           // src rows 0-1
    vilvl.d         vr4, vr6, vr4           // ref rows 0-1
    vilvl.d         vr7, vr9, vr7           // src rows 2-3
    vilvl.d         vr8, vr10, vr8          // ref rows 2-3
    xvpermi.q       xr3, xr7, 0x02          // whole 8x4 src block
    xvpermi.q       xr4, xr8, 0x02          // whole 8x4 ref block
    // Calculate the absolute value of the difference
    xvabsd.bu       xr5, xr3, xr4
    xvhaddw.hu.bu   xr6, xr5, xr5
    xvhaddw.wu.hu   xr6, xr6, xr6
    xvhaddw.du.wu   xr6, xr6, xr6
    xvhaddw.qu.du   xr6, xr6, xr6
    // Combine the two 128-bit lane totals into the return value
    xvpickve2gr.wu  t2, xr6, 0
    xvpickve2gr.wu  t3, xr6, 4
    add.d           a0, t2, t3
endfunc_x264

/* int32_t x264_pixel_sad_4x4_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 */
function_x264 pixel_sad_4x4_lsx
    slli.d          t1, a1, 1               // t1 = 2 * i_src_stride
    slli.d          t2, a3, 1               // t2 = 2 * i_ref_stride
    add.d           t3, a1, t1              // t3 = 3 * i_src_stride
    add.d           t4, a3, t2              // t4 = 3 * i_ref_stride
    // Load data from p_src and p_ref
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3           // full 4x4 src block
    vilvl.d         vr4, vr8, vr4           // full 4x4 ref block
    // Calculate the absolute value of the difference
    vabsd.bu        vr5, vr3, vr4
    vhaddw.hu.bu    vr6, vr5, vr5
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0              // return SAD
endfunc_x264

/* int32_t x264_pixel_sad_4x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Two 4x4 passes; vr11 carries the 16-bit partial sums between them.
 */
function_x264 pixel_sad_4x8_lsx
    slli.d          t1, a1, 1
    slli.d          t2, a3, 1
    add.d           t3, a1, t1
    add.d           t4, a3, t2
    // Load data from p_src and p_ref (rows 0-3)
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr11, vr3, vr4
    vhaddw.hu.bu    vr11, vr11, vr11
    // Advance both pointers by 4 rows and repeat for rows 4-7
    alsl.d          a0, a1, a0, 2
    alsl.d          a2, a3, a2, 2
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr5, vr3, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vadd.h          vr6, vr11, vr5
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0              // return SAD
endfunc_x264

/* int32_t x264_pixel_sad_4x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * One 4x4 pass plus three more via .rept; vr11 accumulates.
 */
function_x264 pixel_sad_4x16_lsx
    slli.d          t1, a1, 1
    slli.d          t2, a3, 1
    add.d           t3, a1, t1
    add.d           t4, a3, t2
    // Load data from p_src and p_ref (rows 0-3)
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr11, vr3, vr4
    vhaddw.hu.bu    vr11, vr11, vr11
.rept 3
    alsl.d          a0, a1, a0, 2           // advance 4 rows
    alsl.d          a2, a3, a2, 2
    FLDS_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDS_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.w         vr3, vr5, vr3
    vilvl.w         vr4, vr6, vr4
    vilvl.w         vr7, vr9, vr7
    vilvl.w         vr8, vr10, vr8
    vilvl.d         vr3, vr7, vr3
    vilvl.d         vr4, vr8, vr4
    vabsd.bu        vr12, vr3, vr4
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr11, vr11, vr12
.endr
    vhaddw.wu.hu    vr11, vr11, vr11
    vhaddw.du.wu    vr11, vr11, vr11
    vhaddw.qu.du    vr11, vr11, vr11
    vpickve2gr.wu   a0, vr11, 0             // return SAD
endfunc_x264

/* int32_t x264_pixel_sad_8x4_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 */
function_x264 pixel_sad_8x4_lsx
    slli.d          t1, a1, 1
    slli.d          t2, a3, 1
    add.d           t3, a1, t1
    add.d           t4, a3, t2
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3           // src rows 0-1
    vilvl.d         vr7, vr9, vr7           // src rows 2-3
    vilvl.d         vr4, vr6, vr4           // ref rows 0-1
    vilvl.d         vr8, vr10, vr8          // ref rows 2-3
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr6, vr11, vr12
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0              // return SAD
endfunc_x264

/* int32_t x264_pixel_sad_8x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * Two 8x4 passes; vr13 carries the partial sums between them.
 */
function_x264 pixel_sad_8x8_lsx
    slli.d          t1, a1, 1
    slli.d          t2, a3, 1
    add.d           t3, a1, t1
    add.d           t4, a3, t2
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3
    vilvl.d         vr7, vr9, vr7
    vilvl.d         vr4, vr6, vr4
    vilvl.d         vr8, vr10, vr8
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr13, vr11, vr12
    // Advance 4 rows and repeat
    alsl.d          a0, a1, a0, 2
    alsl.d          a2, a3, a2, 2
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3
    vilvl.d         vr7, vr9, vr7
    vilvl.d         vr4, vr6, vr4
    vilvl.d         vr8, vr10, vr8
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr6, vr11, vr12
    vadd.h          vr6, vr6, vr13
    vhaddw.wu.hu    vr6, vr6, vr6
    vhaddw.du.wu    vr6, vr6, vr6
    vhaddw.qu.du    vr6, vr6, vr6
    vpickve2gr.wu   a0, vr6, 0              // return SAD
endfunc_x264

/* int32_t x264_pixel_sad_8x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * One 8x4 pass plus three more via .rept; vr13 accumulates.
 */
function_x264 pixel_sad_8x16_lsx
    slli.d          t1, a1, 1
    slli.d          t2, a3, 1
    add.d           t3, a1, t1
    add.d           t4, a3, t2
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3
    vilvl.d         vr7, vr9, vr7
    vilvl.d         vr4, vr6, vr4
    vilvl.d         vr8, vr10, vr8
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr13, vr11, vr12
.rept 3
    alsl.d          a0, a1, a0, 2           // advance 4 rows
    alsl.d          a2, a3, a2, 2
    FLDD_LOADX_4    a0, a1, t1, t3, f3, f5, f7, f9
    FLDD_LOADX_4    a2, a3, t2, t4, f4, f6, f8, f10
    vilvl.d         vr3, vr5, vr3
    vilvl.d         vr7, vr9, vr7
    vilvl.d         vr4, vr6, vr4
    vilvl.d         vr8, vr10, vr8
    vabsd.bu        vr11, vr3, vr4
    vabsd.bu        vr12, vr7, vr8
    vhaddw.hu.bu    vr11, vr11, vr11
    vhaddw.hu.bu    vr12, vr12, vr12
    vadd.h          vr14, vr11, vr12
    vadd.h          vr13, vr13, vr14
.endr
    vhaddw.wu.hu    vr13, vr13, vr13
    vhaddw.du.wu    vr13, vr13, vr13
    vhaddw.qu.du    vr13, vr13, vr13
    vpickve2gr.wu   a0, vr13, 0             // return SAD
endfunc_x264

/* int32_t x264_pixel_sad_16x8_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                 uint8_t *p_ref, intptr_t i_ref_stride)
 */
function_x264 pixel_sad_16x8_lsx
    slli.d          t1, a1, 1
    slli.d          t2, a3, 1
    add.d           t3, a1, t1
    add.d           t4, a3, t2
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr14, vr8, vr9          // partial sums for rows 0-3
    // Advance 4 rows and repeat
    alsl.d          a0, a1, a0, 2
    alsl.d          a2, a3, a2, 2
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr12, vr8, vr9
    vadd.h          vr13, vr12, vr14
    vhaddw.wu.hu    vr13, vr13, vr13
    vhaddw.du.wu    vr13, vr13, vr13
    vhaddw.qu.du    vr13, vr13, vr13
    vpickve2gr.wu   a0, vr13, 0             // return SAD
endfunc_x264

/* int32_t x264_pixel_sad_16x16_lsx(uint8_t *p_src, intptr_t i_src_stride,
 *                                  uint8_t *p_ref, intptr_t i_ref_stride)
 *
 * One 16x4 pass plus three more via .rept; vr13 accumulates.
 */
function_x264 pixel_sad_16x16_lsx
    slli.d          t1, a1, 1
    slli.d          t2, a3, 1
    add.d           t3, a1, t1
    add.d           t4, a3, t2
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr13, vr8, vr9
.rept 3
    alsl.d          a0, a1, a0, 2           // advance 4 rows
    alsl.d          a2, a3, a2, 2
    LSX_LOADX_4     a0, a1, t1, t3, vr0, vr1, vr2, vr3
    LSX_LOADX_4     a2, a3, t2, t4, vr4, vr5, vr6, vr7
    vabsd.bu        vr8, vr0, vr4
    vabsd.bu        vr9, vr1, vr5
    vabsd.bu        vr10, vr2, vr6
    vabsd.bu        vr11, vr3, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vhaddw.hu.bu    vr10, vr10, vr10
    vhaddw.hu.bu    vr11, vr11, vr11
    vadd.h          vr8, vr8, vr9
    vadd.h          vr9, vr10, vr11
    vadd.h          vr12, vr8, vr9
    vadd.h          vr13, vr12, vr13
.endr
    vhaddw.wu.hu    vr13, vr13, vr13
    vhaddw.du.wu    vr13, vr13, vr13
    vhaddw.qu.du    vr13, vr13, vr13
    vpickve2gr.wu   a0, vr13, 0             // return SAD
endfunc_x264

/*
 * void x264_pixel_sad_x3_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t i_ref_stride,
 *                                int32_t p_sad_array[3])
 *
 * 4x8 x3 SAD: two 4x4 gather passes; vr0-vr2 hold the first pass's
 * absolute differences until both are widened and summed together.
 */
function_x264 pixel_sad_x3_4x8_lsx
    slli.d          t1, a4, 1               // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1              // t2 = 3 * i_ref_stride
    // Load data from p_src, p_ref0, p_ref1 and p_ref2 (rows 0-3)
    fld.s           f3, a0, 0
    fld.s           f7, a0, 16
    fld.s           f11, a0, 32
    fld.s           f15, a0, 48
    FLDS_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.w         vr3, vr7, vr3
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr11, vr15, vr11
    vilvl.w         vr12, vr16, vr12
    vilvl.w         vr13, vr17, vr13
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr11, vr3          // full 4x4 src block
    vilvl.d         vr4, vr12, vr4
    vilvl.d         vr5, vr13, vr5
    vilvl.d         vr6, vr14, vr6
    vabsd.bu        vr0, vr3, vr4
    vabsd.bu        vr1, vr3, vr5
    vabsd.bu        vr2, vr3, vr6
    // Advance the three reference pointers by 4 rows (rows 4-7)
    alsl.d          a1, a4, a1, 2
    alsl.d          a2, a4, a2, 2
    alsl.d          a3, a4, a3, 2
    fld.s           f3, a0, 64
    fld.s           f7, a0, 80
    fld.s           f11, a0, 96
    fld.s           f15, a0, 112
    FLDS_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDS_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDS_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.w         vr3, vr7, vr3
    vilvl.w         vr4, vr8, vr4
    vilvl.w         vr5, vr9, vr5
    vilvl.w         vr6, vr10, vr6
    vilvl.w         vr11, vr15, vr11
    vilvl.w         vr12, vr16, vr12
    vilvl.w         vr13, vr17, vr13
    vilvl.w         vr14, vr18, vr14
    vilvl.d         vr3, vr11, vr3
    vilvl.d         vr4, vr12, vr4
    vilvl.d         vr5, vr13, vr5
    vilvl.d         vr6, vr14, vr6
    vabsd.bu        vr7, vr3, vr4
    vabsd.bu        vr8, vr3, vr5
    vabsd.bu        vr9, vr3, vr6
    // Widen both passes, combine, then reduce to one 32-bit SAD per ref
    vhaddw.hu.bu    vr0, vr0, vr0
    vhaddw.hu.bu    vr1, vr1, vr1
    vhaddw.hu.bu    vr2, vr2, vr2
    vhaddw.hu.bu    vr7, vr7, vr7
    vhaddw.hu.bu    vr8, vr8, vr8
    vhaddw.hu.bu    vr9, vr9, vr9
    vadd.h          vr7, vr7, vr0
    vadd.h          vr8, vr8, vr1
    vadd.h          vr9, vr9, vr2
    vhaddw.wu.hu    vr7, vr7, vr7
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.du.wu    vr7, vr7, vr7
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.qu.du    vr7, vr7, vr7
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9
    // Store data to p_sad_array
    vstelm.w        vr7, a5, 0, 0
    vstelm.w        vr8, a5, 4, 0
    vstelm.w        vr9, a5, 8, 0
endfunc_x264

/*
 * void x264_pixel_sad_x3_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t i_ref_stride,
 *                                int32_t p_sad_array[3])
 *
 * 8x4 x3 SAD: rows 0-1 and rows 2-3 are processed as two register pairs
 * per reference and summed before the final reduction.
 */
function_x264 pixel_sad_x3_8x4_lsx
    slli.d          t1, a4, 1               // t1 = 2 * i_ref_stride
    add.d           t2, a4, t1              // t2 = 3 * i_ref_stride
    // Load data from p_src, p_ref0, p_ref1 and p_ref2
    fld.d           f3, a0, 0
    fld.d           f7, a0, 16
    fld.d           f11, a0, 32
    fld.d           f15, a0, 48
    FLDD_LOADX_4    a1, a4, t1, t2, f4, f8, f12, f16
    FLDD_LOADX_4    a2, a4, t1, t2, f5, f9, f13, f17
    FLDD_LOADX_4    a3, a4, t1, t2, f6, f10, f14, f18
    vilvl.d         vr3, vr7, vr3           // src rows 0-1
    vilvl.d         vr4, vr8, vr4           // ref0 rows 0-1
    vilvl.d         vr5, vr9, vr5           // ref1 rows 0-1
    vilvl.d         vr6, vr10, vr6          // ref2 rows 0-1
    vilvl.d         vr11, vr15, vr11        // src rows 2-3
    vilvl.d         vr12, vr16, vr12        // ref0 rows 2-3
    vilvl.d         vr13, vr17, vr13        // ref1 rows 2-3
    vilvl.d         vr14, vr18, vr14        // ref2 rows 2-3
    vabsd.bu        vr0, vr3, vr4
    vabsd.bu        vr1, vr3, vr5
    vabsd.bu        vr2, vr3, vr6
    vabsd.bu        vr3, vr11, vr12
    vabsd.bu        vr4, vr11, vr13
    vabsd.bu        vr5, vr11, vr14
    vhaddw.hu.bu    vr0, vr0, vr0
    vhaddw.hu.bu    vr1, vr1, vr1
    vhaddw.hu.bu    vr2, vr2, vr2
    vhaddw.hu.bu    vr3, vr3, vr3
    vhaddw.hu.bu    vr4, vr4, vr4
    vhaddw.hu.bu    vr5, vr5, vr5
    vadd.h          vr7, vr0, vr3
    vadd.h          vr8, vr1, vr4
    vadd.h          vr9, vr2, vr5
    vhaddw.wu.hu    vr7, vr7, vr7
    vhaddw.wu.hu    vr8, vr8, vr8
    vhaddw.wu.hu    vr9, vr9, vr9
    vhaddw.du.wu    vr7, vr7, vr7
    vhaddw.du.wu    vr8, vr8, vr8
    vhaddw.du.wu    vr9, vr9, vr9
    vhaddw.qu.du    vr7, vr7, vr7
    vhaddw.qu.du    vr8, vr8, vr8
    vhaddw.qu.du    vr9, vr9, vr9
    // Store data to p_sad_array
    vstelm.w        vr7, a5, 0, 0
    vstelm.w        vr8, a5, 4, 0
    vstelm.w        vr9, a5, 8, 0
endfunc_x264

/*
 * void x264_pixel_sad_x3_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0,
 *                                uint8_t *p_ref1, uint8_t *p_ref2,
 *                                intptr_t
i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_8x8_lsx slli.d t1, a4, 1 add.d t2, a4, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f3, a0, 0 fld.d f7, a0, 16 fld.d f11, a0, 32 fld.d f15, a0, 48 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr0, vr7, vr10 vadd.h vr1, vr8, vr15 vadd.h vr2, vr9, vr16 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 fld.d f3, a0, 64 fld.d f7, a0, 80 fld.d f11, a0, 96 fld.d f15, a0, 112 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr7, vr7, vr10 vadd.h vr8, vr8, vr15 vadd.h vr9, vr9, vr16 vadd.h vr7, vr7, vr0 vadd.h vr8, vr8, vr1 vadd.h vr9, vr9, vr2 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.du.wu vr7, vr7, vr7 vhaddw.du.wu vr8, vr8, vr8 
vhaddw.du.wu vr9, vr9, vr9 vhaddw.qu.du vr7, vr7, vr7 vhaddw.qu.du vr8, vr8, vr8 vhaddw.qu.du vr9, vr9, vr9 // Store data to p_sad_array vstelm.w vr7, a5, 0, 0 vstelm.w vr8, a5, 4, 0 vstelm.w vr9, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_8x16_lsx slli.d t1, a4, 1 add.d t2, a4, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f3, a0, 0 fld.d f7, a0, 16 fld.d f11, a0, 32 fld.d f15, a0, 48 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr0, vr7, vr10 vadd.h vr1, vr8, vr15 vadd.h vr2, vr9, vr16 .rept 3 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 addi.d a0, a0, 64 fld.d f3, a0, 0 fld.d f7, a0, 16 fld.d f11, a0, 32 fld.d f15, a0, 48 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 
vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr7, vr7, vr10 vadd.h vr8, vr8, vr15 vadd.h vr9, vr9, vr16 vadd.h vr0, vr7, vr0 vadd.h vr1, vr8, vr1 vadd.h vr2, vr9, vr2 .endr vhaddw.wu.hu vr0, vr0, vr0 vhaddw.wu.hu vr1, vr1, vr1 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.du.wu vr0, vr0, vr0 vhaddw.du.wu vr1, vr1, vr1 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr0, vr0, vr0 vhaddw.qu.du vr1, vr1, vr1 vhaddw.qu.du vr2, vr2, vr2 // Store data to p_sad_array vstelm.w vr0, a5, 0, 0 vstelm.w vr1, a5, 4, 0 vstelm.w vr2, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_16x8_lsx slli.d t1, a4, 1 add.d t2, a4, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr16, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr17, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr18, vr1, vr0 // vr16, vr17, vr18 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 vld vr0, a0, 64 vld vr1, a0, 80 vld vr2, a0, 96 vld vr3, a0, 112 
LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr2, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr3, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr4, vr1, vr0 vadd.h vr0, vr16, vr2 vadd.h vr1, vr17, vr3 vadd.h vr2, vr18, vr4 vhaddw.wu.hu vr0, vr0, vr0 vhaddw.wu.hu vr1, vr1, vr1 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.du.wu vr0, vr0, vr0 vhaddw.du.wu vr1, vr1, vr1 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr0, vr0, vr0 vhaddw.qu.du vr1, vr1, vr1 vhaddw.qu.du vr2, vr2, vr2 // Store data to p_sad_array vstelm.w vr0, a5, 0, 0 vstelm.w vr1, a5, 4, 0 vstelm.w vr2, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_16x16_lsx slli.d t1, a4, 1 add.d t2, a4, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 
vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr16, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr17, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr18, vr1, vr0 .rept 3 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 addi.d a0, a0, 64 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr2, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr3, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr4, vr1, vr0 vadd.h vr16, vr16, vr2 vadd.h vr17, vr17, vr3 vadd.h vr18, vr18, vr4 .endr vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu 
vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du vr18, vr18, vr18 // Store data to p_sad_array vstelm.w vr16, a5, 0, 0 vstelm.w vr17, a5, 4, 0 vstelm.w vr18, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x4_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_4x8_lsx slli.d t1, a5, 1 add.d t2, a5, t1 fld.s f0, a0, 0 fld.s f1, a0, 16 fld.s f2, a0, 32 fld.s f3, a0, 48 FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.w vr0, vr1, vr0 vilvl.w vr2, vr3, vr2 vilvl.d vr0, vr2, vr0 vilvl.w vr4, vr8, vr4 vilvl.w vr12, vr16, vr12 vilvl.d vr1, vr12, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr13, vr17, vr13 vilvl.d vr2, vr13, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr14, vr18, vr14 vilvl.d vr3, vr14, vr6 vilvl.w vr7, vr11, vr7 vilvl.w vr15, vr19, vr15 vilvl.d vr4, vr15, vr7 vabsd.bu vr1, vr0, vr1 vabsd.bu vr2, vr0, vr2 vabsd.bu vr3, vr0, vr3 vabsd.bu vr4, vr0, vr4 vhaddw.hu.bu vr20, vr1, vr1 vhaddw.hu.bu vr21, vr2, vr2 vhaddw.hu.bu vr22, vr3, vr3 vhaddw.hu.bu vr23, vr4, vr4 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 fld.s f0, a0, 64 fld.s f1, a0, 80 fld.s f2, a0, 96 fld.s f3, a0, 112 FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.w vr0, vr1, vr0 vilvl.w vr2, vr3, vr2 vilvl.d vr0, vr2, vr0 vilvl.w vr4, vr8, vr4 vilvl.w vr12, vr16, vr12 vilvl.d vr1, vr12, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr13, vr17, vr13 vilvl.d vr2, vr13, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr14, vr18, vr14 vilvl.d vr3, vr14, vr6 vilvl.w vr7, vr11, vr7 vilvl.w vr15, vr19, vr15 vilvl.d vr4, vr15, vr7 vabsd.bu 
vr1, vr0, vr1 vabsd.bu vr2, vr0, vr2 vabsd.bu vr3, vr0, vr3 vabsd.bu vr4, vr0, vr4 vhaddw.hu.bu vr1, vr1, vr1 vhaddw.hu.bu vr2, vr2, vr2 vhaddw.hu.bu vr3, vr3, vr3 vhaddw.hu.bu vr4, vr4, vr4 vadd.h vr16, vr20, vr1 vadd.h vr17, vr21, vr2 vadd.h vr18, vr22, vr3 vadd.h vr19, vr23, vr4 vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.wu.hu vr19, vr19, vr19 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.du.wu vr19, vr19, vr19 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du vr18, vr18, vr18 vhaddw.qu.du vr19, vr19, vr19 // Store data to p_sad_array vstelm.w vr16, a6, 0, 0 vstelm.w vr17, a6, 4, 0 vstelm.w vr18, a6, 8, 0 vstelm.w vr19, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x4_lsx slli.d t1, a5, 1 add.d t2, a5, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr16, 
vr4, vr12 vadd.h vr17, vr5, vr13 vadd.h vr18, vr6, vr14 vadd.h vr19, vr7, vr15 vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.wu.hu vr19, vr19, vr19 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.du.wu vr19, vr19, vr19 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du vr18, vr18, vr18 vhaddw.qu.du vr19, vr19, vr19 // Store data to p_sad_array vstelm.w vr16, a6, 0, 0 vstelm.w vr17, a6, 4, 0 vstelm.w vr18, a6, 8, 0 vstelm.w vr19, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x8_lsx slli.d t1, a5, 1 add.d t2, a5, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr20, vr4, vr12 vadd.h vr21, vr5, vr13 vadd.h vr22, vr6, vr14 vadd.h vr23, vr7, vr15 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 fld.d f0, a0, 64 fld.d f1, a0, 80 fld.d 
f2, a0, 96 fld.d f3, a0, 112 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr16, vr4, vr12 vadd.h vr17, vr5, vr13 vadd.h vr18, vr6, vr14 vadd.h vr19, vr7, vr15 vadd.h vr16, vr16, vr20 vadd.h vr17, vr17, vr21 vadd.h vr18, vr18, vr22 vadd.h vr19, vr19, vr23 vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.wu.hu vr19, vr19, vr19 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.du.wu vr19, vr19, vr19 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du vr18, vr18, vr18 vhaddw.qu.du vr19, vr19, vr19 // Store data to p_sad_array vstelm.w vr16, a6, 0, 0 vstelm.w vr17, a6, 4, 0 vstelm.w vr18, a6, 8, 0 vstelm.w vr19, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x16_lsx slli.d t1, a5, 1 add.d t2, a5, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, 
f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr20, vr4, vr12 vadd.h vr21, vr5, vr13 vadd.h vr22, vr6, vr14 vadd.h vr23, vr7, vr15 .rept 3 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 addi.d a0, a0, 64 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr16, vr4, vr12 vadd.h vr17, vr5, vr13 vadd.h vr18, vr6, vr14 vadd.h vr19, vr7, vr15 vadd.h vr20, vr16, vr20 vadd.h vr21, 
vr17, vr21 vadd.h vr22, vr18, vr22 vadd.h vr23, vr19, vr23 .endr vhaddw.wu.hu vr20, vr20, vr20 vhaddw.wu.hu vr21, vr21, vr21 vhaddw.wu.hu vr22, vr22, vr22 vhaddw.wu.hu vr23, vr23, vr23 vhaddw.du.wu vr20, vr20, vr20 vhaddw.du.wu vr21, vr21, vr21 vhaddw.du.wu vr22, vr22, vr22 vhaddw.du.wu vr23, vr23, vr23 vhaddw.qu.du vr20, vr20, vr20 vhaddw.qu.du vr21, vr21, vr21 vhaddw.qu.du vr22, vr22, vr22 vhaddw.qu.du vr23, vr23, vr23 // Store data to p_sad_array vstelm.w vr20, a6, 0, 0 vstelm.w vr21, a6, 4, 0 vstelm.w vr22, a6, 8, 0 vstelm.w vr23, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_16x8_lsx slli.d t1, a5, 1 add.d t2, a5, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr20, vr0, vr1 vadd.h vr0, 
vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr21, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr22, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr23, vr0, vr1 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 vld vr0, a0, 64 vld vr1, a0, 80 vld vr2, a0, 96 vld vr3, a0, 112 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr16, vr0, vr1 vadd.h vr0, vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr17, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr18, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr19, vr0, vr1 vadd.h vr20, vr16, vr20 vadd.h vr21, vr17, vr21 vadd.h vr22, vr18, vr22 vadd.h vr23, vr19, vr23 vhaddw.wu.hu vr20, vr20, vr20 vhaddw.wu.hu vr21, vr21, vr21 vhaddw.wu.hu vr22, vr22, vr22 vhaddw.wu.hu vr23, vr23, vr23 vhaddw.du.wu vr20, vr20, vr20 vhaddw.du.wu vr21, vr21, vr21 vhaddw.du.wu vr22, vr22, vr22 vhaddw.du.wu vr23, vr23, vr23 vhaddw.qu.du 
vr20, vr20, vr20 vhaddw.qu.du vr21, vr21, vr21 vhaddw.qu.du vr22, vr22, vr22 vhaddw.qu.du vr23, vr23, vr23 // Store data to p_sad_array vstelm.w vr20, a6, 0, 0 vstelm.w vr21, a6, 4, 0 vstelm.w vr22, a6, 8, 0 vstelm.w vr23, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_16x16_lsx slli.d t1, a5, 1 add.d t2, a5, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr20, vr0, vr1 vadd.h vr0, vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr21, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr22, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr23, vr0, vr1 .rept 3 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 addi.d a0, a0, 64 vld vr0, a0, 0 
vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr16, vr0, vr1 vadd.h vr0, vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr17, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr18, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr19, vr0, vr1 vadd.h vr20, vr16, vr20 vadd.h vr21, vr17, vr21 vadd.h vr22, vr18, vr22 vadd.h vr23, vr19, vr23 .endr vhaddw.wu.hu vr20, vr20, vr20 vhaddw.wu.hu vr21, vr21, vr21 vhaddw.wu.hu vr22, vr22, vr22 vhaddw.wu.hu vr23, vr23, vr23 vhaddw.du.wu vr20, vr20, vr20 vhaddw.du.wu vr21, vr21, vr21 vhaddw.du.wu vr22, vr22, vr22 vhaddw.du.wu vr23, vr23, vr23 vhaddw.qu.du vr20, vr20, vr20 vhaddw.qu.du vr21, vr21, vr21 vhaddw.qu.du vr22, vr22, vr22 vhaddw.qu.du vr23, vr23, vr23 // Store data to p_sad_array vstelm.w vr20, a6, 0, 0 vstelm.w vr21, a6, 4, 0 vstelm.w vr22, a6, 8, 0 vstelm.w vr23, a6, 12, 0 endfunc_x264 #endif /* !HIGH_BIT_DEPTH */