/*****************************************************************************
 * pixel.S: aarch64 pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad
 *          Janne Grunau
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"
#include "pixel-a-common.S"

const mask
.rept 16
.byte 0xff
.endr
.rept 16
.byte 0x00
.endr
endconst

.macro SUMSUBL_AB sum, sub, a, b
    uaddl       \sum, \a, \b
    usubl       \sub, \a, \b
.endm

#if BIT_DEPTH == 8
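
// SAD (sum of absolute differences) between a block at x0 (stride x1) and a
// reference block at x2 (stride x3). Illustrative C model only, not part of
// the build; the helper name is hypothetical (assumes <stdint.h>/<stdlib.h>):
//
//   static int sad_wxh( const uint8_t *pix1, intptr_t stride1,
//                       const uint8_t *pix2, intptr_t stride2,
//                       int w, int h )
//   {
//       int sum = 0;
//       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
//           for( int x = 0; x < w; x++ )
//               sum += abs( pix1[x] - pix2[x] );
//       return sum;
//   }
//
// The macros below unroll two rows per iteration and accumulate into 16-bit
// lanes (uabdl for the first row pair, uabal thereafter); the dotprod
// variants accumulate |a-b| into 32-bit lanes with udot against a vector of
// ones.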

.macro SAD_START_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
.endm

.macro SAD_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
.endm

.macro SAD_START_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
    uabdl       v17.8h, v2.8b, v3.8b
.endm

.macro SAD_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
    uabal       v17.8h, v2.8b, v3.8b
.endm

.macro SAD_START_16, dotprod=0
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
.if \dotprod == 0
    uabdl       v16.8h, v0.8b, v1.8b
    uabdl2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b, v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.else
    movi        v18.4s, #0x0
    movi        v19.16b, #0x1
    uabd        v16.16b, v0.16b, v1.16b
    uabd        v17.16b, v2.16b, v3.16b
    udot        v18.4s, v16.16b, v19.16b
    udot        v18.4s, v17.16b, v19.16b
.endif
.endm

.macro SAD_16, dotprod=0
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
.if \dotprod == 0
    uabal       v16.8h, v0.8b, v1.8b
    uabal2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b, v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.else
    uabd        v16.16b, v0.16b, v1.16b
    uabd        v17.16b, v2.16b, v3.16b
    udot        v18.4s, v16.16b, v19.16b
    udot        v18.4s, v17.16b, v19.16b
.endif
.endm

.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w
.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 4
    add         v16.8h, v16.8h, v17.8h
.endif
    uaddlv      s0, v16.8h
    fmov        w0, s0
    ret
endfunc
.endm

.macro SAD_FUNC_DOTPROD w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon_dotprod, export=1
    SAD_START_\w 1
.rept \h / 2 - 1
    SAD_\w 1
.endr
    addv        s0, v18.4s
    fmov        w0, s0
    ret
endfunc
.endm
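
// pixel_sad_x3/x4: SAD of one fenc block (x0, FENC_STRIDE) against 3 or 4
// reference candidates (x1-x4) sharing one stride (x5), with the 32-bit
// sums written to the scores array in x6. For x3 the stride arrives in x4
// and the output pointer in x5, hence the register shuffle at function
// entry. The first row pair uses uabdl (overwrite) and later pairs uabal
// (accumulate), selected through the \first macro argument.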

.macro SAD_X_4 x, first=uabal
    ld1         {v0.s}[0], [x0], x7
    ld1         {v1.s}[0], [x1], x5
    ld1         {v0.s}[1], [x0], x7
    ld1         {v1.s}[1], [x1], x5
    ld1         {v2.s}[0], [x2], x5
    ld1         {v2.s}[1], [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v3.s}[0], [x3], x5
    ld1         {v3.s}[1], [x3], x5
    \first      v17.8h, v2.8b, v0.8b
.if \x == 4
    ld1         {v4.s}[0], [x4], x5
    ld1         {v4.s}[1], [x4], x5
.endif
    \first      v18.8h, v3.8b, v0.8b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
.endif
.endm

.macro SAD_X_8 x, first=uabal
    ld1         {v0.8b}, [x0], x7
    ld1         {v1.8b}, [x1], x5
    ld1         {v2.8b}, [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v3.8b}, [x3], x5
    \first      v17.8h, v2.8b, v0.8b
    ld1         {v5.8b}, [x0], x7
    ld1         {v1.8b}, [x1], x5
    \first      v18.8h, v3.8b, v0.8b
    ld1         {v2.8b}, [x2], x5
    uabal       v16.8h, v1.8b, v5.8b
    ld1         {v3.8b}, [x3], x5
    uabal       v17.8h, v2.8b, v5.8b
.if \x == 4
    ld1         {v4.8b}, [x4], x5
    ld1         {v1.8b}, [x4], x5
.endif
    uabal       v18.8h, v3.8b, v5.8b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
    uabal       v19.8h, v1.8b, v5.8b
.endif
.endm

.macro SAD_X_16 x, first=uabal
    ld1         {v0.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    ld1         {v2.16b}, [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    \first\()2  v20.8h, v1.16b, v0.16b
    ld1         {v3.16b}, [x3], x5
    \first      v17.8h, v2.8b, v0.8b
    \first\()2  v21.8h, v2.16b, v0.16b
    ld1         {v5.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    \first      v18.8h, v3.8b, v0.8b
    \first\()2  v22.8h, v3.16b, v0.16b
    ld1         {v2.16b}, [x2], x5
    uabal       v16.8h, v1.8b, v5.8b
    uabal2      v20.8h, v1.16b, v5.16b
    ld1         {v3.16b}, [x3], x5
    uabal       v17.8h, v2.8b, v5.8b
    uabal2      v21.8h, v2.16b, v5.16b
.if \x == 4
    ld1         {v4.16b}, [x4], x5
    ld1         {v1.16b}, [x4], x5
.endif
    uabal       v18.8h, v3.8b, v5.8b
    uabal2      v22.8h, v3.16b, v5.16b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
    \first\()2  v23.8h, v4.16b, v0.16b
    uabal       v19.8h, v1.8b, v5.8b
    uabal2      v23.8h, v1.16b, v5.16b
.endif
.endm

.macro SAD_X_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE
    SAD_X_\w \x, uabdl
.rept \h / 2 - 1
    SAD_X_\w \x
.endr
.if \w > 8
    add         v16.8h, v16.8h, v20.8h
    add         v17.8h, v17.8h, v21.8h
    add         v18.8h, v18.8h, v22.8h
.if \x == 4
    add         v19.8h, v19.8h, v23.8h
.endif
.endif
    // add up the sads
    uaddlv      s0, v16.8h
    uaddlv      s1, v17.8h
    uaddlv      s2, v18.8h
    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    uaddlv      s3, v19.8h
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm

.macro SAD_X_DOTPROD_16 x
    ld1         {v0.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    ld1         {v2.16b}, [x2], x5
    uabd        v20.16b, v1.16b, v0.16b
    uabd        v22.16b, v2.16b, v0.16b
    ld1         {v5.16b}, [x0], x7
    udot        v16.4s, v20.16b, v28.16b
    udot        v17.4s, v22.16b, v28.16b
    ld1         {v3.16b}, [x3], x5
    ld1         {v1.16b}, [x1], x5
    uabd        v24.16b, v3.16b, v0.16b
    uabd        v21.16b, v1.16b, v5.16b
    ld1         {v2.16b}, [x2], x5
    ld1         {v3.16b}, [x3], x5
    udot        v18.4s, v24.16b, v28.16b
    udot        v16.4s, v21.16b, v28.16b
    uabd        v23.16b, v2.16b, v5.16b
    uabd        v25.16b, v3.16b, v5.16b
    udot        v17.4s, v23.16b, v28.16b
    udot        v18.4s, v25.16b, v28.16b
.if \x == 4
    ld1         {v4.16b}, [x4], x5
    ld1         {v1.16b}, [x4], x5
    uabd        v26.16b, v4.16b, v0.16b
    uabd        v27.16b, v1.16b, v5.16b
    udot        v19.4s, v26.16b, v28.16b
    udot        v19.4s, v27.16b, v28.16b
.endif
.endm

.macro SAD_X_DOTPROD_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon_dotprod, export=1
    movi        v16.4s, #0x0
    movi        v17.4s, #0x0
    movi        v18.4s, #0x0
.if \x == 4
    movi        v19.4s, #0x0
.endif
    movi        v28.16b, #0x1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE
    SAD_X_DOTPROD_\w \x
.rept \h / 2 - 1
    SAD_X_DOTPROD_\w \x
.endr
    addv        s0, v16.4s
    addv        s1, v17.4s
    addv        s2, v18.4s
.if \x == 4
    addv        s3, v19.4s
.endif
    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm
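
// pixel_vsad: SAD between vertically adjacent rows of a single
// 16-pixel-wide plane (x0/x1) over w2 rows, a cheap vertical-activity
// measure. pixel_asd8: absolute value of the *signed* sum of differences
// over an 8-wide block, i.e. |sum(pix1 - pix2)| rather than sum(|diff|).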

function pixel_vsad_neon, export=1
    subs        w2, w2, #2
    ld1         {v0.16b}, [x0], x1
    ld1         {v1.16b}, [x0], x1
    uabdl       v6.8h, v0.8b, v1.8b
    uabdl2      v7.8h, v0.16b, v1.16b
    b.le        2f
1:
    subs        w2, w2, #2
    ld1         {v0.16b}, [x0], x1
    uabal       v6.8h, v1.8b, v0.8b
    uabal2      v7.8h, v1.16b, v0.16b
    ld1         {v1.16b}, [x0], x1
    b.lt        2f
    uabal       v6.8h, v0.8b, v1.8b
    uabal2      v7.8h, v0.16b, v1.16b
    b.gt        1b
2:
    add         v5.8h, v6.8h, v7.8h
    uaddlv      s0, v5.8h
    fmov        w0, s0
    ret
endfunc

#if HAVE_DOTPROD
ENABLE_DOTPROD
function pixel_vsad_neon_dotprod, export=1
    ld1         {v0.16b}, [x0], x1
    ld1         {v1.16b}, [x0], x1
    subs        w2, w2, #2
    movi        v3.16b, #0x1
    movi        v6.4s, #0x0
    uabd        v5.16b, v0.16b, v1.16b
    udot        v6.4s, v5.16b, v3.16b
    b.le        2f
1:
    ld1         {v0.16b}, [x0], x1
    subs        w2, w2, #2
    uabd        v5.16b, v0.16b, v1.16b
    ld1         {v1.16b}, [x0], x1
    udot        v6.4s, v5.16b, v3.16b
    b.lt        2f
    uabd        v5.16b, v0.16b, v1.16b
    udot        v6.4s, v5.16b, v3.16b
    b.gt        1b
2:
    addv        s0, v6.4s
    fmov        w0, s0
    ret
endfunc
DISABLE_DOTPROD
#endif // HAVE_DOTPROD

function pixel_asd8_neon, export=1
    sub         w4, w4, #2
    ld1         {v0.8b}, [x0], x1
    ld1         {v1.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    usubl       v16.8h, v0.8b, v1.8b
1:
    subs        w4, w4, #2
    ld1         {v4.8b}, [x0], x1
    ld1         {v5.8b}, [x2], x3
    usubl       v17.8h, v2.8b, v3.8b
    usubl       v18.8h, v4.8b, v5.8b
    add         v16.8h, v16.8h, v17.8h
    ld1         {v2.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    add         v16.8h, v16.8h, v18.8h
    b.gt        1b
    usubl       v17.8h, v2.8b, v3.8b
    add         v16.8h, v16.8h, v17.8h
    saddlv      s0, v16.8h
    abs         v0.2s, v0.2s
    fmov        w0, s0
    ret
endfunc
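
// SSD (sum of squared differences). Illustrative C model, not part of the
// build; the helper name is hypothetical:
//
//   static int ssd_wxh( const uint8_t *pix1, intptr_t stride1,
//                       const uint8_t *pix2, intptr_t stride2,
//                       int w, int h )
//   {
//       int sum = 0;
//       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
//           for( int x = 0; x < w; x++ )
//           {
//               int d = pix1[x] - pix2[x];
//               sum += d * d;
//           }
//       return sum;
//   }
//
// The plain NEON path squares 16-bit differences with smull/smlal into
// 32-bit accumulators; the dotprod path computes udot(|a-b|, |a-b|), which
// is the same value since squaring discards the sign of the difference.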

.macro SSD_START_4
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    smull       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_4
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_END_4
    usubl       v2.8h, v16.8b, v17.8b
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_START_8
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.8b}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    ld1         {v17.8b}, [x2], x3
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_8
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.8b}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    ld1         {v17.8b}, [x2], x3
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_END_8
    usubl       v2.8h, v16.8b, v17.8b
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_START_16
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    ld1         {v16.16b}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    smull2      v1.4s, v2.8h, v2.8h
    ld1         {v17.16b}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
.endm

.macro SSD_16
    usubl       v2.8h, v16.8b, v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    ld1         {v16.16b}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v1.4s, v2.8h, v2.8h
    ld1         {v17.16b}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
.endm

.macro SSD_END_16
    usubl       v2.8h, v16.8b, v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v1.4s, v2.8h, v2.8h
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
    add         v0.4s, v0.4s, v1.4s
.endm

.macro SSD_FUNC w h
function pixel_ssd_\w\()x\h\()_neon, export=1
    SSD_START_\w
.rept \h-2
    SSD_\w
.endr
    SSD_END_\w
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    ret
endfunc
.endm

.macro SSD_DOTPROD_8
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x2], x3
    ld1         {v18.8b}, [x0], x1
    uabd        v20.8b, v16.8b, v17.8b
    ld1         {v19.8b}, [x2], x3
    uabd        v21.8b, v18.8b, v19.8b
    udot        v22.2s, v20.8b, v20.8b
    udot        v22.2s, v21.8b, v21.8b
.endm

.macro SSD_DOTPROD_16
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x2], x3
    ld1         {v18.16b}, [x0], x1
    uabd        v20.16b, v16.16b, v17.16b
    ld1         {v19.16b}, [x2], x3
    uabd        v21.16b, v18.16b, v19.16b
    udot        v22.4s, v20.16b, v20.16b
    udot        v22.4s, v21.16b, v21.16b
.endm

.macro SSD_DOTPROD_FUNC w h
function pixel_ssd_\w\()x\h\()_neon_dotprod, export=1
    movi        v22.4s, #0x0
.rept \h/2
    SSD_DOTPROD_\w
.endr
.if \w > 8
    addv        s0, v22.4s
.else
    addp        v0.2s, v22.2s, v22.2s
.endif
    mov         w0, v0.s[0]
    ret
endfunc
.endm
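
// SATD: SAD of the Hadamard-transformed difference block. Illustrative 4x4
// C model (hypothetical helper, not part of the build):
//
//   static int satd_4x4( const uint8_t *p1, intptr_t s1,
//                        const uint8_t *p2, intptr_t s2 )
//   {
//       int d[4][4], sum = 0;
//       for( int y = 0; y < 4; y++, p1 += s1, p2 += s2 )
//           for( int x = 0; x < 4; x++ )
//               d[y][x] = p1[x] - p2[x];
//       for( int y = 0; y < 4; y++ )
//       {   // horizontal 4-point Hadamard pass
//           int a = d[y][0] + d[y][1], b = d[y][0] - d[y][1];
//           int c = d[y][2] + d[y][3], e = d[y][2] - d[y][3];
//           d[y][0] = a + c; d[y][2] = a - c;
//           d[y][1] = b + e; d[y][3] = b - e;
//       }
//       for( int x = 0; x < 4; x++ )
//       {   // vertical pass, then absolute sum
//           int a = d[0][x] + d[1][x], b = d[0][x] - d[1][x];
//           int c = d[2][x] + d[3][x], e = d[2][x] - d[3][x];
//           sum += abs( a + c ) + abs( a - c ) + abs( b + e ) + abs( b - e );
//       }
//       return sum >> 1;
//   }
//
// The assembly packs two 4x4 transforms per 128-bit register and replaces
// the last sum/diff pair with umax of absolute values, using the identity
// max(|a+b|, |a-b|) = |a| + |b| to fold the final >>1 into the reduction.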

function pixel_satd_4x4_neon, export=1
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    usubl       v0.8h, v0.8b, v1.8b
    usubl       v1.8h, v2.8b, v3.8b
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    zip1        v0.2d, v2.2d, v3.2d
    zip2        v1.2d, v2.2d, v3.2d
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.8h, v2.8h, v3.8h
    trn2        v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.4s, v2.4s, v3.4s
    trn2        v1.4s, v2.4s, v3.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    umax        v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc

function pixel_satd_4x8_neon, export=1
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    b           satd_4x8_8x4_end_neon
endfunc

function pixel_satd_8x4_neon, export=1
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
endfunc

function satd_4x8_8x4_end_neon
    usubl       v0.8h, v0.8b, v1.8b
    usubl       v1.8h, v2.8b, v3.8b
    usubl       v2.8h, v4.8b, v5.8b
    usubl       v3.8h, v6.8b, v7.8b
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    trn1        v0.8h, v4.8h, v5.8h
    trn2        v1.8h, v4.8h, v5.8h
    trn1        v2.8h, v6.8h, v7.8h
    trn2        v3.8h, v6.8h, v7.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    trn1        v0.4s, v16.4s, v18.4s
    trn2        v1.4s, v16.4s, v18.4s
    trn1        v2.4s, v17.4s, v19.4s
    trn2        v3.4s, v17.4s, v19.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    umax        v0.8h, v0.8h, v1.8h
    umax        v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc

function pixel_satd_4x16_neon, export=1
    mov         x4, x30
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    usubl       v17.8h, v2.8b, v3.8b
    usubl       v18.8h, v4.8b, v5.8b
    usubl       v19.8h, v6.8b, v7.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    usubl       v20.8h, v0.8b, v1.8b
    usubl       v21.8h, v2.8b, v3.8b
    usubl       v22.8h, v4.8b, v5.8b
    usubl       v23.8h, v6.8b, v7.8b
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    bl          satd_8x4v_8x8h_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

.macro load_diff_fly_8x8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    usubl       v17.8h, v2.8b, v3.8b
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
    usubl       v18.8h, v4.8b, v5.8b
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    usubl       v19.8h, v6.8b, v7.8b
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    usubl       v20.8h, v0.8b, v1.8b
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    usubl       v21.8h, v2.8b, v3.8b
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    usubl       v22.8h, v4.8b, v5.8b
    usubl       v23.8h, v6.8b, v7.8b
.endm

function pixel_satd_8x8_neon, export=1
    mov         x4, x30
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_8x16_neon, export=1
    mov         x4, x30
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v0.8h, v1.8h
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v31.8h, v0.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_8x8_neon
    load_diff_fly_8x8
endfunc

// one vertical hadamard pass and two horizontal
function satd_8x4v_8x8h_neon
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    abs         v4.8h, v4.8h
    abs         v5.8h, v5.8h
    abs         v6.8h, v6.8h
    abs         v7.8h, v7.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    ret
endfunc
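
// pixel_ssd_nv12_core: SSD over the interleaved U/V samples of an NV12
// chroma plane, deinterleaved with ld2 and accumulated separately; the two
// 64-bit totals are stored through [x6] (U) and [x7] (V). The width in w4
// is rounded to a multiple of 16 pairs and the strides pre-adjusted so the
// row loop can process 16 pairs per iteration.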

function pixel_ssd_nv12_core_neon, export=1
    sxtw        x8, w4
    add         x8, x8, #8
    and         x8, x8, #~15
    movi        v6.2d, #0
    movi        v7.2d, #0
    sub         x1, x1, x8, lsl #1
    sub         x3, x3, x8, lsl #1
1:
    subs        w8, w4, #16
    ld2         {v0.8b,v1.8b}, [x0], #16
    ld2         {v2.8b,v3.8b}, [x2], #16
    ld2         {v24.8b,v25.8b}, [x0], #16
    ld2         {v26.8b,v27.8b}, [x2], #16
    usubl       v16.8h, v0.8b, v2.8b
    usubl       v17.8h, v1.8b, v3.8b
    smull       v20.4s, v16.4h, v16.4h
    smull       v21.4s, v17.4h, v17.4h
    usubl       v18.8h, v24.8b, v26.8b
    usubl       v19.8h, v25.8b, v27.8b
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    b.eq        3f
2:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    ld2         {v0.8b,v1.8b}, [x0], #16
    ld2         {v2.8b,v3.8b}, [x2], #16
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
    subs        w8, w8, #16
    usubl       v16.8h, v0.8b, v2.8b
    usubl       v17.8h, v1.8b, v3.8b
    smlal       v20.4s, v16.4h, v16.4h
    smlal       v21.4s, v17.4h, v17.4h
    ld2         {v24.8b,v25.8b}, [x0], #16
    ld2         {v26.8b,v27.8b}, [x2], #16
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    usubl       v18.8h, v24.8b, v26.8b
    usubl       v19.8h, v25.8b, v27.8b
    b.gt        2b
3:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
4:
    subs        w5, w5, #1
    uaddw       v6.2d, v6.2d, v20.2s
    uaddw       v7.2d, v7.2d, v21.2s
    add         x0, x0, x1
    add         x2, x2, x3
    uaddw2      v6.2d, v6.2d, v20.4s
    uaddw2      v7.2d, v7.2d, v21.4s
    b.gt        1b
    addp        v6.2d, v6.2d, v7.2d
    st1         {v6.d}[0], [x6]
    st1         {v6.d}[1], [x7]
    ret
endfunc
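
// pixel_var: packed return of sum (low 32 bits) and sum of squares (high
// 32 bits) of one block; the caller derives the variance as
// sse - (sum*sum >> log2(pixels)). Illustrative C model, not part of the
// build:
//
//   static uint64_t var_wxh( const uint8_t *pix, intptr_t stride,
//                            int w, int h )
//   {
//       uint32_t sum = 0, sqr = 0;
//       for( int y = 0; y < h; y++, pix += stride )
//           for( int x = 0; x < w; x++ )
//           {
//               sum += pix[x];
//               sqr += pix[x] * pix[x];
//           }
//       return sum | ((uint64_t)sqr << 32);
//   }
//
// pixel_var2 works on the fenc/fdec difference instead: with U at fenc[x]
// and V at fenc[x+8] (strides 16 and 32), it stores each plane's SSD to
// the int pair at x2 and returns var_u + var_v.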

.macro pixel_var_8 h
function pixel_var_8x\h\()_neon, export=1
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x0], x1
    mov         x2, \h - 4
    umull       v1.8h, v16.8b, v16.8b
    uxtl        v0.8h, v16.8b
    umull       v2.8h, v17.8b, v17.8b
    uaddw       v0.8h, v0.8h, v17.8b
    ld1         {v18.8b}, [x0], x1
    uaddlp      v1.4s, v1.8h
    uaddlp      v2.4s, v2.8h
    ld1         {v19.8b}, [x0], x1
1:
    subs        x2, x2, #4
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v24.8h, v18.8b, v18.8b
    ld1         {v20.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v19.8b
    umull       v25.8h, v19.8b, v19.8b
    uadalp      v1.4s, v24.8h
    ld1         {v21.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v20.8b
    umull       v26.8h, v20.8b, v20.8b
    uadalp      v2.4s, v25.8h
    ld1         {v18.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v21.8b
    umull       v27.8h, v21.8b, v21.8b
    uadalp      v1.4s, v26.8h
    ld1         {v19.8b}, [x0], x1
    uadalp      v2.4s, v27.8h
    b.gt        1b
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v28.8h, v18.8b, v18.8b
    uaddw       v0.8h, v0.8h, v19.8b
    umull       v29.8h, v19.8b, v19.8b
    uadalp      v1.4s, v28.8h
    uadalp      v2.4s, v29.8h
    b           var_end
endfunc
.endm

function pixel_var_16x16_neon, export=1
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x0], x1
    mov         x2, #14
    umull       v1.8h, v16.8b, v16.8b
    umull2      v2.8h, v16.16b, v16.16b
    uxtl        v0.8h, v16.8b
    uaddlp      v1.4s, v1.8h
    uaddlp      v2.4s, v2.8h
    uaddw2      v0.8h, v0.8h, v16.16b
1:
    subs        x2, x2, #2
    ld1         {v18.16b}, [x0], x1
    uaddw       v0.8h, v0.8h, v17.8b
    umull       v3.8h, v17.8b, v17.8b
    uaddw2      v0.8h, v0.8h, v17.16b
    umull2      v4.8h, v17.16b, v17.16b
    uadalp      v1.4s, v3.8h
    uadalp      v2.4s, v4.8h
    ld1         {v17.16b}, [x0], x1
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v5.8h, v18.8b, v18.8b
    uaddw2      v0.8h, v0.8h, v18.16b
    umull2      v6.8h, v18.16b, v18.16b
    uadalp      v1.4s, v5.8h
    uadalp      v2.4s, v6.8h
    b.gt        1b
    uaddw       v0.8h, v0.8h, v17.8b
    umull       v3.8h, v17.8b, v17.8b
    uaddw2      v0.8h, v0.8h, v17.16b
    umull2      v4.8h, v17.16b, v17.16b
    uadalp      v1.4s, v3.8h
    uadalp      v2.4s, v4.8h
endfunc

function var_end
    add         v1.4s, v1.4s, v2.4s
    uaddlv      s0, v0.8h
    uaddlv      d1, v1.4s
    mov         w0, v0.s[0]
    mov         x1, v1.d[0]
    orr         x0, x0, x1, lsl #32
    ret
endfunc

.macro pixel_var2_8 h
function pixel_var2_8x\h\()_neon, export=1
    mov         x3, #16
    ld1         {v16.8b}, [x0], #8
    ld1         {v18.8b}, [x1], x3
    ld1         {v17.8b}, [x0], #8
    ld1         {v19.8b}, [x1], x3
    mov         x5, \h - 2
    usubl       v0.8h, v16.8b, v18.8b
    usubl       v1.8h, v17.8b, v19.8b
    ld1         {v16.8b}, [x0], #8
    ld1         {v18.8b}, [x1], x3
    smull       v2.4s, v0.4h, v0.4h
    smull2      v3.4s, v0.8h, v0.8h
    smull       v4.4s, v1.4h, v1.4h
    smull2      v5.4s, v1.8h, v1.8h
    usubl       v6.8h, v16.8b, v18.8b
1:
    subs        x5, x5, #1
    ld1         {v17.8b}, [x0], #8
    ld1         {v19.8b}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    usubl       v7.8h, v17.8b, v19.8b
    add         v0.8h, v0.8h, v6.8h
    ld1         {v16.8b}, [x0], #8
    ld1         {v18.8b}, [x1], x3
    smlal       v4.4s, v7.4h, v7.4h
    smlal2      v5.4s, v7.8h, v7.8h
    usubl       v6.8h, v16.8b, v18.8b
    add         v1.8h, v1.8h, v7.8h
    b.gt        1b
    ld1         {v17.8b}, [x0], #8
    ld1         {v19.8b}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    usubl       v7.8h, v17.8b, v19.8b
    add         v0.8h, v0.8h, v6.8h
    smlal       v4.4s, v7.4h, v7.4h
    add         v1.8h, v1.8h, v7.8h
    smlal2      v5.4s, v7.8h, v7.8h
    saddlv      s0, v0.8h
    saddlv      s1, v1.8h
    add         v2.4s, v2.4s, v3.4s
    add         v4.4s, v4.4s, v5.4s
    mov         w0, v0.s[0]
    mov         w1, v1.s[0]
    addv        s2, v2.4s
    addv        s4, v4.4s
    mul         w0, w0, w0
    mul         w1, w1, w1
    mov         w3, v2.s[0]
    mov         w4, v4.s[0]
    sub         w0, w3, w0, lsr #6 + (\h >> 4)
    sub         w1, w4, w1, lsr #6 + (\h >> 4)
    str         w3, [x2]
    add         w0, w0, w1
    str         w4, [x2, #4]
    ret
endfunc
.endm
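
// 16-wide SATD is built on satd_16x4_neon, which runs two 8-wide 8x4
// transforms side by side and returns four vectors of partial sums in
// v0-v3; the 16x8/16x16 wrappers accumulate those across calls before one
// final widening reduction.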

function pixel_satd_16x8_neon, export=1
    mov         x4, x30
    bl          satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_16x16_neon, export=1
    mov         x4, x30
    bl          satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_16x4_neon
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    usubl2      v20.8h, v0.16b, v1.16b
    ld1         {v5.16b}, [x2], x3
    ld1         {v4.16b}, [x0], x1
    usubl       v17.8h, v2.8b, v3.8b
    usubl2      v21.8h, v2.16b, v3.16b
    ld1         {v7.16b}, [x2], x3
    ld1         {v6.16b}, [x0], x1
    usubl       v18.8h, v4.8b, v5.8b
    usubl2      v22.8h, v4.16b, v5.16b
    usubl       v19.8h, v6.8b, v7.8b
    usubl2      v23.8h, v6.16b, v7.16b
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    b           satd_8x4v_8x8h_neon
endfunc
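
// SA8D: like SATD but with a full 8x8 Hadamard transform, which tracks an
// 8x8 DCT more closely; the result is rounded as (sum + 1) >> 1. Note that
// the "bl pixel_sa8d_8x8_neon" in the exported wrapper resolves to the
// local helper defined by the sa8d_satd_8x8 macro below: exported entries
// only define the EXTERN_ASM-prefixed symbol (see asm.S), so the names do
// not collide. With \satd set, the macro additionally keeps the two
// 4x4-pass sums in v26/v27 so pixel_sa8d_satd_16x16 can return both
// metrics from a single pass over the pixels.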

function pixel_sa8d_8x8_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

function pixel_sa8d_16x16_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #8
    add         x2, x2, #8
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    add         v0.4s, v30.4s, v31.4s
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

.macro sa8d_satd_8x8 satd=
function pixel_sa8d_\satd\()8x8_neon
    load_diff_fly_8x8
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v24.8h, v25.8h, v0.8h, v1.8h
    SUMSUB_AB   v26.8h, v27.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h, v1.8h, v4.8h, v5.8h
    SUMSUB_AB   v2.8h, v3.8h, v6.8h, v7.8h
    transpose   v4.4s, v6.4s, v24.4s, v26.4s
    transpose   v5.4s, v7.4s, v25.4s, v27.4s
    transpose   v24.4s, v26.4s, v0.4s, v2.4s
    transpose   v25.4s, v27.4s, v1.4s, v3.4s
    abs         v0.8h, v4.8h
    abs         v1.8h, v5.8h
    abs         v2.8h, v6.8h
    abs         v3.8h, v7.8h
    abs         v4.8h, v24.8h
    abs         v5.8h, v25.8h
    abs         v6.8h, v26.8h
    abs         v7.8h, v27.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    add         v26.8h, v0.8h, v1.8h
    add         v27.8h, v2.8h, v3.8h
.endif
    SUMSUB_AB   v0.8h, v16.8h, v16.8h, v20.8h
    SUMSUB_AB   v1.8h, v17.8h, v17.8h, v21.8h
    SUMSUB_AB   v2.8h, v18.8h, v18.8h, v22.8h
    SUMSUB_AB   v3.8h, v19.8h, v19.8h, v23.8h
    transpose   v20.8h, v21.8h, v16.8h, v17.8h
    transpose   v4.8h, v5.8h, v0.8h, v1.8h
    transpose   v22.8h, v23.8h, v18.8h, v19.8h
    transpose   v6.8h, v7.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v20.8h, v21.8h
    SUMSUB_AB   v24.8h, v25.8h, v4.8h, v5.8h
    SUMSUB_AB   v0.8h, v1.8h, v22.8h, v23.8h
    SUMSUB_AB   v4.8h, v5.8h, v6.8h, v7.8h
    transpose   v20.4s, v22.4s, v2.4s, v0.4s
    transpose   v21.4s, v23.4s, v3.4s, v1.4s
    transpose   v16.4s, v18.4s, v24.4s, v4.4s
    transpose   v17.4s, v19.4s, v25.4s, v5.4s
    SUMSUB_AB   v0.8h, v2.8h, v20.8h, v22.8h
    SUMSUB_AB   v1.8h, v3.8h, v21.8h, v23.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    transpose   v16.2d, v20.2d, v0.2d, v4.2d
    transpose   v17.2d, v21.2d, v1.2d, v5.2d
    transpose   v18.2d, v22.2d, v2.2d, v6.2d
    transpose   v19.2d, v23.2d, v3.2d, v7.2d
    abs         v16.8h, v16.8h
    abs         v20.8h, v20.8h
    abs         v17.8h, v17.8h
    abs         v21.8h, v21.8h
    abs         v18.8h, v18.8h
    abs         v22.8h, v22.8h
    abs         v19.8h, v19.8h
    abs         v23.8h, v23.8h
    umax        v16.8h, v16.8h, v20.8h
    umax        v17.8h, v17.8h, v21.8h
    umax        v18.8h, v18.8h, v22.8h
    umax        v19.8h, v19.8h, v23.8h
    add         v0.8h, v16.8h, v17.8h
    add         v1.8h, v18.8h, v19.8h
    ret
endfunc
.endm

function pixel_sa8d_satd_16x16_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_satd_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    uaddlp      v28.4s, v26.8h
    uaddlp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #8
    add         x2, x2, #8
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    add         v0.4s, v30.4s, v31.4s   // sa8d
    add         v1.4s, v28.4s, v29.4s   // satd
    addv        s0, v0.4s
    addv        s1, v1.4s
    urshr       v0.4s, v0.4s, #1
    fmov        w0, s0
    fmov        w1, s1
    add         x0, x0, x1, lsl #32
    ret         x4
endfunc
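
// hadamard_ac: sums of AC coefficients of the 4x4 and 8x8 Hadamard
// transforms of the source block (no reference block), with the DC terms
// masked off via mask_ac_4_8 from pixel-a-common.S. The packed return is
// (ac_4x4 >> 1) | ((ac_8x8 >> 2) << 32).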

.macro HADAMARD_AC w h
function pixel_hadamard_ac_\w\()x\h\()_neon, export=1
    movrel      x5, mask_ac_4_8
    mov         x4, x30
    ld1         {v30.8h,v31.8h}, [x5]
    movi        v28.16b, #0
    movi        v29.16b, #0
    bl          hadamard_ac_8x8_neon
.if \h > 8
    bl          hadamard_ac_8x8_neon
.endif
.if \w > 8
    sub         x0, x0, x1, lsl #3
    add         x0, x0, #8
    bl          hadamard_ac_8x8_neon
.endif
.if \w * \h == 256
    sub         x0, x0, x1, lsl #4
    bl          hadamard_ac_8x8_neon
.endif
    addv        s1, v29.4s
    addv        s0, v28.4s
    mov         w1, v1.s[0]
    mov         w0, v0.s[0]
    lsr         w1, w1, #2
    lsr         w0, w0, #1
    orr         x0, x0, x1, lsl #32
    ret         x4
endfunc
.endm

// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
function hadamard_ac_8x8_neon
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x0], x1
    ld1         {v18.8b}, [x0], x1
    ld1         {v19.8b}, [x0], x1
    SUMSUBL_AB  v0.8h, v1.8h, v16.8b, v17.8b
    ld1         {v20.8b}, [x0], x1
    ld1         {v21.8b}, [x0], x1
    SUMSUBL_AB  v2.8h, v3.8h, v18.8b, v19.8b
    ld1         {v22.8b}, [x0], x1
    ld1         {v23.8b}, [x0], x1
    SUMSUBL_AB  v4.8h, v5.8h, v20.8b, v21.8b
    SUMSUBL_AB  v6.8h, v7.8h, v22.8b, v23.8b
    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    abs         v0.8h, v16.8h
    abs         v4.8h, v20.8h
    abs         v1.8h, v17.8h
    abs         v5.8h, v21.8h
    abs         v2.8h, v18.8h
    abs         v6.8h, v22.8h
    abs         v3.8h, v19.8h
    abs         v7.8h, v23.8h
    add         v0.8h, v0.8h, v4.8h
    add         v1.8h, v1.8h, v5.8h
    and         v0.16b, v0.16b, v30.16b
    add         v2.8h, v2.8h, v6.8h
    add         v3.8h, v3.8h, v7.8h
    add         v0.8h, v0.8h, v2.8h
    add         v1.8h, v1.8h, v3.8h
    uadalp      v28.4s, v0.8h
    uadalp      v28.4s, v1.8h
    SUMSUB_AB   v6.8h, v7.8h, v23.8h, v19.8h
    SUMSUB_AB   v4.8h, v5.8h, v22.8h, v18.8h
    SUMSUB_AB   v2.8h, v3.8h, v21.8h, v17.8h
    SUMSUB_AB   v1.8h, v0.8h, v16.8h, v20.8h
    transpose   v16.2d, v17.2d, v6.2d, v7.2d
    transpose   v18.2d, v19.2d, v4.2d, v5.2d
    transpose   v20.2d, v21.2d, v2.2d, v3.2d
    abs         v16.8h, v16.8h
    abs         v17.8h, v17.8h
    abs         v18.8h, v18.8h
    abs         v19.8h, v19.8h
    abs         v20.8h, v20.8h
    abs         v21.8h, v21.8h
    transpose   v7.2d, v6.2d, v1.2d, v0.2d
    umax        v3.8h, v16.8h, v17.8h
    umax        v2.8h, v18.8h, v19.8h
    umax        v1.8h, v20.8h, v21.8h
    SUMSUB_AB   v4.8h, v5.8h, v7.8h, v6.8h
    add         v2.8h, v2.8h, v3.8h
    add         v2.8h, v2.8h, v1.8h
    and         v4.16b, v4.16b, v31.16b
    add         v2.8h, v2.8h, v2.8h
    abs         v5.8h, v5.8h
    abs         v4.8h, v4.8h
    add         v2.8h, v2.8h, v5.8h
    add         v2.8h, v2.8h, v4.8h
    uadalp      v29.4s, v2.8h
    ret
endfunc
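
// SSIM: pixel_ssim_4x4x2_core gathers, for two adjacent 4x4 blocks, the
// statistics s1 = sum(a), s2 = sum(b), ss = sum(a*a + b*b), s12 = sum(a*b),
// stored interleaved with st4. pixel_ssim_end4 then turns up to four such
// sums into SSIM terms; it is essentially a vectorization of the scalar
// reference (C, illustrative, 8-bit constants):
//
//   static float ssim_end1( int s1, int s2, int ss, int s12 )
//   {
//       static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
//       static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
//       int vars = ss*64 - s1*s1 - s2*s2;
//       int covar = s12*64 - s1*s2;
//       return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
//            / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
//   }
//
// The shl #6/#7 below are the *64 for vars and *128 for covar*2.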

function pixel_ssim_4x4x2_core_neon, export=1
    ld1         {v0.8b}, [x0], x1
    ld1         {v2.8b}, [x2], x3
    umull       v16.8h, v0.8b, v0.8b
    umull       v17.8h, v0.8b, v2.8b
    umull       v18.8h, v2.8b, v2.8b
    ld1         {v28.8b}, [x0], x1
    ld1         {v29.8b}, [x2], x3
    umull       v20.8h, v28.8b, v28.8b
    umull       v21.8h, v28.8b, v29.8b
    umull       v22.8h, v29.8b, v29.8b
    uaddlp      v16.4s, v16.8h
    uaddlp      v17.4s, v17.8h
    uaddl       v0.8h, v0.8b, v28.8b
    uadalp      v16.4s, v18.8h
    uaddl       v1.8h, v2.8b, v29.8b
    ld1         {v26.8b}, [x0], x1
    ld1         {v27.8b}, [x2], x3
    umull       v23.8h, v26.8b, v26.8b
    umull       v24.8h, v26.8b, v27.8b
    umull       v25.8h, v27.8b, v27.8b
    uadalp      v16.4s, v20.8h
    uaddw       v0.8h, v0.8h, v26.8b
    uadalp      v17.4s, v21.8h
    uaddw       v1.8h, v1.8h, v27.8b
    uadalp      v16.4s, v22.8h
    ld1         {v28.8b}, [x0], x1
    ld1         {v29.8b}, [x2], x3
    umull       v20.8h, v28.8b, v28.8b
    umull       v21.8h, v28.8b, v29.8b
    umull       v22.8h, v29.8b, v29.8b
    uadalp      v16.4s, v23.8h
    uaddw       v0.8h, v0.8h, v28.8b
    uadalp      v17.4s, v24.8h
    uaddw       v1.8h, v1.8h, v29.8b
    uadalp      v16.4s, v25.8h
    uadalp      v16.4s, v20.8h
    uadalp      v17.4s, v21.8h
    uadalp      v16.4s, v22.8h
    uaddlp      v0.4s, v0.8h
    uaddlp      v1.4s, v1.8h
    addp        v0.4s, v0.4s, v0.4s
    addp        v1.4s, v1.4s, v1.4s
    addp        v2.4s, v16.4s, v16.4s
    addp        v3.4s, v17.4s, v17.4s
    st4         {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
    ret
endfunc

function pixel_ssim_end4_neon, export=1
    mov         x5, #4
    ld1         {v16.4s,v17.4s}, [x0], #32
    ld1         {v18.4s,v19.4s}, [x1], #32
    mov         w4, #0x99bb
    subs        x2, x5, w2, uxtw
    mov         w3, #416                // ssim_c1 = .01*.01*255*255*64
    movk        w4, #0x03, lsl #16      // ssim_c2 = .03*.03*255*255*64*63
    add         v0.4s, v16.4s, v18.4s
    add         v1.4s, v17.4s, v19.4s
    add         v0.4s, v0.4s, v1.4s
    ld1         {v20.4s,v21.4s}, [x0], #32
    ld1         {v22.4s,v23.4s}, [x1], #32
    add         v2.4s, v20.4s, v22.4s
    add         v3.4s, v21.4s, v23.4s
    add         v1.4s, v1.4s, v2.4s
    ld1         {v16.4s}, [x0], #16
    ld1         {v18.4s}, [x1], #16
    add         v16.4s, v16.4s, v18.4s
    add         v2.4s, v2.4s, v3.4s
    add         v3.4s, v3.4s, v16.4s
    dup         v30.4s, w3
    dup         v31.4s, w4
    transpose   v4.4s, v5.4s, v0.4s, v1.4s
    transpose   v6.4s, v7.4s, v2.4s, v3.4s
    transpose   v0.2d, v2.2d, v4.2d, v6.2d
    transpose   v1.2d, v3.2d, v5.2d, v7.2d
    mul         v16.4s, v0.4s, v1.4s    // s1*s2
    mul         v0.4s, v0.4s, v0.4s
    mla         v0.4s, v1.4s, v1.4s     // s1*s1 + s2*s2
    shl         v3.4s, v3.4s, #7
    shl         v2.4s, v2.4s, #6
    add         v1.4s, v16.4s, v16.4s
    sub         v2.4s, v2.4s, v0.4s     // vars
    sub         v3.4s, v3.4s, v1.4s     // covar*2
    add         v0.4s, v0.4s, v30.4s
    add         v2.4s, v2.4s, v31.4s
    add         v1.4s, v1.4s, v30.4s
    add         v3.4s, v3.4s, v31.4s
    scvtf       v0.4s, v0.4s
    scvtf       v2.4s, v2.4s
    scvtf       v1.4s, v1.4s
    scvtf       v3.4s, v3.4s
    fmul        v0.4s, v0.4s, v2.4s
    fmul        v1.4s, v1.4s, v3.4s
    fdiv        v0.4s, v1.4s, v0.4s
    b.eq        1f
    movrel      x3, mask
    add         x3, x3, x2, lsl #2
    ld1         {v29.4s}, [x3]
    and         v0.16b, v0.16b, v29.16b
1:
    faddp       v0.4s, v0.4s, v0.4s
    faddp       s0, v0.2s
    ret
endfunc
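
// Everything below mirrors the 8-bit implementations for BIT_DEPTH > 8
// (10-bit in practice): pixels are 16 bits wide, so the element strides
// are doubled to byte strides on entry (lsl #1), loads use .8h/.4h
// arrangements, differences fit signed 16-bit lanes directly (plain sub
// instead of the widening usubl), and most accumulators move to 32-bit
// lanes earlier to absorb the extra 2 bits of range.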

#else /* BIT_DEPTH == 8 */

.macro SAD_START_4
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    uabdl       v16.4s, v0.4h, v1.4h
    uabdl2      v18.4s, v0.8h, v1.8h
.endm

.macro SAD_4
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    uabal       v16.4s, v0.4h, v1.4h
    uabal2      v18.4s, v0.8h, v1.8h
.endm

.macro SAD_START_8
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    uabdl       v16.4s, v0.4h, v1.4h
    uabdl2      v17.4s, v0.8h, v1.8h
    uabdl       v18.4s, v2.4h, v3.4h
    uabdl2      v19.4s, v2.8h, v3.8h
.endm

.macro SAD_8
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    uabal       v16.4s, v0.4h, v1.4h
    uabal2      v17.4s, v0.8h, v1.8h
    uabal       v18.4s, v2.4h, v3.4h
    uabal2      v19.4s, v2.8h, v3.8h
.endm

.macro SAD_START_16
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    uabdl       v16.4s, v0.4h, v2.4h
    uabdl2      v17.4s, v0.8h, v2.8h
    uabdl       v20.4s, v1.4h, v3.4h
    uabdl2      v21.4s, v1.8h, v3.8h
    uabdl       v18.4s, v4.4h, v6.4h
    uabdl2      v19.4s, v4.8h, v6.8h
    uabdl       v22.4s, v5.4h, v7.4h
    uabdl2      v23.4s, v5.8h, v7.8h
.endm

.macro SAD_16
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    uabal       v16.4s, v0.4h, v2.4h
    uabal2      v17.4s, v0.8h, v2.8h
    uabal       v20.4s, v1.4h, v3.4h
    uabal2      v21.4s, v1.8h, v3.8h
    uabal       v18.4s, v4.4h, v6.4h
    uabal2      v19.4s, v4.8h, v6.8h
    uabal       v22.4s, v5.4h, v7.4h
    uabal2      v23.4s, v5.8h, v7.8h
.endm

.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w
.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 8
    add         v20.4s, v20.4s, v21.4s
    add         v16.4s, v16.4s, v20.4s
    add         v22.4s, v22.4s, v23.4s
    add         v18.4s, v18.4s, v22.4s
.endif
.if \w > 4
    add         v16.4s, v16.4s, v17.4s
    add         v18.4s, v18.4s, v19.4s
.endif
    add         v16.4s, v16.4s, v18.4s
    uaddlv      s0, v16.8h
    fmov        w0, s0
    ret
endfunc
.endm
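
// Note on the final reduction: for the block sizes instantiated here each
// 32-bit lane of v16 holds at most 64 * 1023 < 2^16 at 10-bit depth, so
// the high halfword of every lane is zero and summing the register as
// eight unsigned halfwords (uaddlv s0, v16.8h) gives the exact total. The
// same headroom argument lets the SAD_X macros below accumulate whole rows
// in 16-bit lanes with uaba before a single widening reduction at the end.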

.macro SAD_X_4 x, first=uaba
    ld1         {v0.d}[0], [x0], x7
    ld1         {v1.d}[0], [x1], x5
    ld1         {v0.d}[1], [x0], x7
    ld1         {v1.d}[1], [x1], x5
    ld1         {v2.d}[0], [x2], x5
    ld1         {v2.d}[1], [x2], x5
    \first      v16.8h, v1.8h, v0.8h
    ld1         {v3.d}[0], [x3], x5
    ld1         {v3.d}[1], [x3], x5
    \first      v17.8h, v2.8h, v0.8h
.if \x == 4
    ld1         {v4.d}[0], [x4], x5
    ld1         {v4.d}[1], [x4], x5
.endif
    \first      v18.8h, v3.8h, v0.8h
.if \x == 4
    \first      v19.8h, v4.8h, v0.8h
.endif
.endm

.macro SAD_X_8 x, first=uaba
    ld1         {v0.8h}, [x0], x7
    ld1         {v1.8h}, [x1], x5
    \first      v16.8h, v1.8h, v0.8h
    ld1         {v2.8h}, [x2], x5
    ld1         {v3.8h}, [x3], x5
    \first      v17.8h, v2.8h, v0.8h
    ld1         {v5.8h}, [x0], x7
    ld1         {v1.8h}, [x1], x5
    \first      v18.8h, v3.8h, v0.8h
    ld1         {v2.8h}, [x2], x5
    uaba        v16.8h, v1.8h, v5.8h
    ld1         {v3.8h}, [x3], x5
    uaba        v17.8h, v2.8h, v5.8h
.if \x == 4
    ld1         {v4.8h}, [x4], x5
    ld1         {v1.8h}, [x4], x5
.endif
    uaba        v18.8h, v3.8h, v5.8h
.if \x == 4
    \first      v19.8h, v4.8h, v0.8h
    uaba        v19.8h, v1.8h, v5.8h
.endif
.endm

.macro SAD_X_16 x, first=uaba
    ld1         {v0.8h, v1.8h}, [x0], x7
    ld1         {v2.8h, v3.8h}, [x1], x5
    ld1         {v4.8h, v5.8h}, [x2], x5
    \first      v16.8h, v2.8h, v0.8h
    \first      v20.8h, v3.8h, v1.8h
    ld1         {v24.8h, v25.8h}, [x3], x5
    \first      v17.8h, v4.8h, v0.8h
    \first      v21.8h, v5.8h, v1.8h
    ld1         {v6.8h, v7.8h}, [x0], x7
    ld1         {v2.8h, v3.8h}, [x1], x5
    \first      v18.8h, v24.8h, v0.8h
    \first      v22.8h, v25.8h, v1.8h
    ld1         {v4.8h, v5.8h}, [x2], x5
    uaba        v16.8h, v2.8h, v6.8h
    uaba        v20.8h, v3.8h, v7.8h
    ld1         {v24.8h, v25.8h}, [x3], x5
    uaba        v17.8h, v4.8h, v6.8h
    uaba        v21.8h, v5.8h, v7.8h
.if \x == 4
    ld1         {v26.8h, v27.8h}, [x4], x5
    ld1         {v28.8h, v29.8h}, [x4], x5
.endif
    uaba        v18.8h, v24.8h, v6.8h
    uaba        v22.8h, v25.8h, v7.8h
.if \x == 4
    \first      v19.8h, v26.8h, v0.8h
    \first      v23.8h, v27.8h, v1.8h
    uaba        v19.8h, v28.8h, v6.8h
    uaba        v23.8h, v29.8h, v7.8h
.endif
.endm

.macro SAD_X_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE
    lsl         x5, x5, #1
    lsl         x7, x7, #1
    SAD_X_\w \x, uabd
.rept \h / 2 - 1
    SAD_X_\w \x
.endr
.if \w > 8
    add         v16.8h, v16.8h, v20.8h
    add         v17.8h, v17.8h, v21.8h
    add         v18.8h, v18.8h, v22.8h
.if \x == 4
    add         v19.8h, v19.8h, v23.8h
.endif
.endif
    // add up the sads
    uaddlv      s0, v16.8h
    uaddlv      s1, v17.8h
    uaddlv      s2, v18.8h
    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    uaddlv      s3, v19.8h
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm

function pixel_vsad_neon, export=1
    subs        w2, w2, #2
    lsl         x1, x1, #1
    ld1         {v0.8h, v1.8h}, [x0], x1
    ld1         {v2.8h, v3.8h}, [x0], x1
    uabd        v6.8h, v0.8h, v2.8h
    uabd        v7.8h, v1.8h, v3.8h
    b.le        2f
1:
    subs        w2, w2, #2
    ld1         {v0.8h, v1.8h}, [x0], x1
    uaba        v6.8h, v2.8h, v0.8h
    uaba        v7.8h, v3.8h, v1.8h
    ld1         {v2.8h, v3.8h}, [x0], x1
    b.lt        2f
    uaba        v6.8h, v0.8h, v2.8h
    uaba        v7.8h, v1.8h, v3.8h
    b.gt        1b
2:
    add         v5.8h, v6.8h, v7.8h
    uaddlv      s0, v5.8h
    fmov        w0, s0
    ret
endfunc

function pixel_asd8_neon, export=1
    sub         w4, w4, #2
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v0.8h}, [x0], x1
    ld1         {v1.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    sub         v16.8h, v0.8h, v1.8h
1:
    subs        w4, w4, #2
    ld1         {v4.8h}, [x0], x1
    ld1         {v5.8h}, [x2], x3
    sub         v17.8h, v2.8h, v3.8h
    sub         v18.8h, v4.8h, v5.8h
    add         v16.8h, v16.8h, v17.8h
    ld1         {v2.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    add         v16.8h, v16.8h, v18.8h
    b.gt        1b
    sub         v17.8h, v2.8h, v3.8h
    add         v16.8h, v16.8h, v17.8h
    saddlv      s0, v16.8h
    abs         v0.4s, v0.4s
    fmov        w0, s0
    ret
endfunc

.macro SSD_START_4
    ld1         {v16.d}[0], [x0], x1
    ld1         {v17.d}[0], [x2], x3
    sub         v2.4h, v16.4h, v17.4h
    ld1         {v16.d}[0], [x0], x1
    ld1         {v17.d}[0], [x2], x3
    smull       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_4
    sub         v2.4h, v16.4h, v17.4h
    ld1         {v16.d}[0], [x0], x1
    ld1         {v17.d}[0], [x2], x3
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_END_4
    sub         v2.4h, v16.4h, v17.4h
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_START_8
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x2], x3
    sub         v2.8h, v16.8h, v17.8h
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x2], x3
    smull       v0.4s, v2.4h, v2.4h
    smull2      v20.4s, v2.8h, v2.8h
.endm

.macro SSD_8
    sub         v2.8h, v16.8h, v17.8h
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x2], x3
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
.endm

.macro SSD_END_8
    sub         v2.8h, v16.8h, v17.8h
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
    add         v0.4s, v0.4s, v20.4s
.endm

.macro SSD_START_16
    ld1         {v16.8h, v17.8h}, [x0], x1
    ld1         {v18.8h, v19.8h}, [x2], x3
    sub         v2.8h, v16.8h, v18.8h
    sub         v3.8h, v17.8h, v19.8h
    ld1         {v16.8h, v17.8h}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    smull2      v20.4s, v2.8h, v2.8h
    ld1         {v18.8h, v19.8h}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v20.4s, v3.8h, v3.8h
.endm

.macro SSD_16
    sub         v2.8h, v16.8h, v18.8h
    sub         v3.8h, v17.8h, v19.8h
    ld1         {v16.8h, v17.8h}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
    ld1         {v18.8h, v19.8h}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v20.4s, v3.8h, v3.8h
.endm

.macro SSD_END_16
    sub         v2.8h, v16.8h, v18.8h
    sub         v3.8h, v17.8h, v19.8h
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v20.4s, v3.8h, v3.8h
    add         v0.4s, v0.4s, v20.4s
.endm

.macro SSD_FUNC w h
function pixel_ssd_\w\()x\h\()_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    SSD_START_\w
.rept \h-2
    SSD_\w
.endr
    SSD_END_\w
    addv        s0, v0.4s
    fmov        w0, s0
    ret
endfunc
.endm
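
// High-bit-depth SATD: 16-bit pixels already fit the transform's halfword
// lanes, so the load-and-diff steps use a plain sub on .8h vectors instead
// of the widening usubl of the 8-bit path; the Hadamard passes and the
// max(|a+b|, |a-b|) = |a| + |b| reduction are otherwise identical.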

function pixel_satd_4x4_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    sub         v0.8h, v0.8h, v1.8h
    sub         v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    zip1        v0.2d, v2.2d, v3.2d
    zip2        v1.2d, v2.2d, v3.2d
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.8h, v2.8h, v3.8h
    trn2        v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.4s, v2.4s, v3.4s
    trn2        v1.4s, v2.4s, v3.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    umax        v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    fmov        w0, s0
    ret
endfunc

function pixel_satd_4x8_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v5.d}[0], [x2], x3
    ld1         {v4.d}[0], [x0], x1
    ld1         {v7.d}[0], [x2], x3
    ld1         {v6.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    ld1         {v5.d}[1], [x2], x3
    ld1         {v4.d}[1], [x0], x1
    ld1         {v7.d}[1], [x2], x3
    ld1         {v6.d}[1], [x0], x1
    b           satd_4x8_8x4_end_neon
endfunc

function pixel_satd_8x4_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    ld1         {v5.8h}, [x2], x3
    ld1         {v4.8h}, [x0], x1
    ld1         {v7.8h}, [x2], x3
    ld1         {v6.8h}, [x0], x1
endfunc

function satd_4x8_8x4_end_neon
    sub         v0.8h, v0.8h, v1.8h
    sub         v1.8h, v2.8h, v3.8h
    sub         v2.8h, v4.8h, v5.8h
    sub         v3.8h, v6.8h, v7.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    trn1        v0.8h, v4.8h, v5.8h
    trn2        v1.8h, v4.8h, v5.8h
    trn1        v2.8h, v6.8h, v7.8h
    trn2        v3.8h, v6.8h, v7.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    trn1        v0.4s, v16.4s, v18.4s
    trn2        v1.4s, v16.4s, v18.4s
    trn1        v2.4s, v17.4s, v19.4s
    trn2        v3.4s, v17.4s, v19.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    umax        v0.8h, v0.8h, v1.8h
    umax        v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc

function pixel_satd_4x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v5.d}[0], [x2], x3
    ld1         {v4.d}[0], [x0], x1
    ld1         {v7.d}[0], [x2], x3
    ld1         {v6.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    ld1         {v5.d}[1], [x2], x3
    ld1         {v4.d}[1], [x0], x1
    ld1         {v7.d}[1], [x2], x3
    ld1         {v6.d}[1], [x0], x1
    sub         v16.8h, v0.8h, v1.8h
    sub         v17.8h, v2.8h, v3.8h
    sub         v18.8h, v4.8h, v5.8h
    sub         v19.8h, v6.8h, v7.8h
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v5.d}[0], [x2], x3
    ld1         {v4.d}[0], [x0], x1
    ld1         {v7.d}[0], [x2], x3
    ld1         {v6.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    ld1         {v5.d}[1], [x2], x3
    ld1         {v4.d}[1], [x0], x1
    ld1         {v7.d}[1], [x2], x3
    ld1         {v6.d}[1], [x0], x1
    sub         v20.8h, v0.8h, v1.8h
    sub         v21.8h, v2.8h, v3.8h
    sub         v22.8h, v4.8h, v5.8h
    sub         v23.8h, v6.8h, v7.8h
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    bl          satd_8x4v_8x8h_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    fmov        w0, s0
    ret         x4
endfunc

.macro load_diff_fly_8x8
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    sub         v16.8h, v0.8h, v1.8h
    ld1         {v5.8h}, [x2], x3
    ld1         {v4.8h}, [x0], x1
    sub         v17.8h, v2.8h, v3.8h
    ld1         {v7.8h}, [x2], x3
    ld1         {v6.8h}, [x0], x1
    sub         v18.8h, v4.8h, v5.8h
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    sub         v19.8h, v6.8h, v7.8h
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    sub         v20.8h, v0.8h, v1.8h
    ld1         {v5.8h}, [x2], x3
    ld1         {v4.8h}, [x0], x1
    sub         v21.8h, v2.8h, v3.8h
    ld1         {v7.8h}, [x2], x3
    ld1         {v6.8h}, [x0], x1
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    sub         v22.8h, v4.8h, v5.8h
    sub         v23.8h, v6.8h, v7.8h
.endm

function pixel_satd_8x8_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_8x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v0.8h, v1.8h
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v31.8h, v0.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_8x8_neon
    load_diff_fly_8x8
endfunc

// one vertical hadamard pass and two horizontal
function satd_8x4v_8x8h_neon
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    abs         v4.8h, v4.8h
    abs         v5.8h, v5.8h
    abs         v6.8h, v6.8h
    abs         v7.8h, v7.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    ret
endfunc

function pixel_ssd_nv12_core_neon, export=1
    sxtw        x8, w4
    add         x8, x8, #8
    and         x8, x8, #~15
    movi        v6.2d, #0
    movi        v7.2d, #0
    sub         x1, x1, x8, lsl #1
    sub         x3, x3, x8, lsl #1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    lsl         x4, x4, #1
1:
    subs        w8, w4, #32
    ld2         {v0.8h, v1.8h}, [x0], #32
    ld2         {v2.8h, v3.8h}, [x2], #32
    ld2         {v24.8h, v25.8h}, [x0], #32
    ld2         {v26.8h, v27.8h}, [x2], #32
    sub         v16.8h, v0.8h, v2.8h
    sub         v17.8h, v1.8h, v3.8h
    smull       v20.4s, v16.4h, v16.4h
    smull       v21.4s, v17.4h, v17.4h
    sub         v18.8h, v24.8h, v26.8h
    sub         v19.8h, v25.8h, v27.8h
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    b.eq        3f
2:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    ld2         {v0.8h, v1.8h}, [x0], #32
    ld2         {v2.8h, v3.8h}, [x2], #32
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
    subs        w8, w8, #32
    sub         v16.8h, v0.8h, v2.8h
    sub         v17.8h, v1.8h, v3.8h
    smlal       v20.4s, v16.4h, v16.4h
    smlal       v21.4s, v17.4h, v17.4h
    ld2         {v24.8h,v25.8h}, [x0], #32
    ld2         {v26.8h,v27.8h}, [x2], #32
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    sub         v18.8h, v24.8h, v26.8h
    sub         v19.8h, v25.8h, v27.8h
    b.gt        2b
3:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
4:
    subs        w5, w5, #1
    uaddw       v6.2d, v6.2d, v20.2s
    uaddw       v7.2d, v7.2d, v21.2s
    add         x0, x0, x1
    add         x2, x2, x3
    uaddw2      v6.2d, v6.2d, v20.4s
    uaddw2      v7.2d, v7.2d, v21.4s
    b.gt        1b
    addp        v6.2d, v6.2d, v7.2d
    st1         {v6.d}[0], [x6]
    st1         {v6.d}[1], [x7]
    ret
endfunc
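
// pixel_var/var2, high-bit-depth versions. The mean correction in var2 is
// sum^2 / (w*h): for the 8x8 pair that is a shift by 6, for 8x16 by 7,
// which is exactly what "lsr #6 + (\h >> 4)" evaluates to at assembly time
// (\h is 8 or 16).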

.macro pixel_var_8 h
function pixel_var_8x\h\()_neon, export=1
    lsl         x1, x1, #1
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x0], x1
    mov         x2, \h - 4
    umull       v1.4s, v16.4h, v16.4h
    umull2      v30.4s, v16.8h, v16.8h
    mov         v0.16b, v16.16b
    umull       v2.4s, v17.4h, v17.4h
    umull2      v31.4s, v17.8h, v17.8h
    add         v0.8h, v0.8h, v17.8h
    ld1         {v18.8h}, [x0], x1
    ld1         {v19.8h}, [x0], x1
1:
    subs        x2, x2, #4
    add         v0.8h, v0.8h, v18.8h
    umull       v24.4s, v18.4h, v18.4h
    umull2      v25.4s, v18.8h, v18.8h
    ld1         {v20.8h}, [x0], x1
    add         v0.8h, v0.8h, v19.8h
    umull       v26.4s, v19.4h, v19.4h
    umull2      v27.4s, v19.8h, v19.8h
    add         v1.4s, v1.4s, v24.4s
    add         v30.4s, v30.4s, v25.4s
    ld1         {v21.8h}, [x0], x1
    add         v0.8h, v0.8h, v20.8h
    umull       v28.4s, v20.4h, v20.4h
    umull2      v29.4s, v20.8h, v20.8h
    add         v2.4s, v2.4s, v26.4s
    add         v31.4s, v31.4s, v27.4s
    ld1         {v18.8h}, [x0], x1
    add         v0.8h, v0.8h, v21.8h
    umull       v3.4s, v21.4h, v21.4h
    umull2      v4.4s, v21.8h, v21.8h
    add         v1.4s, v1.4s, v28.4s
    add         v30.4s, v30.4s, v29.4s
    ld1         {v19.8h}, [x0], x1
    add         v2.4s, v2.4s, v3.4s
    add         v31.4s, v31.4s, v4.4s
    b.gt        1b
    add         v0.8h, v0.8h, v18.8h
    umull       v24.4s, v18.4h, v18.4h
    umull2      v25.4s, v18.8h, v18.8h
    add         v0.8h, v0.8h, v19.8h
    umull       v26.4s, v19.4h, v19.4h
    umull2      v27.4s, v19.8h, v19.8h
    add         v1.4s, v1.4s, v24.4s
    add         v30.4s, v30.4s, v25.4s
    add         v2.4s, v2.4s, v26.4s
    add         v31.4s, v31.4s, v27.4s
    b           var_end
endfunc
.endm

function pixel_var_16x16_neon, export=1
    lsl         x1, x1, #1
    ld1         {v16.8h, v17.8h}, [x0], x1
    ld1         {v18.8h, v19.8h}, [x0], x1
    mov         x2, #14
    umull       v1.4s, v16.4h, v16.4h
    umull2      v30.4s, v16.8h, v16.8h
    add         v0.8h, v16.8h, v17.8h
    umull       v2.4s, v17.4h, v17.4h
    umull2      v31.4s, v17.8h, v17.8h
1:
    subs        x2, x2, #2
    ld1         {v20.8h, v21.8h}, [x0], x1
    add         v0.8h, v0.8h, v18.8h
    umlal       v1.4s, v18.4h, v18.4h
    umlal2      v30.4s, v18.8h, v18.8h
    umlal       v2.4s, v19.4h, v19.4h
    umlal2      v31.4s, v19.8h, v19.8h
    add         v0.8h, v0.8h, v19.8h
    ld1         {v18.8h, v19.8h}, [x0], x1
    add         v0.8h, v0.8h, v20.8h
    umlal       v1.4s, v20.4h, v20.4h
    umlal2      v30.4s, v20.8h, v20.8h
    umlal       v2.4s, v21.4h, v21.4h
    umlal2      v31.4s, v21.8h, v21.8h
    add         v0.8h, v0.8h, v21.8h
    b.gt        1b
    add         v0.8h, v0.8h, v18.8h
    umlal       v1.4s, v18.4h, v18.4h
    umlal2      v30.4s, v18.8h, v18.8h
    umlal       v2.4s, v19.4h, v19.4h
    umlal2      v31.4s, v19.8h, v19.8h
    add         v0.8h, v0.8h, v19.8h
endfunc

function var_end
    add         v1.4s, v1.4s, v2.4s
    add         v30.4s, v30.4s, v31.4s
    add         v1.4s, v1.4s, v30.4s
    uaddlv      s0, v0.8h
    uaddlv      d1, v1.4s
    mov         w0, v0.s[0]
    mov         x1, v1.d[0]
    orr         x0, x0, x1, lsl #32
    ret
endfunc

.macro pixel_var2_8 h
function pixel_var2_8x\h\()_neon, export=1
    mov         x3, #32
    ld1         {v16.8h}, [x0], #16
    ld1         {v18.8h}, [x1], x3
    ld1         {v17.8h}, [x0], #16
    ld1         {v19.8h}, [x1], x3
    mov         x5, \h - 2
    sub         v0.8h, v16.8h, v18.8h
    sub         v1.8h, v17.8h, v19.8h
    ld1         {v16.8h}, [x0], #16
    ld1         {v18.8h}, [x1], x3
    smull       v2.4s, v0.4h, v0.4h
    smull2      v3.4s, v0.8h, v0.8h
    smull       v4.4s, v1.4h, v1.4h
    smull2      v5.4s, v1.8h, v1.8h
    sub         v6.8h, v16.8h, v18.8h
1:
    subs        x5, x5, #1
    ld1         {v17.8h}, [x0], #16
    ld1         {v19.8h}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    sub         v7.8h, v17.8h, v19.8h
    add         v0.8h, v0.8h, v6.8h
    ld1         {v16.8h}, [x0], #16
    ld1         {v18.8h}, [x1], x3
    smlal       v4.4s, v7.4h, v7.4h
    smlal2      v5.4s, v7.8h, v7.8h
    sub         v6.8h, v16.8h, v18.8h
    add         v1.8h, v1.8h, v7.8h
    b.gt        1b
    ld1         {v17.8h}, [x0], #16
    ld1         {v19.8h}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    sub         v7.8h, v17.8h, v19.8h
    add         v0.8h, v0.8h, v6.8h
    smlal       v4.4s, v7.4h, v7.4h
    add         v1.8h, v1.8h, v7.8h
    smlal2      v5.4s, v7.8h, v7.8h
    saddlv      s0, v0.8h
    saddlv      s1, v1.8h
    add         v2.4s, v2.4s, v3.4s
    add         v4.4s, v4.4s, v5.4s
    mov         w0, v0.s[0]
    mov         w1, v1.s[0]
    addv        s2, v2.4s
    addv        s4, v4.4s
    mul         w0, w0, w0
    mul         w1, w1, w1
    mov         w3, v2.s[0]
    mov         w4, v4.s[0]
    sub         w0, w3, w0, lsr #6 + (\h >> 4)
    sub         w1, w4, w1, lsr #6 + (\h >> 4)
    str         w3, [x2]
    add         w0, w0, w1
    str         w4, [x2, #4]
    ret
endfunc
.endm

function pixel_satd_16x8_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_16x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_16x4_neon
    uaddl       v30.4s, v0.4h, v1.4h
    uaddl       v31.4s, v2.4h, v3.4h
    uaddl2      v28.4s, v0.8h, v1.8h
    uaddl2      v29.4s, v2.8h, v3.8h
    add         v30.4s, v30.4s, v28.4s
    add         v31.4s, v31.4s, v29.4s
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    uaddw       v30.4s, v30.4s, v0.4h
    uaddw2      v30.4s, v30.4s, v0.8h
    uaddw       v31.4s, v31.4s, v1.4h
    uaddw2      v31.4s, v31.4s, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    uaddw       v30.4s, v30.4s, v0.4h
    uaddw2      v30.4s, v30.4s, v0.8h
    uaddw       v31.4s, v31.4s, v1.4h
    uaddw2      v31.4s, v31.4s, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    uaddw       v30.4s, v30.4s, v0.4h
    uaddw2      v30.4s, v30.4s, v0.8h
    uaddw       v31.4s, v31.4s, v1.4h
    uaddw2      v31.4s, v31.4s, v1.8h
    add         v0.4s, v30.4s, v31.4s
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_16x4_neon
    ld1         {v0.8h, v1.8h}, [x2], x3
    ld1         {v2.8h, v3.8h}, [x0], x1
    sub         v16.8h, v2.8h, v0.8h
    sub         v20.8h, v3.8h, v1.8h
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x0], x1
    sub         v17.8h, v6.8h, v4.8h
    sub         v21.8h, v7.8h, v5.8h
    ld1         {v0.8h, v1.8h}, [x2], x3
    ld1         {v2.8h, v3.8h}, [x0], x1
    sub         v18.8h, v2.8h, v0.8h
    sub         v22.8h, v3.8h, v1.8h
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x0], x1
    sub         v19.8h, v6.8h, v4.8h
    sub         v23.8h, v7.8h, v5.8h
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    b           satd_8x4v_8x8h_neon
endfunc
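
// High-bit-depth SA8D: identical structure to the 8-bit version; the only
// wrinkle is that the 16x16 wrappers step 8 pixels to the right with
// "add x0, x0, #16" because each pixel is now two bytes.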

function pixel_sa8d_8x8_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          pixel_sa8d_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

function pixel_sa8d_16x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          pixel_sa8d_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #16
    add         x2, x2, #16
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    add         v0.4s, v30.4s, v31.4s
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

.macro sa8d_satd_8x8 satd=
function pixel_sa8d_\satd\()8x8_neon
    load_diff_fly_8x8
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v24.8h, v25.8h, v0.8h, v1.8h
    SUMSUB_AB   v26.8h, v27.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h, v1.8h, v4.8h, v5.8h
    SUMSUB_AB   v2.8h, v3.8h, v6.8h, v7.8h
    transpose   v4.4s, v6.4s, v24.4s, v26.4s
    transpose   v5.4s, v7.4s, v25.4s, v27.4s
    transpose   v24.4s, v26.4s, v0.4s, v2.4s
    transpose   v25.4s, v27.4s, v1.4s, v3.4s
    abs         v0.8h, v4.8h
    abs         v1.8h, v5.8h
    abs         v2.8h, v6.8h
    abs         v3.8h, v7.8h
    abs         v4.8h, v24.8h
    abs         v5.8h, v25.8h
    abs         v6.8h, v26.8h
    abs         v7.8h, v27.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    add         v26.8h, v0.8h, v1.8h
    add         v27.8h, v2.8h, v3.8h
.endif
    SUMSUB_AB   v0.8h, v16.8h, v16.8h, v20.8h
    SUMSUB_AB   v1.8h, v17.8h, v17.8h, v21.8h
    SUMSUB_AB   v2.8h, v18.8h, v18.8h, v22.8h
    SUMSUB_AB   v3.8h, v19.8h, v19.8h, v23.8h
    transpose   v20.8h, v21.8h, v16.8h, v17.8h
    transpose   v4.8h, v5.8h, v0.8h, v1.8h
    transpose   v22.8h, v23.8h, v18.8h, v19.8h
    transpose   v6.8h, v7.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v20.8h, v21.8h
    SUMSUB_AB   v24.8h, v25.8h, v4.8h, v5.8h
    SUMSUB_AB   v0.8h, v1.8h, v22.8h, v23.8h
    SUMSUB_AB   v4.8h, v5.8h, v6.8h, v7.8h
    transpose   v20.4s, v22.4s, v2.4s, v0.4s
    transpose   v21.4s, v23.4s, v3.4s, v1.4s
    transpose   v16.4s, v18.4s, v24.4s, v4.4s
    transpose   v17.4s, v19.4s, v25.4s, v5.4s
    SUMSUB_AB   v0.8h, v2.8h, v20.8h, v22.8h
    SUMSUB_AB   v1.8h, v3.8h, v21.8h, v23.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    transpose   v16.2d, v20.2d, v0.2d, v4.2d
    transpose   v17.2d, v21.2d, v1.2d, v5.2d
    transpose   v18.2d, v22.2d, v2.2d, v6.2d
    transpose   v19.2d, v23.2d, v3.2d, v7.2d
    abs         v16.8h, v16.8h
    abs         v20.8h, v20.8h
    abs         v17.8h, v17.8h
    abs         v21.8h, v21.8h
    abs         v18.8h, v18.8h
    abs         v22.8h, v22.8h
    abs         v19.8h, v19.8h
    abs         v23.8h, v23.8h
    umax        v16.8h, v16.8h, v20.8h
    umax        v17.8h, v17.8h, v21.8h
    umax        v18.8h, v18.8h, v22.8h
    umax        v19.8h, v19.8h, v23.8h
    add         v0.8h, v16.8h, v17.8h
    add         v1.8h, v18.8h, v19.8h
    ret
endfunc
.endm

function pixel_sa8d_satd_16x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          pixel_sa8d_satd_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    uaddlp      v28.4s, v26.8h
    uaddlp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #16
    add         x2, x2, #16
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    add         v0.4s, v30.4s, v31.4s   // sa8d
    add         v1.4s, v28.4s, v29.4s   // satd
    addv        s0, v0.4s
    addv        s1, v1.4s
    urshr       v0.4s, v0.4s, #1
    fmov        w0, s0
    fmov        w1, s1
    add         x0, x0, x1, lsl #32
    ret         x4
endfunc

.macro HADAMARD_AC w h
function pixel_hadamard_ac_\w\()x\h\()_neon, export=1
    movrel      x5, mask_ac_4_8
    mov         x4, x30
    lsl         x1, x1, #1
    ld1         {v30.8h,v31.8h}, [x5]
    movi        v28.16b, #0
    movi        v29.16b, #0
    bl          hadamard_ac_8x8_neon
.if \h > 8
    bl          hadamard_ac_8x8_neon
.endif
.if \w > 8
    sub         x0, x0, x1, lsl #3
    add         x0, x0, #16
    bl          hadamard_ac_8x8_neon
.endif
.if \w * \h == 256
    sub         x0, x0, x1, lsl #4
    bl          hadamard_ac_8x8_neon
.endif
    addv        s1, v29.4s
    addv        s0, v28.4s
    mov         w1, v1.s[0]
    mov         w0, v0.s[0]
    lsr         w1, w1, #2
    lsr         w0, w0, #1
    orr         x0, x0, x1, lsl #32
    ret         x4
endfunc
.endm

// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
function hadamard_ac_8x8_neon
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x0], x1
    ld1         {v18.8h}, [x0], x1
    ld1         {v19.8h}, [x0], x1
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    ld1         {v20.8h}, [x0], x1
    ld1         {v21.8h}, [x0], x1
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    ld1         {v22.8h}, [x0], x1
    ld1         {v23.8h}, [x0], x1
    SUMSUB_AB   v4.8h, v5.8h, v20.8h, v21.8h
    SUMSUB_AB   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    abs         v0.8h, v16.8h
    abs         v4.8h, v20.8h
    abs         v1.8h, v17.8h
    abs         v5.8h, v21.8h
    abs         v2.8h, v18.8h
    abs         v6.8h, v22.8h
    abs         v3.8h, v19.8h
    abs         v7.8h, v23.8h
    add         v0.8h, v0.8h, v4.8h
    add         v1.8h, v1.8h, v5.8h
    and         v0.16b, v0.16b, v30.16b
    add         v2.8h, v2.8h, v6.8h
    add         v3.8h, v3.8h, v7.8h
    add         v0.8h, v0.8h, v2.8h
    add         v1.8h, v1.8h, v3.8h
    uadalp      v28.4s, v0.8h
    uadalp      v28.4s, v1.8h
    SUMSUB_AB   v6.8h, v7.8h, v23.8h, v19.8h
    SUMSUB_AB   v4.8h, v5.8h, v22.8h, v18.8h
    SUMSUB_AB   v2.8h, v3.8h, v21.8h, v17.8h
    SUMSUB_AB   v1.8h, v0.8h, v16.8h, v20.8h
    transpose   v16.2d, v17.2d, v6.2d, v7.2d
    transpose   v18.2d, v19.2d, v4.2d, v5.2d
    transpose   v20.2d, v21.2d, v2.2d, v3.2d
    abs         v16.8h, v16.8h
    abs         v17.8h, v17.8h
    abs         v18.8h, v18.8h
    abs         v19.8h, v19.8h
    abs         v20.8h, v20.8h
    abs         v21.8h, v21.8h
    transpose   v7.2d, v6.2d, v1.2d, v0.2d
    umax        v3.8h, v16.8h, v17.8h
    umax        v2.8h, v18.8h, v19.8h
    umax        v1.8h, v20.8h, v21.8h
    SUMSUB_AB   v4.8h, v5.8h, v7.8h, v6.8h
    add         v2.8h, v2.8h, v3.8h
    add         v2.8h, v2.8h, v1.8h
    and         v4.16b, v4.16b, v31.16b
    add         v2.8h, v2.8h, v2.8h
    abs         v5.8h, v5.8h
    abs         v4.8h, v4.8h
    add         v2.8h, v2.8h, v5.8h
    add         v2.8h, v2.8h, v4.8h
    uadalp      v29.4s, v2.8h
    ret
endfunc

function pixel_ssim_4x4x2_core_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v0.8h}, [x0], x1
    ld1         {v2.8h}, [x2], x3
    ld1         {v28.8h}, [x0], x1
    ld1         {v29.8h}, [x2], x3
    umull       v16.4s, v0.4h, v0.4h
    umull2      v17.4s, v0.8h, v0.8h
    umull       v18.4s, v0.4h, v2.4h
    umull2      v19.4s, v0.8h, v2.8h
    umlal       v16.4s, v2.4h, v2.4h
    umlal2      v17.4s, v2.8h, v2.8h
    ld1         {v26.8h}, [x0], x1
    ld1         {v27.8h}, [x2], x3
    umlal       v16.4s, v28.4h, v28.4h
    umlal2      v17.4s, v28.8h, v28.8h
    umlal       v18.4s, v28.4h, v29.4h
    umlal2      v19.4s, v28.8h, v29.8h
    umlal       v16.4s, v29.4h, v29.4h
    umlal2      v17.4s, v29.8h, v29.8h
    add         v0.8h, v0.8h, v28.8h
    add         v1.8h, v2.8h, v29.8h
    umlal       v16.4s, v26.4h, v26.4h
    umlal2      v17.4s, v26.8h, v26.8h
    umlal       v18.4s, v26.4h, v27.4h
    umlal2      v19.4s, v26.8h, v27.8h
    umlal       v16.4s, v27.4h, v27.4h
    umlal2      v17.4s, v27.8h, v27.8h
    ld1         {v28.8h}, [x0], x1
    ld1         {v29.8h}, [x2], x3
    add         v0.8h, v0.8h, v26.8h
    add         v1.8h, v1.8h, v27.8h
    umlal       v16.4s, v28.4h, v28.4h
    umlal2      v17.4s, v28.8h, v28.8h
    umlal       v18.4s, v28.4h, v29.4h
    umlal2      v19.4s, v28.8h, v29.8h
    umlal       v16.4s, v29.4h, v29.4h
    umlal2      v17.4s, v29.8h, v29.8h
    add         v0.8h, v0.8h, v28.8h
    add         v1.8h, v1.8h, v29.8h
    addp        v16.4s, v16.4s, v17.4s
    addp        v17.4s, v18.4s, v19.4s
    uaddlp      v0.4s, v0.8h
    uaddlp      v1.4s, v1.8h
    addp        v0.4s, v0.4s, v0.4s
    addp        v1.4s, v1.4s, v1.4s
    addp        v2.4s, v16.4s, v16.4s
    addp        v3.4s, v17.4s, v17.4s
    st4         {v0.2s, v1.2s, v2.2s, v3.2s}, [x4]
    ret
endfunc

function pixel_ssim_end4_neon, export=1
    mov         x5, #4
    ld1         {v16.4s, v17.4s}, [x0], #32
    ld1         {v18.4s, v19.4s}, [x1], #32
    subs        x2, x5, w2, uxtw
    // These values must be stored in float, since with 10 bit depth edge
    // cases may overflow. The hexadecimal values are the IEEE-754
    // representation of the floating point numbers.
    ldr         w3, =0x45d14e49         // ssim_c1 = .01*.01*1023*1023*64
    ldr         w4, =0x4a67ca32         // ssim_c2 = .03*.03*1023*1023*64*63
    add         v0.4s, v16.4s, v18.4s
    add         v1.4s, v17.4s, v19.4s
    add         v0.4s, v0.4s, v1.4s
    ld1         {v20.4s, v21.4s}, [x0], #32
    ld1         {v22.4s, v23.4s}, [x1], #32
    add         v2.4s, v20.4s, v22.4s
    add         v3.4s, v21.4s, v23.4s
    add         v1.4s, v1.4s, v2.4s
    ld1         {v16.4s}, [x0], #16
    ld1         {v18.4s}, [x1], #16
    add         v16.4s, v16.4s, v18.4s
    add         v2.4s, v2.4s, v3.4s
    add         v3.4s, v3.4s, v16.4s
    dup         v30.4s, w3
    dup         v31.4s, w4
    transpose   v4.4s, v5.4s, v0.4s, v1.4s
    transpose   v6.4s, v7.4s, v2.4s, v3.4s
    transpose   v0.2d, v2.2d, v4.2d, v6.2d
    transpose   v1.2d, v3.2d, v5.2d, v7.2d
    // Conversion to floating point must occur earlier than in the 8-bit
    // case because of the range overflow
    scvtf       v0.4s, v0.4s
    scvtf       v2.4s, v2.4s
    scvtf       v1.4s, v1.4s
    scvtf       v3.4s, v3.4s
    fmul        v16.4s, v0.4s, v1.4s    // s1*s2
    fmul        v0.4s, v0.4s, v0.4s
    fmla        v0.4s, v1.4s, v1.4s     // s1*s1 + s2*s2
    // IEEE-754 hexadecimal representation of multipliers
    ldr         w3, =0x42800000         // 64
    ldr         w4, =0x43000000         // 128
    dup         v28.4s, w3
    dup         v29.4s, w4
    fmul        v2.4s, v2.4s, v28.4s
    fmul        v3.4s, v3.4s, v29.4s
    fadd        v1.4s, v16.4s, v16.4s
    fsub        v2.4s, v2.4s, v0.4s     // vars
    fsub        v3.4s, v3.4s, v1.4s     // covar*2
    fadd        v0.4s, v0.4s, v30.4s
    fadd        v2.4s, v2.4s, v31.4s
    fadd        v1.4s, v1.4s, v30.4s
    fadd        v3.4s, v3.4s, v31.4s
    fmul        v0.4s, v0.4s, v2.4s
    fmul        v1.4s, v1.4s, v3.4s
    fdiv        v0.4s, v1.4s, v0.4s
    b.eq        1f
    movrel      x3, mask
    add         x3, x3, x2, lsl #2
    ld1         {v29.4s}, [x3]
    and         v0.16b, v0.16b, v29.16b
1:
    faddp       v0.4s, v0.4s, v0.4s
    faddp       s0, v0.2s
    ret
endfunc

#endif /* BIT_DEPTH == 8 */

SAD_FUNC  4,  4
SAD_FUNC  4,  8
SAD_FUNC  4,  16
SAD_FUNC  8,  4
SAD_FUNC  8,  8
SAD_FUNC  8,  16
SAD_FUNC  16, 8
SAD_FUNC  16, 16

SAD_X_FUNC 3, 4,  4
SAD_X_FUNC 3, 4,  8
SAD_X_FUNC 3, 8,  4
SAD_X_FUNC 3, 8,  8
SAD_X_FUNC 3, 8,  16
SAD_X_FUNC 3, 16, 8
SAD_X_FUNC 3, 16, 16
SAD_X_FUNC 4, 4,  4
SAD_X_FUNC 4, 4,  8
SAD_X_FUNC 4, 8,  4
SAD_X_FUNC 4, 8,  8
SAD_X_FUNC 4, 8,  16
SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16

SSD_FUNC  4, 4
SSD_FUNC  4, 8
SSD_FUNC  4, 16
SSD_FUNC  8, 4
SSD_FUNC  8, 8
SSD_FUNC  8, 16
SSD_FUNC  16, 8
SSD_FUNC  16, 16

pixel_var_8  8
pixel_var_8  16

pixel_var2_8 8
pixel_var2_8 16

sa8d_satd_8x8
sa8d_satd_8x8 satd_

HADAMARD_AC 8,  8
HADAMARD_AC 8,  16
HADAMARD_AC 16, 8
HADAMARD_AC 16, 16

#if BIT_DEPTH == 8 && HAVE_DOTPROD
ENABLE_DOTPROD
SAD_FUNC_DOTPROD 16, 8
SAD_FUNC_DOTPROD 16, 16
SAD_X_DOTPROD_FUNC 3, 16, 8
SAD_X_DOTPROD_FUNC 3, 16, 16
SAD_X_DOTPROD_FUNC 4, 16, 8
SAD_X_DOTPROD_FUNC 4, 16, 16
SSD_DOTPROD_FUNC 8, 4
SSD_DOTPROD_FUNC 8, 8
SSD_DOTPROD_FUNC 8, 16
SSD_DOTPROD_FUNC 16, 8
SSD_DOTPROD_FUNC 16, 16
DISABLE_DOTPROD
#endif // BIT_DEPTH == 8 && HAVE_DOTPROD