/*****************************************************************************
 * pixel.S: aarch64 pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad
 *          Janne Grunau
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"
#include "pixel-a-common.S"

const mask
.rept 16
.byte 0xff
.endr
.rept 16
.byte 0x00
.endr
endconst

.macro SUMSUBL_AB sum, sub, a, b
    uaddl       \sum, \a, \b
    usubl       \sub, \a, \b
.endm

#if BIT_DEPTH == 8
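
// SAD (sum of absolute differences) between a block at x0 (stride x1) and a
// reference block at x2 (stride x3). Illustrative C model only, not part of
// the build; the helper name is hypothetical (assumes <stdint.h>/<stdlib.h>):
//
//   static int sad_wxh( const uint8_t *pix1, intptr_t stride1,
//                       const uint8_t *pix2, intptr_t stride2,
//                       int w, int h )
//   {
//       int sum = 0;
//       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
//           for( int x = 0; x < w; x++ )
//               sum += abs( pix1[x] - pix2[x] );
//       return sum;
//   }
//
// The macros below unroll two rows per iteration and accumulate into 16-bit
// lanes (uabdl for the first row pair, uabal thereafter); the dotprod
// variants accumulate |a-b| into 32-bit lanes with udot against a vector of
// ones.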

.macro SAD_START_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
.endm

.macro SAD_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
.endm

.macro SAD_START_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
    uabdl       v17.8h, v2.8b, v3.8b
.endm

.macro SAD_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
    uabal       v17.8h, v2.8b, v3.8b
.endm

.macro SAD_START_16, dotprod=0
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
.if \dotprod == 0
    uabdl       v16.8h, v0.8b, v1.8b
    uabdl2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b, v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.else
    movi        v18.4s, #0x0
    movi        v19.16b, #0x1
    uabd        v16.16b, v0.16b, v1.16b
    uabd        v17.16b, v2.16b, v3.16b
    udot        v18.4s, v16.16b, v19.16b
    udot        v18.4s, v17.16b, v19.16b
.endif
.endm

.macro SAD_16, dotprod=0
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
.if \dotprod == 0
    uabal       v16.8h, v0.8b, v1.8b
    uabal2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b, v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.else
    uabd        v16.16b, v0.16b, v1.16b
    uabd        v17.16b, v2.16b, v3.16b
    udot        v18.4s, v16.16b, v19.16b
    udot        v18.4s, v17.16b, v19.16b
.endif
.endm

.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w
.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 4
    add         v16.8h, v16.8h, v17.8h
.endif
    uaddlv      s0, v16.8h
    fmov        w0, s0
    ret
endfunc
.endm

.macro SAD_FUNC_DOTPROD w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon_dotprod, export=1
    SAD_START_\w 1
.rept \h / 2 - 1
    SAD_\w 1
.endr
    addv        s0, v18.4s
    fmov        w0, s0
    ret
endfunc
.endm
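
// pixel_sad_x3/x4: SAD of one fenc block (x0, FENC_STRIDE) against 3 or 4
// reference candidates (x1-x4) sharing one stride (x5), with the 32-bit
// sums written to the scores array in x6. For x3 the stride arrives in x4
// and the output pointer in x5, hence the register shuffle at function
// entry. The first row pair uses uabdl (overwrite) and later pairs uabal
// (accumulate), selected through the \first macro argument.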

.macro SAD_X_4 x, first=uabal
    ld1         {v0.s}[0], [x0], x7
    ld1         {v1.s}[0], [x1], x5
    ld1         {v0.s}[1], [x0], x7
    ld1         {v1.s}[1], [x1], x5
    ld1         {v2.s}[0], [x2], x5
    ld1         {v2.s}[1], [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v3.s}[0], [x3], x5
    ld1         {v3.s}[1], [x3], x5
    \first      v17.8h, v2.8b, v0.8b
.if \x == 4
    ld1         {v4.s}[0], [x4], x5
    ld1         {v4.s}[1], [x4], x5
.endif
    \first      v18.8h, v3.8b, v0.8b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
.endif
.endm

.macro SAD_X_8 x, first=uabal
    ld1         {v0.8b}, [x0], x7
    ld1         {v1.8b}, [x1], x5
    ld1         {v2.8b}, [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v3.8b}, [x3], x5
    \first      v17.8h, v2.8b, v0.8b
    ld1         {v5.8b}, [x0], x7
    ld1         {v1.8b}, [x1], x5
    \first      v18.8h, v3.8b, v0.8b
    ld1         {v2.8b}, [x2], x5
    uabal       v16.8h, v1.8b, v5.8b
    ld1         {v3.8b}, [x3], x5
    uabal       v17.8h, v2.8b, v5.8b
.if \x == 4
    ld1         {v4.8b}, [x4], x5
    ld1         {v1.8b}, [x4], x5
.endif
    uabal       v18.8h, v3.8b, v5.8b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
    uabal       v19.8h, v1.8b, v5.8b
.endif
.endm

.macro SAD_X_16 x, first=uabal
    ld1         {v0.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    ld1         {v2.16b}, [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    \first\()2  v20.8h, v1.16b, v0.16b
    ld1         {v3.16b}, [x3], x5
    \first      v17.8h, v2.8b, v0.8b
    \first\()2  v21.8h, v2.16b, v0.16b
    ld1         {v5.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    \first      v18.8h, v3.8b, v0.8b
    \first\()2  v22.8h, v3.16b, v0.16b
    ld1         {v2.16b}, [x2], x5
    uabal       v16.8h, v1.8b, v5.8b
    uabal2      v20.8h, v1.16b, v5.16b
    ld1         {v3.16b}, [x3], x5
    uabal       v17.8h, v2.8b, v5.8b
    uabal2      v21.8h, v2.16b, v5.16b
.if \x == 4
    ld1         {v4.16b}, [x4], x5
    ld1         {v1.16b}, [x4], x5
.endif
    uabal       v18.8h, v3.8b, v5.8b
    uabal2      v22.8h, v3.16b, v5.16b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
    \first\()2  v23.8h, v4.16b, v0.16b
    uabal       v19.8h, v1.8b, v5.8b
    uabal2      v23.8h, v1.16b, v5.16b
.endif
.endm

.macro SAD_X_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE
    SAD_X_\w \x, uabdl
.rept \h / 2 - 1
    SAD_X_\w \x
.endr
.if \w > 8
    add         v16.8h, v16.8h, v20.8h
    add         v17.8h, v17.8h, v21.8h
    add         v18.8h, v18.8h, v22.8h
.if \x == 4
    add         v19.8h, v19.8h, v23.8h
.endif
.endif
    // add up the sads
    uaddlv      s0, v16.8h
    uaddlv      s1, v17.8h
    uaddlv      s2, v18.8h
    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    uaddlv      s3, v19.8h
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm

.macro SAD_X_DOTPROD_16 x
    ld1         {v0.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    ld1         {v2.16b}, [x2], x5
    uabd        v20.16b, v1.16b, v0.16b
    uabd        v22.16b, v2.16b, v0.16b
    ld1         {v5.16b}, [x0], x7
    udot        v16.4s, v20.16b, v28.16b
    udot        v17.4s, v22.16b, v28.16b
    ld1         {v3.16b}, [x3], x5
    ld1         {v1.16b}, [x1], x5
    uabd        v24.16b, v3.16b, v0.16b
    uabd        v21.16b, v1.16b, v5.16b
    ld1         {v2.16b}, [x2], x5
    ld1         {v3.16b}, [x3], x5
    udot        v18.4s, v24.16b, v28.16b
    udot        v16.4s, v21.16b, v28.16b
    uabd        v23.16b, v2.16b, v5.16b
    uabd        v25.16b, v3.16b, v5.16b
    udot        v17.4s, v23.16b, v28.16b
    udot        v18.4s, v25.16b, v28.16b
.if \x == 4
    ld1         {v4.16b}, [x4], x5
    ld1         {v1.16b}, [x4], x5
    uabd        v26.16b, v4.16b, v0.16b
    uabd        v27.16b, v1.16b, v5.16b
    udot        v19.4s, v26.16b, v28.16b
    udot        v19.4s, v27.16b, v28.16b
.endif
.endm

.macro SAD_X_DOTPROD_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon_dotprod, export=1
    movi        v16.4s, #0x0
    movi        v17.4s, #0x0
    movi        v18.4s, #0x0
.if \x == 4
    movi        v19.4s, #0x0
.endif
    movi        v28.16b, #0x1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE
    SAD_X_DOTPROD_\w \x
.rept \h / 2 - 1
    SAD_X_DOTPROD_\w \x
.endr
    addv        s0, v16.4s
    addv        s1, v17.4s
    addv        s2, v18.4s
.if \x == 4
    addv        s3, v19.4s
.endif
    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm
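
// pixel_vsad: SAD between vertically adjacent rows of a single
// 16-pixel-wide plane (x0/x1) over w2 rows, a cheap vertical-activity
// measure. pixel_asd8: absolute value of the *signed* sum of differences
// over an 8-wide block, i.e. |sum(pix1 - pix2)| rather than sum(|diff|).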

function pixel_vsad_neon, export=1
    subs        w2, w2, #2
    ld1         {v0.16b}, [x0], x1
    ld1         {v1.16b}, [x0], x1
    uabdl       v6.8h, v0.8b, v1.8b
    uabdl2      v7.8h, v0.16b, v1.16b
    b.le        2f
1:
    subs        w2, w2, #2
    ld1         {v0.16b}, [x0], x1
    uabal       v6.8h, v1.8b, v0.8b
    uabal2      v7.8h, v1.16b, v0.16b
    ld1         {v1.16b}, [x0], x1
    b.lt        2f
    uabal       v6.8h, v0.8b, v1.8b
    uabal2      v7.8h, v0.16b, v1.16b
    b.gt        1b
2:
    add         v5.8h, v6.8h, v7.8h
    uaddlv      s0, v5.8h
    fmov        w0, s0
    ret
endfunc

#if HAVE_DOTPROD
ENABLE_DOTPROD
function pixel_vsad_neon_dotprod, export=1
    ld1         {v0.16b}, [x0], x1
    ld1         {v1.16b}, [x0], x1
    subs        w2, w2, #2
    movi        v3.16b, #0x1
    movi        v6.4s, #0x0
    uabd        v5.16b, v0.16b, v1.16b
    udot        v6.4s, v5.16b, v3.16b
    b.le        2f
1:
    ld1         {v0.16b}, [x0], x1
    subs        w2, w2, #2
    uabd        v5.16b, v0.16b, v1.16b
    ld1         {v1.16b}, [x0], x1
    udot        v6.4s, v5.16b, v3.16b
    b.lt        2f
    uabd        v5.16b, v0.16b, v1.16b
    udot        v6.4s, v5.16b, v3.16b
    b.gt        1b
2:
    addv        s0, v6.4s
    fmov        w0, s0
    ret
endfunc
DISABLE_DOTPROD
#endif // HAVE_DOTPROD

function pixel_asd8_neon, export=1
    sub         w4, w4, #2
    ld1         {v0.8b}, [x0], x1
    ld1         {v1.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    usubl       v16.8h, v0.8b, v1.8b
1:
    subs        w4, w4, #2
    ld1         {v4.8b}, [x0], x1
    ld1         {v5.8b}, [x2], x3
    usubl       v17.8h, v2.8b, v3.8b
    usubl       v18.8h, v4.8b, v5.8b
    add         v16.8h, v16.8h, v17.8h
    ld1         {v2.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    add         v16.8h, v16.8h, v18.8h
    b.gt        1b
    usubl       v17.8h, v2.8b, v3.8b
    add         v16.8h, v16.8h, v17.8h
    saddlv      s0, v16.8h
    abs         v0.2s, v0.2s
    fmov        w0, s0
    ret
endfunc
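
// SSD (sum of squared differences). Illustrative C model, not part of the
// build; the helper name is hypothetical:
//
//   static int ssd_wxh( const uint8_t *pix1, intptr_t stride1,
//                       const uint8_t *pix2, intptr_t stride2,
//                       int w, int h )
//   {
//       int sum = 0;
//       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
//           for( int x = 0; x < w; x++ )
//           {
//               int d = pix1[x] - pix2[x];
//               sum += d * d;
//           }
//       return sum;
//   }
//
// The plain NEON path squares 16-bit differences with smull/smlal into
// 32-bit accumulators; the dotprod path computes udot(|a-b|, |a-b|), which
// is the same value since squaring discards the sign of the difference.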

.macro SSD_START_4
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    smull       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_4
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_END_4
    usubl       v2.8h, v16.8b, v17.8b
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_START_8
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.8b}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    ld1         {v17.8b}, [x2], x3
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_8
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.8b}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    ld1         {v17.8b}, [x2], x3
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_END_8
    usubl       v2.8h, v16.8b, v17.8b
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_START_16
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    ld1         {v16.16b}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    smull2      v1.4s, v2.8h, v2.8h
    ld1         {v17.16b}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
.endm

.macro SSD_16
    usubl       v2.8h, v16.8b, v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    ld1         {v16.16b}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v1.4s, v2.8h, v2.8h
    ld1         {v17.16b}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
.endm

.macro SSD_END_16
    usubl       v2.8h, v16.8b, v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v1.4s, v2.8h, v2.8h
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
    add         v0.4s, v0.4s, v1.4s
.endm

.macro SSD_FUNC w h
function pixel_ssd_\w\()x\h\()_neon, export=1
    SSD_START_\w
.rept \h-2
    SSD_\w
.endr
    SSD_END_\w
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    ret
endfunc
.endm

.macro SSD_DOTPROD_8
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x2], x3
    ld1         {v18.8b}, [x0], x1
    uabd        v20.8b, v16.8b, v17.8b
    ld1         {v19.8b}, [x2], x3
    uabd        v21.8b, v18.8b, v19.8b
    udot        v22.2s, v20.8b, v20.8b
    udot        v22.2s, v21.8b, v21.8b
.endm

.macro SSD_DOTPROD_16
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x2], x3
    ld1         {v18.16b}, [x0], x1
    uabd        v20.16b, v16.16b, v17.16b
    ld1         {v19.16b}, [x2], x3
    uabd        v21.16b, v18.16b, v19.16b
    udot        v22.4s, v20.16b, v20.16b
    udot        v22.4s, v21.16b, v21.16b
.endm

.macro SSD_DOTPROD_FUNC w h
function pixel_ssd_\w\()x\h\()_neon_dotprod, export=1
    movi        v22.4s, #0x0
.rept \h/2
    SSD_DOTPROD_\w
.endr
.if \w > 8
    addv        s0, v22.4s
.else
    addp        v0.2s, v22.2s, v22.2s
.endif
    mov         w0, v0.s[0]
    ret
endfunc
.endm
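
// SATD: SAD of the Hadamard-transformed difference block. Illustrative 4x4
// C model (hypothetical helper, not part of the build):
//
//   static int satd_4x4( const uint8_t *p1, intptr_t s1,
//                        const uint8_t *p2, intptr_t s2 )
//   {
//       int d[4][4], sum = 0;
//       for( int y = 0; y < 4; y++, p1 += s1, p2 += s2 )
//           for( int x = 0; x < 4; x++ )
//               d[y][x] = p1[x] - p2[x];
//       for( int y = 0; y < 4; y++ )
//       {   // horizontal 4-point Hadamard pass
//           int a = d[y][0] + d[y][1], b = d[y][0] - d[y][1];
//           int c = d[y][2] + d[y][3], e = d[y][2] - d[y][3];
//           d[y][0] = a + c; d[y][2] = a - c;
//           d[y][1] = b + e; d[y][3] = b - e;
//       }
//       for( int x = 0; x < 4; x++ )
//       {   // vertical pass, then absolute sum
//           int a = d[0][x] + d[1][x], b = d[0][x] - d[1][x];
//           int c = d[2][x] + d[3][x], e = d[2][x] - d[3][x];
//           sum += abs( a + c ) + abs( a - c ) + abs( b + e ) + abs( b - e );
//       }
//       return sum >> 1;
//   }
//
// The assembly packs two 4x4 transforms per 128-bit register and replaces
// the last sum/diff pair with umax of absolute values, using the identity
// max(|a+b|, |a-b|) = |a| + |b| to fold the final >>1 into the reduction.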

function pixel_satd_4x4_neon, export=1
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    usubl       v0.8h, v0.8b, v1.8b
    usubl       v1.8h, v2.8b, v3.8b
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    zip1        v0.2d, v2.2d, v3.2d
    zip2        v1.2d, v2.2d, v3.2d
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.8h, v2.8h, v3.8h
    trn2        v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.4s, v2.4s, v3.4s
    trn2        v1.4s, v2.4s, v3.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    umax        v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc

function pixel_satd_4x8_neon, export=1
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    b           satd_4x8_8x4_end_neon
endfunc

function pixel_satd_8x4_neon, export=1
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
endfunc

function satd_4x8_8x4_end_neon
    usubl       v0.8h, v0.8b, v1.8b
    usubl       v1.8h, v2.8b, v3.8b
    usubl       v2.8h, v4.8b, v5.8b
    usubl       v3.8h, v6.8b, v7.8b
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    trn1        v0.8h, v4.8h, v5.8h
    trn2        v1.8h, v4.8h, v5.8h
    trn1        v2.8h, v6.8h, v7.8h
    trn2        v3.8h, v6.8h, v7.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    trn1        v0.4s, v16.4s, v18.4s
    trn2        v1.4s, v16.4s, v18.4s
    trn1        v2.4s, v17.4s, v19.4s
    trn2        v3.4s, v17.4s, v19.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    umax        v0.8h, v0.8h, v1.8h
    umax        v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc

function pixel_satd_4x16_neon, export=1
    mov         x4, x30
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    usubl       v17.8h, v2.8b, v3.8b
    usubl       v18.8h, v4.8b, v5.8b
    usubl       v19.8h, v6.8b, v7.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    usubl       v20.8h, v0.8b, v1.8b
    usubl       v21.8h, v2.8b, v3.8b
    usubl       v22.8h, v4.8b, v5.8b
    usubl       v23.8h, v6.8b, v7.8b
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    bl          satd_8x4v_8x8h_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

.macro load_diff_fly_8x8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    usubl       v17.8h, v2.8b, v3.8b
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
    usubl       v18.8h, v4.8b, v5.8b
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    usubl       v19.8h, v6.8b, v7.8b
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    usubl       v20.8h, v0.8b, v1.8b
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    usubl       v21.8h, v2.8b, v3.8b
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    usubl       v22.8h, v4.8b, v5.8b
    usubl       v23.8h, v6.8b, v7.8b
.endm

function pixel_satd_8x8_neon, export=1
    mov         x4, x30
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_8x16_neon, export=1
    mov         x4, x30
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v0.8h, v1.8h
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v31.8h, v0.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_8x8_neon
    load_diff_fly_8x8
endfunc

// one vertical hadamard pass and two horizontal
function satd_8x4v_8x8h_neon
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    abs         v4.8h, v4.8h
    abs         v5.8h, v5.8h
    abs         v6.8h, v6.8h
    abs         v7.8h, v7.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    ret
endfunc
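
// pixel_ssd_nv12_core: SSD over the interleaved U/V samples of an NV12
// chroma plane, deinterleaved with ld2 and accumulated separately; the two
// 64-bit totals are stored through [x6] (U) and [x7] (V). The width in w4
// is rounded to a multiple of 16 pairs and the strides pre-adjusted so the
// row loop can process 16 pairs per iteration.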

function pixel_ssd_nv12_core_neon, export=1
    sxtw        x8, w4
    add         x8, x8, #8
    and         x8, x8, #~15
    movi        v6.2d, #0
    movi        v7.2d, #0
    sub         x1, x1, x8, lsl #1
    sub         x3, x3, x8, lsl #1
1:
    subs        w8, w4, #16
    ld2         {v0.8b,v1.8b}, [x0], #16
    ld2         {v2.8b,v3.8b}, [x2], #16
    ld2         {v24.8b,v25.8b}, [x0], #16
    ld2         {v26.8b,v27.8b}, [x2], #16
    usubl       v16.8h, v0.8b, v2.8b
    usubl       v17.8h, v1.8b, v3.8b
    smull       v20.4s, v16.4h, v16.4h
    smull       v21.4s, v17.4h, v17.4h
    usubl       v18.8h, v24.8b, v26.8b
    usubl       v19.8h, v25.8b, v27.8b
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    b.eq        3f
2:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    ld2         {v0.8b,v1.8b}, [x0], #16
    ld2         {v2.8b,v3.8b}, [x2], #16
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
    subs        w8, w8, #16
    usubl       v16.8h, v0.8b, v2.8b
    usubl       v17.8h, v1.8b, v3.8b
    smlal       v20.4s, v16.4h, v16.4h
    smlal       v21.4s, v17.4h, v17.4h
    ld2         {v24.8b,v25.8b}, [x0], #16
    ld2         {v26.8b,v27.8b}, [x2], #16
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    usubl       v18.8h, v24.8b, v26.8b
    usubl       v19.8h, v25.8b, v27.8b
    b.gt        2b
3:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
4:
    subs        w5, w5, #1
    uaddw       v6.2d, v6.2d, v20.2s
    uaddw       v7.2d, v7.2d, v21.2s
    add         x0, x0, x1
    add         x2, x2, x3
    uaddw2      v6.2d, v6.2d, v20.4s
    uaddw2      v7.2d, v7.2d, v21.4s
    b.gt        1b
    addp        v6.2d, v6.2d, v7.2d
    st1         {v6.d}[0], [x6]
    st1         {v6.d}[1], [x7]
    ret
endfunc
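
// pixel_var: packed return of sum (low 32 bits) and sum of squares (high
// 32 bits) of one block; the caller derives the variance as
// sse - (sum*sum >> log2(pixels)). Illustrative C model, not part of the
// build:
//
//   static uint64_t var_wxh( const uint8_t *pix, intptr_t stride,
//                            int w, int h )
//   {
//       uint32_t sum = 0, sqr = 0;
//       for( int y = 0; y < h; y++, pix += stride )
//           for( int x = 0; x < w; x++ )
//           {
//               sum += pix[x];
//               sqr += pix[x] * pix[x];
//           }
//       return sum | ((uint64_t)sqr << 32);
//   }
//
// pixel_var2 works on the fenc/fdec difference instead: with U at fenc[x]
// and V at fenc[x+8] (strides 16 and 32), it stores each plane's SSD to
// the int pair at x2 and returns var_u + var_v.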

.macro pixel_var_8 h
function pixel_var_8x\h\()_neon, export=1
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x0], x1
    mov         x2, \h - 4
    umull       v1.8h, v16.8b, v16.8b
    uxtl        v0.8h, v16.8b
    umull       v2.8h, v17.8b, v17.8b
    uaddw       v0.8h, v0.8h, v17.8b
    ld1         {v18.8b}, [x0], x1
    uaddlp      v1.4s, v1.8h
    uaddlp      v2.4s, v2.8h
    ld1         {v19.8b}, [x0], x1
1:
    subs        x2, x2, #4
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v24.8h, v18.8b, v18.8b
    ld1         {v20.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v19.8b
    umull       v25.8h, v19.8b, v19.8b
    uadalp      v1.4s, v24.8h
    ld1         {v21.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v20.8b
    umull       v26.8h, v20.8b, v20.8b
    uadalp      v2.4s, v25.8h
    ld1         {v18.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v21.8b
    umull       v27.8h, v21.8b, v21.8b
    uadalp      v1.4s, v26.8h
    ld1         {v19.8b}, [x0], x1
    uadalp      v2.4s, v27.8h
    b.gt        1b
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v28.8h, v18.8b, v18.8b
    uaddw       v0.8h, v0.8h, v19.8b
    umull       v29.8h, v19.8b, v19.8b
    uadalp      v1.4s, v28.8h
    uadalp      v2.4s, v29.8h
    b           var_end
endfunc
.endm

function pixel_var_16x16_neon, export=1
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x0], x1
    mov         x2, #14
    umull       v1.8h, v16.8b, v16.8b
    umull2      v2.8h, v16.16b, v16.16b
    uxtl        v0.8h, v16.8b
    uaddlp      v1.4s, v1.8h
    uaddlp      v2.4s, v2.8h
    uaddw2      v0.8h, v0.8h, v16.16b
1:
    subs        x2, x2, #2
    ld1         {v18.16b}, [x0], x1
    uaddw       v0.8h, v0.8h, v17.8b
    umull       v3.8h, v17.8b, v17.8b
    uaddw2      v0.8h, v0.8h, v17.16b
    umull2      v4.8h, v17.16b, v17.16b
    uadalp      v1.4s, v3.8h
    uadalp      v2.4s, v4.8h
    ld1         {v17.16b}, [x0], x1
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v5.8h, v18.8b, v18.8b
    uaddw2      v0.8h, v0.8h, v18.16b
    umull2      v6.8h, v18.16b, v18.16b
    uadalp      v1.4s, v5.8h
    uadalp      v2.4s, v6.8h
    b.gt        1b
    uaddw       v0.8h, v0.8h, v17.8b
    umull       v3.8h, v17.8b, v17.8b
    uaddw2      v0.8h, v0.8h, v17.16b
    umull2      v4.8h, v17.16b, v17.16b
    uadalp      v1.4s, v3.8h
    uadalp      v2.4s, v4.8h
endfunc

function var_end
    add         v1.4s, v1.4s, v2.4s
    uaddlv      s0, v0.8h
    uaddlv      d1, v1.4s
    mov         w0, v0.s[0]
    mov         x1, v1.d[0]
    orr         x0, x0, x1, lsl #32
    ret
endfunc

.macro pixel_var2_8 h
function pixel_var2_8x\h\()_neon, export=1
    mov         x3, #16
    ld1         {v16.8b}, [x0], #8
    ld1         {v18.8b}, [x1], x3
    ld1         {v17.8b}, [x0], #8
    ld1         {v19.8b}, [x1], x3
    mov         x5, \h - 2
    usubl       v0.8h, v16.8b, v18.8b
    usubl       v1.8h, v17.8b, v19.8b
    ld1         {v16.8b}, [x0], #8
    ld1         {v18.8b}, [x1], x3
    smull       v2.4s, v0.4h, v0.4h
    smull2      v3.4s, v0.8h, v0.8h
    smull       v4.4s, v1.4h, v1.4h
    smull2      v5.4s, v1.8h, v1.8h
    usubl       v6.8h, v16.8b, v18.8b
1:
    subs        x5, x5, #1
    ld1         {v17.8b}, [x0], #8
    ld1         {v19.8b}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    usubl       v7.8h, v17.8b, v19.8b
    add         v0.8h, v0.8h, v6.8h
    ld1         {v16.8b}, [x0], #8
    ld1         {v18.8b}, [x1], x3
    smlal       v4.4s, v7.4h, v7.4h
    smlal2      v5.4s, v7.8h, v7.8h
    usubl       v6.8h, v16.8b, v18.8b
    add         v1.8h, v1.8h, v7.8h
    b.gt        1b
    ld1         {v17.8b}, [x0], #8
    ld1         {v19.8b}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    usubl       v7.8h, v17.8b, v19.8b
    add         v0.8h, v0.8h, v6.8h
    smlal       v4.4s, v7.4h, v7.4h
    add         v1.8h, v1.8h, v7.8h
    smlal2      v5.4s, v7.8h, v7.8h
    saddlv      s0, v0.8h
    saddlv      s1, v1.8h
    add         v2.4s, v2.4s, v3.4s
    add         v4.4s, v4.4s, v5.4s
    mov         w0, v0.s[0]
    mov         w1, v1.s[0]
    addv        s2, v2.4s
    addv        s4, v4.4s
    mul         w0, w0, w0
    mul         w1, w1, w1
    mov         w3, v2.s[0]
    mov         w4, v4.s[0]
    sub         w0, w3, w0, lsr #6 + (\h >> 4)
    sub         w1, w4, w1, lsr #6 + (\h >> 4)
    str         w3, [x2]
    add         w0, w0, w1
    str         w4, [x2, #4]
    ret
endfunc
.endm
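
// 16-wide SATD is built on satd_16x4_neon, which runs two 8-wide 8x4
// transforms side by side and returns four vectors of partial sums in
// v0-v3; the 16x8/16x16 wrappers accumulate those across calls before one
// final widening reduction.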

function pixel_satd_16x8_neon, export=1
    mov         x4, x30
    bl          satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_16x16_neon, export=1
    mov         x4, x30
    bl          satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_16x4_neon
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    usubl2      v20.8h, v0.16b, v1.16b
    ld1         {v5.16b}, [x2], x3
    ld1         {v4.16b}, [x0], x1
    usubl       v17.8h, v2.8b, v3.8b
    usubl2      v21.8h, v2.16b, v3.16b
    ld1         {v7.16b}, [x2], x3
    ld1         {v6.16b}, [x0], x1
    usubl       v18.8h, v4.8b, v5.8b
    usubl2      v22.8h, v4.16b, v5.16b
    usubl       v19.8h, v6.8b, v7.8b
    usubl2      v23.8h, v6.16b, v7.16b
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    b           satd_8x4v_8x8h_neon
endfunc
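
// SA8D: like SATD but with a full 8x8 Hadamard transform, which tracks an
// 8x8 DCT more closely; the result is rounded as (sum + 1) >> 1. Note that
// the "bl pixel_sa8d_8x8_neon" in the exported wrapper resolves to the
// local helper defined by the sa8d_satd_8x8 macro below: exported entries
// only define the EXTERN_ASM-prefixed symbol (see asm.S), so the names do
// not collide. With \satd set, the macro additionally keeps the two
// 4x4-pass sums in v26/v27 so pixel_sa8d_satd_16x16 can return both
// metrics from a single pass over the pixels.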

function pixel_sa8d_8x8_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

function pixel_sa8d_16x16_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #8
    add         x2, x2, #8
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    add         v0.4s, v30.4s, v31.4s
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

.macro sa8d_satd_8x8 satd=
function pixel_sa8d_\satd\()8x8_neon
    load_diff_fly_8x8
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v24.8h, v25.8h, v0.8h, v1.8h
    SUMSUB_AB   v26.8h, v27.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h, v1.8h, v4.8h, v5.8h
    SUMSUB_AB   v2.8h, v3.8h, v6.8h, v7.8h
    transpose   v4.4s, v6.4s, v24.4s, v26.4s
    transpose   v5.4s, v7.4s, v25.4s, v27.4s
    transpose   v24.4s, v26.4s, v0.4s, v2.4s
    transpose   v25.4s, v27.4s, v1.4s, v3.4s
    abs         v0.8h, v4.8h
    abs         v1.8h, v5.8h
    abs         v2.8h, v6.8h
    abs         v3.8h, v7.8h
    abs         v4.8h, v24.8h
    abs         v5.8h, v25.8h
    abs         v6.8h, v26.8h
    abs         v7.8h, v27.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    add         v26.8h, v0.8h, v1.8h
    add         v27.8h, v2.8h, v3.8h
.endif
    SUMSUB_AB   v0.8h, v16.8h, v16.8h, v20.8h
    SUMSUB_AB   v1.8h, v17.8h, v17.8h, v21.8h
    SUMSUB_AB   v2.8h, v18.8h, v18.8h, v22.8h
    SUMSUB_AB   v3.8h, v19.8h, v19.8h, v23.8h
    transpose   v20.8h, v21.8h, v16.8h, v17.8h
    transpose   v4.8h, v5.8h, v0.8h, v1.8h
    transpose   v22.8h, v23.8h, v18.8h, v19.8h
    transpose   v6.8h, v7.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v20.8h, v21.8h
    SUMSUB_AB   v24.8h, v25.8h, v4.8h, v5.8h
    SUMSUB_AB   v0.8h, v1.8h, v22.8h, v23.8h
    SUMSUB_AB   v4.8h, v5.8h, v6.8h, v7.8h
    transpose   v20.4s, v22.4s, v2.4s, v0.4s
    transpose   v21.4s, v23.4s, v3.4s, v1.4s
    transpose   v16.4s, v18.4s, v24.4s, v4.4s
    transpose   v17.4s, v19.4s, v25.4s, v5.4s
    SUMSUB_AB   v0.8h, v2.8h, v20.8h, v22.8h
    SUMSUB_AB   v1.8h, v3.8h, v21.8h, v23.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    transpose   v16.2d, v20.2d, v0.2d, v4.2d
    transpose   v17.2d, v21.2d, v1.2d, v5.2d
    transpose   v18.2d, v22.2d, v2.2d, v6.2d
    transpose   v19.2d, v23.2d, v3.2d, v7.2d
    abs         v16.8h, v16.8h
    abs         v20.8h, v20.8h
    abs         v17.8h, v17.8h
    abs         v21.8h, v21.8h
    abs         v18.8h, v18.8h
    abs         v22.8h, v22.8h
    abs         v19.8h, v19.8h
    abs         v23.8h, v23.8h
    umax        v16.8h, v16.8h, v20.8h
    umax        v17.8h, v17.8h, v21.8h
    umax        v18.8h, v18.8h, v22.8h
    umax        v19.8h, v19.8h, v23.8h
    add         v0.8h, v16.8h, v17.8h
    add         v1.8h, v18.8h, v19.8h
    ret
endfunc
.endm

function pixel_sa8d_satd_16x16_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_satd_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    uaddlp      v28.4s, v26.8h
    uaddlp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #8
    add         x2, x2, #8
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    add         v0.4s, v30.4s, v31.4s   // sa8d
    add         v1.4s, v28.4s, v29.4s   // satd
    addv        s0, v0.4s
    addv        s1, v1.4s
    urshr       v0.4s, v0.4s, #1
    fmov        w0, s0
    fmov        w1, s1
    add         x0, x0, x1, lsl #32
    ret         x4
endfunc
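
// hadamard_ac: sums of AC coefficients of the 4x4 and 8x8 Hadamard
// transforms of the source block (no reference block), with the DC terms
// masked off via mask_ac_4_8 from pixel-a-common.S. The packed return is
// (ac_4x4 >> 1) | ((ac_8x8 >> 2) << 32).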

.macro HADAMARD_AC w h
function pixel_hadamard_ac_\w\()x\h\()_neon, export=1
    movrel      x5, mask_ac_4_8
    mov         x4, x30
    ld1         {v30.8h,v31.8h}, [x5]
    movi        v28.16b, #0
    movi        v29.16b, #0
    bl          hadamard_ac_8x8_neon
.if \h > 8
    bl          hadamard_ac_8x8_neon
.endif
.if \w > 8
    sub         x0, x0, x1, lsl #3
    add         x0, x0, #8
    bl          hadamard_ac_8x8_neon
.endif
.if \w * \h == 256
    sub         x0, x0, x1, lsl #4
    bl          hadamard_ac_8x8_neon
.endif
    addv        s1, v29.4s
    addv        s0, v28.4s
    mov         w1, v1.s[0]
    mov         w0, v0.s[0]
    lsr         w1, w1, #2
    lsr         w0, w0, #1
    orr         x0, x0, x1, lsl #32
    ret         x4
endfunc
.endm

// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
function hadamard_ac_8x8_neon
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x0], x1
    ld1         {v18.8b}, [x0], x1
    ld1         {v19.8b}, [x0], x1
    SUMSUBL_AB  v0.8h, v1.8h, v16.8b, v17.8b
    ld1         {v20.8b}, [x0], x1
    ld1         {v21.8b}, [x0], x1
    SUMSUBL_AB  v2.8h, v3.8h, v18.8b, v19.8b
    ld1         {v22.8b}, [x0], x1
    ld1         {v23.8b}, [x0], x1
    SUMSUBL_AB  v4.8h, v5.8h, v20.8b, v21.8b
    SUMSUBL_AB  v6.8h, v7.8h, v22.8b, v23.8b
    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    abs         v0.8h, v16.8h
    abs         v4.8h, v20.8h
    abs         v1.8h, v17.8h
    abs         v5.8h, v21.8h
    abs         v2.8h, v18.8h
    abs         v6.8h, v22.8h
    abs         v3.8h, v19.8h
    abs         v7.8h, v23.8h
    add         v0.8h, v0.8h, v4.8h
    add         v1.8h, v1.8h, v5.8h
    and         v0.16b, v0.16b, v30.16b
    add         v2.8h, v2.8h, v6.8h
    add         v3.8h, v3.8h, v7.8h
    add         v0.8h, v0.8h, v2.8h
    add         v1.8h, v1.8h, v3.8h
    uadalp      v28.4s, v0.8h
    uadalp      v28.4s, v1.8h
    SUMSUB_AB   v6.8h, v7.8h, v23.8h, v19.8h
    SUMSUB_AB   v4.8h, v5.8h, v22.8h, v18.8h
    SUMSUB_AB   v2.8h, v3.8h, v21.8h, v17.8h
    SUMSUB_AB   v1.8h, v0.8h, v16.8h, v20.8h
    transpose   v16.2d, v17.2d, v6.2d, v7.2d
    transpose   v18.2d, v19.2d, v4.2d, v5.2d
    transpose   v20.2d, v21.2d, v2.2d, v3.2d
    abs         v16.8h, v16.8h
    abs         v17.8h, v17.8h
    abs         v18.8h, v18.8h
    abs         v19.8h, v19.8h
    abs         v20.8h, v20.8h
    abs         v21.8h, v21.8h
    transpose   v7.2d, v6.2d, v1.2d, v0.2d
    umax        v3.8h, v16.8h, v17.8h
    umax        v2.8h, v18.8h, v19.8h
    umax        v1.8h, v20.8h, v21.8h
    SUMSUB_AB   v4.8h, v5.8h, v7.8h, v6.8h
    add         v2.8h, v2.8h, v3.8h
    add         v2.8h, v2.8h, v1.8h
    and         v4.16b, v4.16b, v31.16b
    add         v2.8h, v2.8h, v2.8h
    abs         v5.8h, v5.8h
    abs         v4.8h, v4.8h
    add         v2.8h, v2.8h, v5.8h
    add         v2.8h, v2.8h, v4.8h
    uadalp      v29.4s, v2.8h
    ret
endfunc
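
// SSIM: pixel_ssim_4x4x2_core gathers, for two adjacent 4x4 blocks, the
// statistics s1 = sum(a), s2 = sum(b), ss = sum(a*a + b*b), s12 = sum(a*b),
// stored interleaved with st4. pixel_ssim_end4 then turns up to four such
// sums into SSIM terms; it is essentially a vectorization of the scalar
// reference (C, illustrative, 8-bit constants):
//
//   static float ssim_end1( int s1, int s2, int ss, int s12 )
//   {
//       static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
//       static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
//       int vars = ss*64 - s1*s1 - s2*s2;
//       int covar = s12*64 - s1*s2;
//       return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
//            / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
//   }
//
// The shl #6/#7 below are the *64 for vars and *128 for covar*2.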

function pixel_ssim_4x4x2_core_neon, export=1
    ld1         {v0.8b}, [x0], x1
    ld1         {v2.8b}, [x2], x3
    umull       v16.8h, v0.8b, v0.8b
    umull       v17.8h, v0.8b, v2.8b
    umull       v18.8h, v2.8b, v2.8b
    ld1         {v28.8b}, [x0], x1
    ld1         {v29.8b}, [x2], x3
    umull       v20.8h, v28.8b, v28.8b
    umull       v21.8h, v28.8b, v29.8b
    umull       v22.8h, v29.8b, v29.8b
    uaddlp      v16.4s, v16.8h
    uaddlp      v17.4s, v17.8h
    uaddl       v0.8h, v0.8b, v28.8b
    uadalp      v16.4s, v18.8h
    uaddl       v1.8h, v2.8b, v29.8b
    ld1         {v26.8b}, [x0], x1
    ld1         {v27.8b}, [x2], x3
    umull       v23.8h, v26.8b, v26.8b
    umull       v24.8h, v26.8b, v27.8b
    umull       v25.8h, v27.8b, v27.8b
    uadalp      v16.4s, v20.8h
    uaddw       v0.8h, v0.8h, v26.8b
    uadalp      v17.4s, v21.8h
    uaddw       v1.8h, v1.8h, v27.8b
    uadalp      v16.4s, v22.8h
    ld1         {v28.8b}, [x0], x1
    ld1         {v29.8b}, [x2], x3
    umull       v20.8h, v28.8b, v28.8b
    umull       v21.8h, v28.8b, v29.8b
    umull       v22.8h, v29.8b, v29.8b
    uadalp      v16.4s, v23.8h
    uaddw       v0.8h, v0.8h, v28.8b
    uadalp      v17.4s, v24.8h
    uaddw       v1.8h, v1.8h, v29.8b
    uadalp      v16.4s, v25.8h
    uadalp      v16.4s, v20.8h
    uadalp      v17.4s, v21.8h
    uadalp      v16.4s, v22.8h
    uaddlp      v0.4s, v0.8h
    uaddlp      v1.4s, v1.8h
    addp        v0.4s, v0.4s, v0.4s
    addp        v1.4s, v1.4s, v1.4s
    addp        v2.4s, v16.4s, v16.4s
    addp        v3.4s, v17.4s, v17.4s
    st4         {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
    ret
endfunc

function pixel_ssim_end4_neon, export=1
    mov         x5, #4
    ld1         {v16.4s,v17.4s}, [x0], #32
    ld1         {v18.4s,v19.4s}, [x1], #32
    mov         w4, #0x99bb
    subs        x2, x5, w2, uxtw
    mov         w3, #416                // ssim_c1 = .01*.01*255*255*64
    movk        w4, #0x03, lsl #16      // ssim_c2 = .03*.03*255*255*64*63
    add         v0.4s, v16.4s, v18.4s
    add         v1.4s, v17.4s, v19.4s
    add         v0.4s, v0.4s, v1.4s
    ld1         {v20.4s,v21.4s}, [x0], #32
    ld1         {v22.4s,v23.4s}, [x1], #32
    add         v2.4s, v20.4s, v22.4s
    add         v3.4s, v21.4s, v23.4s
    add         v1.4s, v1.4s, v2.4s
    ld1         {v16.4s}, [x0], #16
    ld1         {v18.4s}, [x1], #16
    add         v16.4s, v16.4s, v18.4s
    add         v2.4s, v2.4s, v3.4s
    add         v3.4s, v3.4s, v16.4s
    dup         v30.4s, w3
    dup         v31.4s, w4
    transpose   v4.4s, v5.4s, v0.4s, v1.4s
    transpose   v6.4s, v7.4s, v2.4s, v3.4s
    transpose   v0.2d, v2.2d, v4.2d, v6.2d
    transpose   v1.2d, v3.2d, v5.2d, v7.2d
    mul         v16.4s, v0.4s, v1.4s    // s1*s2
    mul         v0.4s, v0.4s, v0.4s
    mla         v0.4s, v1.4s, v1.4s     // s1*s1 + s2*s2
    shl         v3.4s, v3.4s, #7
    shl         v2.4s, v2.4s, #6
    add         v1.4s, v16.4s, v16.4s
    sub         v2.4s, v2.4s, v0.4s     // vars
    sub         v3.4s, v3.4s, v1.4s     // covar*2
    add         v0.4s, v0.4s, v30.4s
    add         v2.4s, v2.4s, v31.4s
    add         v1.4s, v1.4s, v30.4s
    add         v3.4s, v3.4s, v31.4s
    scvtf       v0.4s, v0.4s
    scvtf       v2.4s, v2.4s
    scvtf       v1.4s, v1.4s
    scvtf       v3.4s, v3.4s
    fmul        v0.4s, v0.4s, v2.4s
    fmul        v1.4s, v1.4s, v3.4s
    fdiv        v0.4s, v1.4s, v0.4s
    b.eq        1f
    movrel      x3, mask
    add         x3, x3, x2, lsl #2
    ld1         {v29.4s}, [x3]
    and         v0.16b, v0.16b, v29.16b
1:
    faddp       v0.4s, v0.4s, v0.4s
    faddp       s0, v0.2s
    ret
endfunc
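
// Everything below mirrors the 8-bit implementations for BIT_DEPTH > 8
// (10-bit in practice): pixels are 16 bits wide, so the element strides
// are doubled to byte strides on entry (lsl #1), loads use .8h/.4h
// arrangements, differences fit signed 16-bit lanes directly (plain sub
// instead of the widening usubl), and most accumulators move to 32-bit
// lanes earlier to absorb the extra 2 bits of range.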

#else /* BIT_DEPTH == 8 */

.macro SAD_START_4
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    uabdl       v16.4s, v0.4h, v1.4h
    uabdl2      v18.4s, v0.8h, v1.8h
.endm

.macro SAD_4
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    uabal       v16.4s, v0.4h, v1.4h
    uabal2      v18.4s, v0.8h, v1.8h
.endm

.macro SAD_START_8
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    uabdl       v16.4s, v0.4h, v1.4h
    uabdl2      v17.4s, v0.8h, v1.8h
    uabdl       v18.4s, v2.4h, v3.4h
    uabdl2      v19.4s, v2.8h, v3.8h
.endm

.macro SAD_8
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    uabal       v16.4s, v0.4h, v1.4h
    uabal2      v17.4s, v0.8h, v1.8h
    uabal       v18.4s, v2.4h, v3.4h
    uabal2      v19.4s, v2.8h, v3.8h
.endm

.macro SAD_START_16
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    uabdl       v16.4s, v0.4h, v2.4h
    uabdl2      v17.4s, v0.8h, v2.8h
    uabdl       v20.4s, v1.4h, v3.4h
    uabdl2      v21.4s, v1.8h, v3.8h
    uabdl       v18.4s, v4.4h, v6.4h
    uabdl2      v19.4s, v4.8h, v6.8h
    uabdl       v22.4s, v5.4h, v7.4h
    uabdl2      v23.4s, v5.8h, v7.8h
.endm

.macro SAD_16
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    uabal       v16.4s, v0.4h, v2.4h
    uabal2      v17.4s, v0.8h, v2.8h
    uabal       v20.4s, v1.4h, v3.4h
    uabal2      v21.4s, v1.8h, v3.8h
    uabal       v18.4s, v4.4h, v6.4h
    uabal2      v19.4s, v4.8h, v6.8h
    uabal       v22.4s, v5.4h, v7.4h
    uabal2      v23.4s, v5.8h, v7.8h
.endm

.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w
.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 8
    add         v20.4s, v20.4s, v21.4s
    add         v16.4s, v16.4s, v20.4s
    add         v22.4s, v22.4s, v23.4s
    add         v18.4s, v18.4s, v22.4s
.endif
.if \w > 4
    add         v16.4s, v16.4s, v17.4s
    add         v18.4s, v18.4s, v19.4s
.endif
    add         v16.4s, v16.4s, v18.4s
    uaddlv      s0, v16.8h
    fmov        w0, s0
    ret
endfunc
.endm
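
// Note on the final reduction: for the block sizes instantiated here each
// 32-bit lane of v16 holds at most 64 * 1023 < 2^16 at 10-bit depth, so
// the high halfword of every lane is zero and summing the register as
// eight unsigned halfwords (uaddlv s0, v16.8h) gives the exact total. The
// same headroom argument lets the SAD_X macros below accumulate whole rows
// in 16-bit lanes with uaba before a single widening reduction at the end.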

.macro SAD_X_4 x, first=uaba
    ld1         {v0.d}[0], [x0], x7
    ld1         {v1.d}[0], [x1], x5
    ld1         {v0.d}[1], [x0], x7
    ld1         {v1.d}[1], [x1], x5
    ld1         {v2.d}[0], [x2], x5
    ld1         {v2.d}[1], [x2], x5
    \first      v16.8h, v1.8h, v0.8h
    ld1         {v3.d}[0], [x3], x5
    ld1         {v3.d}[1], [x3], x5
    \first      v17.8h, v2.8h, v0.8h
.if \x == 4
    ld1         {v4.d}[0], [x4], x5
    ld1         {v4.d}[1], [x4], x5
.endif
    \first      v18.8h, v3.8h, v0.8h
.if \x == 4
    \first      v19.8h, v4.8h, v0.8h
.endif
.endm

.macro SAD_X_8 x, first=uaba
    ld1         {v0.8h}, [x0], x7
    ld1         {v1.8h}, [x1], x5
    \first      v16.8h, v1.8h, v0.8h
    ld1         {v2.8h}, [x2], x5
    ld1         {v3.8h}, [x3], x5
    \first      v17.8h, v2.8h, v0.8h
    ld1         {v5.8h}, [x0], x7
    ld1         {v1.8h}, [x1], x5
    \first      v18.8h, v3.8h, v0.8h
    ld1         {v2.8h}, [x2], x5
    uaba        v16.8h, v1.8h, v5.8h
    ld1         {v3.8h}, [x3], x5
    uaba        v17.8h, v2.8h, v5.8h
.if \x == 4
    ld1         {v4.8h}, [x4], x5
    ld1         {v1.8h}, [x4], x5
.endif
    uaba        v18.8h, v3.8h, v5.8h
.if \x == 4
    \first      v19.8h, v4.8h, v0.8h
    uaba        v19.8h, v1.8h, v5.8h
.endif
.endm

.macro SAD_X_16 x, first=uaba
    ld1         {v0.8h, v1.8h}, [x0], x7
    ld1         {v2.8h, v3.8h}, [x1], x5
    ld1         {v4.8h, v5.8h}, [x2], x5
    \first      v16.8h, v2.8h, v0.8h
    \first      v20.8h, v3.8h, v1.8h
    ld1         {v24.8h, v25.8h}, [x3], x5
    \first      v17.8h, v4.8h, v0.8h
    \first      v21.8h, v5.8h, v1.8h
    ld1         {v6.8h, v7.8h}, [x0], x7
    ld1         {v2.8h, v3.8h}, [x1], x5
    \first      v18.8h, v24.8h, v0.8h
    \first      v22.8h, v25.8h, v1.8h
    ld1         {v4.8h, v5.8h}, [x2], x5
    uaba        v16.8h, v2.8h, v6.8h
    uaba        v20.8h, v3.8h, v7.8h
    ld1         {v24.8h, v25.8h}, [x3], x5
    uaba        v17.8h, v4.8h, v6.8h
    uaba        v21.8h, v5.8h, v7.8h
.if \x == 4
    ld1         {v26.8h, v27.8h}, [x4], x5
    ld1         {v28.8h, v29.8h}, [x4], x5
.endif
    uaba        v18.8h, v24.8h, v6.8h
    uaba        v22.8h, v25.8h, v7.8h
.if \x == 4
    \first      v19.8h, v26.8h, v0.8h
    \first      v23.8h, v27.8h, v1.8h
    uaba        v19.8h, v28.8h, v6.8h
    uaba        v23.8h, v29.8h, v7.8h
.endif
.endm

.macro SAD_X_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE
    lsl         x5, x5, #1
    lsl         x7, x7, #1
    SAD_X_\w \x, uabd
.rept \h / 2 - 1
    SAD_X_\w \x
.endr
.if \w > 8
    add         v16.8h, v16.8h, v20.8h
    add         v17.8h, v17.8h, v21.8h
    add         v18.8h, v18.8h, v22.8h
.if \x == 4
    add         v19.8h, v19.8h, v23.8h
.endif
.endif
    // add up the sads
    uaddlv      s0, v16.8h
    uaddlv      s1, v17.8h
    uaddlv      s2, v18.8h
    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    uaddlv      s3, v19.8h
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm

function pixel_vsad_neon, export=1
    subs        w2, w2, #2
    lsl         x1, x1, #1
    ld1         {v0.8h, v1.8h}, [x0], x1
    ld1         {v2.8h, v3.8h}, [x0], x1
    uabd        v6.8h, v0.8h, v2.8h
    uabd        v7.8h, v1.8h, v3.8h
    b.le        2f
1:
    subs        w2, w2, #2
    ld1         {v0.8h, v1.8h}, [x0], x1
    uaba        v6.8h, v2.8h, v0.8h
    uaba        v7.8h, v3.8h, v1.8h
    ld1         {v2.8h, v3.8h}, [x0], x1
    b.lt        2f
    uaba        v6.8h, v0.8h, v2.8h
    uaba        v7.8h, v1.8h, v3.8h
    b.gt        1b
2:
    add         v5.8h, v6.8h, v7.8h
    uaddlv      s0, v5.8h
    fmov        w0, s0
    ret
endfunc

function pixel_asd8_neon, export=1
    sub         w4, w4, #2
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v0.8h}, [x0], x1
    ld1         {v1.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    sub         v16.8h, v0.8h, v1.8h
1:
    subs        w4, w4, #2
    ld1         {v4.8h}, [x0], x1
    ld1         {v5.8h}, [x2], x3
    sub         v17.8h, v2.8h, v3.8h
    sub         v18.8h, v4.8h, v5.8h
    add         v16.8h, v16.8h, v17.8h
    ld1         {v2.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    add         v16.8h, v16.8h, v18.8h
    b.gt        1b
    sub         v17.8h, v2.8h, v3.8h
    add         v16.8h, v16.8h, v17.8h
    saddlv      s0, v16.8h
    abs         v0.4s, v0.4s
    fmov        w0, s0
    ret
endfunc

.macro SSD_START_4
    ld1         {v16.d}[0], [x0], x1
    ld1         {v17.d}[0], [x2], x3
    sub         v2.4h, v16.4h, v17.4h
    ld1         {v16.d}[0], [x0], x1
    ld1         {v17.d}[0], [x2], x3
    smull       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_4
    sub         v2.4h, v16.4h, v17.4h
    ld1         {v16.d}[0], [x0], x1
    ld1         {v17.d}[0], [x2], x3
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_END_4
    sub         v2.4h, v16.4h, v17.4h
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_START_8
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x2], x3
    sub         v2.8h, v16.8h, v17.8h
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x2], x3
    smull       v0.4s, v2.4h, v2.4h
    smull2      v20.4s, v2.8h, v2.8h
.endm

.macro SSD_8
    sub         v2.8h, v16.8h, v17.8h
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x2], x3
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
.endm

.macro SSD_END_8
    sub         v2.8h, v16.8h, v17.8h
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
    add         v0.4s, v0.4s, v20.4s
.endm

.macro SSD_START_16
    ld1         {v16.8h, v17.8h}, [x0], x1
    ld1         {v18.8h, v19.8h}, [x2], x3
    sub         v2.8h, v16.8h, v18.8h
    sub         v3.8h, v17.8h, v19.8h
    ld1         {v16.8h, v17.8h}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    smull2      v20.4s, v2.8h, v2.8h
    ld1         {v18.8h, v19.8h}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v20.4s, v3.8h, v3.8h
.endm

.macro SSD_16
    sub         v2.8h, v16.8h, v18.8h
    sub         v3.8h, v17.8h, v19.8h
    ld1         {v16.8h, v17.8h}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
    ld1         {v18.8h, v19.8h}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v20.4s, v3.8h, v3.8h
.endm

.macro SSD_END_16
    sub         v2.8h, v16.8h, v18.8h
    sub         v3.8h, v17.8h, v19.8h
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v20.4s, v2.8h, v2.8h
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v20.4s, v3.8h, v3.8h
    add         v0.4s, v0.4s, v20.4s
.endm

.macro SSD_FUNC w h
function pixel_ssd_\w\()x\h\()_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    SSD_START_\w
.rept \h-2
    SSD_\w
.endr
    SSD_END_\w
    addv        s0, v0.4s
    fmov        w0, s0
    ret
endfunc
.endm
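
// High-bit-depth SATD: 16-bit pixels already fit the transform's halfword
// lanes, so the load-and-diff steps use a plain sub on .8h vectors instead
// of the widening usubl of the 8-bit path; the Hadamard passes and the
// max(|a+b|, |a-b|) = |a| + |b| reduction are otherwise identical.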

function pixel_satd_4x4_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    sub         v0.8h, v0.8h, v1.8h
    sub         v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    zip1        v0.2d, v2.2d, v3.2d
    zip2        v1.2d, v2.2d, v3.2d
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.8h, v2.8h, v3.8h
    trn2        v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h
    trn1        v0.4s, v2.4s, v3.4s
    trn2        v1.4s, v2.4s, v3.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    umax        v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    fmov        w0, s0
    ret
endfunc

function pixel_satd_4x8_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v5.d}[0], [x2], x3
    ld1         {v4.d}[0], [x0], x1
    ld1         {v7.d}[0], [x2], x3
    ld1         {v6.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    ld1         {v5.d}[1], [x2], x3
    ld1         {v4.d}[1], [x0], x1
    ld1         {v7.d}[1], [x2], x3
    ld1         {v6.d}[1], [x0], x1
    b           satd_4x8_8x4_end_neon
endfunc

function pixel_satd_8x4_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    ld1         {v5.8h}, [x2], x3
    ld1         {v4.8h}, [x0], x1
    ld1         {v7.8h}, [x2], x3
    ld1         {v6.8h}, [x0], x1
endfunc

function satd_4x8_8x4_end_neon
    sub         v0.8h, v0.8h, v1.8h
    sub         v1.8h, v2.8h, v3.8h
    sub         v2.8h, v4.8h, v5.8h
    sub         v3.8h, v6.8h, v7.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    trn1        v0.8h, v4.8h, v5.8h
    trn2        v1.8h, v4.8h, v5.8h
    trn1        v2.8h, v6.8h, v7.8h
    trn2        v3.8h, v6.8h, v7.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    trn1        v0.4s, v16.4s, v18.4s
    trn2        v1.4s, v16.4s, v18.4s
    trn1        v2.4s, v17.4s, v19.4s
    trn2        v3.4s, v17.4s, v19.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    umax        v0.8h, v0.8h, v1.8h
    umax        v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc

function pixel_satd_4x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v5.d}[0], [x2], x3
    ld1         {v4.d}[0], [x0], x1
    ld1         {v7.d}[0], [x2], x3
    ld1         {v6.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    ld1         {v5.d}[1], [x2], x3
    ld1         {v4.d}[1], [x0], x1
    ld1         {v7.d}[1], [x2], x3
    ld1         {v6.d}[1], [x0], x1
    sub         v16.8h, v0.8h, v1.8h
    sub         v17.8h, v2.8h, v3.8h
    sub         v18.8h, v4.8h, v5.8h
    sub         v19.8h, v6.8h, v7.8h
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v3.d}[0], [x2], x3
    ld1         {v2.d}[0], [x0], x1
    ld1         {v5.d}[0], [x2], x3
    ld1         {v4.d}[0], [x0], x1
    ld1         {v7.d}[0], [x2], x3
    ld1         {v6.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v3.d}[1], [x2], x3
    ld1         {v2.d}[1], [x0], x1
    ld1         {v5.d}[1], [x2], x3
    ld1         {v4.d}[1], [x0], x1
    ld1         {v7.d}[1], [x2], x3
    ld1         {v6.d}[1], [x0], x1
    sub         v20.8h, v0.8h, v1.8h
    sub         v21.8h, v2.8h, v3.8h
    sub         v22.8h, v4.8h, v5.8h
    sub         v23.8h, v6.8h, v7.8h
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    bl          satd_8x4v_8x8h_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    fmov        w0, s0
    ret         x4
endfunc

.macro load_diff_fly_8x8
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    sub         v16.8h, v0.8h, v1.8h
    ld1         {v5.8h}, [x2], x3
    ld1         {v4.8h}, [x0], x1
    sub         v17.8h, v2.8h, v3.8h
    ld1         {v7.8h}, [x2], x3
    ld1         {v6.8h}, [x0], x1
    sub         v18.8h, v4.8h, v5.8h
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    sub         v19.8h, v6.8h, v7.8h
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    sub         v20.8h, v0.8h, v1.8h
    ld1         {v5.8h}, [x2], x3
    ld1         {v4.8h}, [x0], x1
    sub         v21.8h, v2.8h, v3.8h
    ld1         {v7.8h}, [x2], x3
    ld1         {v6.8h}, [x0], x1
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    sub         v22.8h, v4.8h, v5.8h
    sub         v23.8h, v6.8h, v7.8h
.endm

function pixel_satd_8x8_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_8x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v0.8h, v1.8h
    bl          satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v31.8h, v0.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_8x8_neon
    load_diff_fly_8x8
endfunc

// one vertical hadamard pass and two horizontal
function satd_8x4v_8x8h_neon
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    abs         v4.8h, v4.8h
    abs         v5.8h, v5.8h
    abs         v6.8h, v6.8h
    abs         v7.8h, v7.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    ret
endfunc

function pixel_ssd_nv12_core_neon, export=1
    sxtw        x8, w4
    add         x8, x8, #8
    and         x8, x8, #~15
    movi        v6.2d, #0
    movi        v7.2d, #0
    sub         x1, x1, x8, lsl #1
    sub         x3, x3, x8, lsl #1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    lsl         x4, x4, #1
1:
    subs        w8, w4, #32
    ld2         {v0.8h, v1.8h}, [x0], #32
    ld2         {v2.8h, v3.8h}, [x2], #32
    ld2         {v24.8h, v25.8h}, [x0], #32
    ld2         {v26.8h, v27.8h}, [x2], #32
    sub         v16.8h, v0.8h, v2.8h
    sub         v17.8h, v1.8h, v3.8h
    smull       v20.4s, v16.4h, v16.4h
    smull       v21.4s, v17.4h, v17.4h
    sub         v18.8h, v24.8h, v26.8h
    sub         v19.8h, v25.8h, v27.8h
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    b.eq        3f
2:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    ld2         {v0.8h, v1.8h}, [x0], #32
    ld2         {v2.8h, v3.8h}, [x2], #32
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
    subs        w8, w8, #32
    sub         v16.8h, v0.8h, v2.8h
    sub         v17.8h, v1.8h, v3.8h
    smlal       v20.4s, v16.4h, v16.4h
    smlal       v21.4s, v17.4h, v17.4h
    ld2         {v24.8h,v25.8h}, [x0], #32
    ld2         {v26.8h,v27.8h}, [x2], #32
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f
    sub         v18.8h, v24.8h, v26.8h
    sub         v19.8h, v25.8h, v27.8h
    b.gt        2b
3:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
4:
    subs        w5, w5, #1
    uaddw       v6.2d, v6.2d, v20.2s
    uaddw       v7.2d, v7.2d, v21.2s
    add         x0, x0, x1
    add         x2, x2, x3
    uaddw2      v6.2d, v6.2d, v20.4s
    uaddw2      v7.2d, v7.2d, v21.4s
    b.gt        1b
    addp        v6.2d, v6.2d, v7.2d
    st1         {v6.d}[0], [x6]
    st1         {v6.d}[1], [x7]
    ret
endfunc
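
// pixel_var/var2, high-bit-depth versions. The mean correction in var2 is
// sum^2 / (w*h): for the 8x8 pair that is a shift by 6, for 8x16 by 7,
// which is exactly what "lsr #6 + (\h >> 4)" evaluates to at assembly time
// (\h is 8 or 16).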

.macro pixel_var_8 h
function pixel_var_8x\h\()_neon, export=1
    lsl         x1, x1, #1
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x0], x1
    mov         x2, \h - 4
    umull       v1.4s, v16.4h, v16.4h
    umull2      v30.4s, v16.8h, v16.8h
    mov         v0.16b, v16.16b
    umull       v2.4s, v17.4h, v17.4h
    umull2      v31.4s, v17.8h, v17.8h
    add         v0.8h, v0.8h, v17.8h
    ld1         {v18.8h}, [x0], x1
    ld1         {v19.8h}, [x0], x1
1:
    subs        x2, x2, #4
    add         v0.8h, v0.8h, v18.8h
    umull       v24.4s, v18.4h, v18.4h
    umull2      v25.4s, v18.8h, v18.8h
    ld1         {v20.8h}, [x0], x1
    add         v0.8h, v0.8h, v19.8h
    umull       v26.4s, v19.4h, v19.4h
    umull2      v27.4s, v19.8h, v19.8h
    add         v1.4s, v1.4s, v24.4s
    add         v30.4s, v30.4s, v25.4s
    ld1         {v21.8h}, [x0], x1
    add         v0.8h, v0.8h, v20.8h
    umull       v28.4s, v20.4h, v20.4h
    umull2      v29.4s, v20.8h, v20.8h
    add         v2.4s, v2.4s, v26.4s
    add         v31.4s, v31.4s, v27.4s
    ld1         {v18.8h}, [x0], x1
    add         v0.8h, v0.8h, v21.8h
    umull       v3.4s, v21.4h, v21.4h
    umull2      v4.4s, v21.8h, v21.8h
    add         v1.4s, v1.4s, v28.4s
    add         v30.4s, v30.4s, v29.4s
    ld1         {v19.8h}, [x0], x1
    add         v2.4s, v2.4s, v3.4s
    add         v31.4s, v31.4s, v4.4s
    b.gt        1b
    add         v0.8h, v0.8h, v18.8h
    umull       v24.4s, v18.4h, v18.4h
    umull2      v25.4s, v18.8h, v18.8h
    add         v0.8h, v0.8h, v19.8h
    umull       v26.4s, v19.4h, v19.4h
    umull2      v27.4s, v19.8h, v19.8h
    add         v1.4s, v1.4s, v24.4s
    add         v30.4s, v30.4s, v25.4s
    add         v2.4s, v2.4s, v26.4s
    add         v31.4s, v31.4s, v27.4s
    b           var_end
endfunc
.endm

function pixel_var_16x16_neon, export=1
    lsl         x1, x1, #1
    ld1         {v16.8h, v17.8h}, [x0], x1
    ld1         {v18.8h, v19.8h}, [x0], x1
    mov         x2, #14
    umull       v1.4s, v16.4h, v16.4h
    umull2      v30.4s, v16.8h, v16.8h
    add         v0.8h, v16.8h, v17.8h
    umull       v2.4s, v17.4h, v17.4h
    umull2      v31.4s, v17.8h, v17.8h
1:
    subs        x2, x2, #2
    ld1         {v20.8h, v21.8h}, [x0], x1
    add         v0.8h, v0.8h, v18.8h
    umlal       v1.4s, v18.4h, v18.4h
    umlal2      v30.4s, v18.8h, v18.8h
    umlal       v2.4s, v19.4h, v19.4h
    umlal2      v31.4s, v19.8h, v19.8h
    add         v0.8h, v0.8h, v19.8h
    ld1         {v18.8h, v19.8h}, [x0], x1
    add         v0.8h, v0.8h, v20.8h
    umlal       v1.4s, v20.4h, v20.4h
    umlal2      v30.4s, v20.8h, v20.8h
    umlal       v2.4s, v21.4h, v21.4h
    umlal2      v31.4s, v21.8h, v21.8h
    add         v0.8h, v0.8h, v21.8h
    b.gt        1b
    add         v0.8h, v0.8h, v18.8h
    umlal       v1.4s, v18.4h, v18.4h
    umlal2      v30.4s, v18.8h, v18.8h
    umlal       v2.4s, v19.4h, v19.4h
    umlal2      v31.4s, v19.8h, v19.8h
    add         v0.8h, v0.8h, v19.8h
endfunc

function var_end
    add         v1.4s, v1.4s, v2.4s
    add         v30.4s, v30.4s, v31.4s
    add         v1.4s, v1.4s, v30.4s
    uaddlv      s0, v0.8h
    uaddlv      d1, v1.4s
    mov         w0, v0.s[0]
    mov         x1, v1.d[0]
    orr         x0, x0, x1, lsl #32
    ret
endfunc

.macro pixel_var2_8 h
function pixel_var2_8x\h\()_neon, export=1
    mov         x3, #32
    ld1         {v16.8h}, [x0], #16
    ld1         {v18.8h}, [x1], x3
    ld1         {v17.8h}, [x0], #16
    ld1         {v19.8h}, [x1], x3
    mov         x5, \h - 2
    sub         v0.8h, v16.8h, v18.8h
    sub         v1.8h, v17.8h, v19.8h
    ld1         {v16.8h}, [x0], #16
    ld1         {v18.8h}, [x1], x3
    smull       v2.4s, v0.4h, v0.4h
    smull2      v3.4s, v0.8h, v0.8h
    smull       v4.4s, v1.4h, v1.4h
    smull2      v5.4s, v1.8h, v1.8h
    sub         v6.8h, v16.8h, v18.8h
1:
    subs        x5, x5, #1
    ld1         {v17.8h}, [x0], #16
    ld1         {v19.8h}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    sub         v7.8h, v17.8h, v19.8h
    add         v0.8h, v0.8h, v6.8h
    ld1         {v16.8h}, [x0], #16
    ld1         {v18.8h}, [x1], x3
    smlal       v4.4s, v7.4h, v7.4h
    smlal2      v5.4s, v7.8h, v7.8h
    sub         v6.8h, v16.8h, v18.8h
    add         v1.8h, v1.8h, v7.8h
    b.gt        1b
    ld1         {v17.8h}, [x0], #16
    ld1         {v19.8h}, [x1], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    sub         v7.8h, v17.8h, v19.8h
    add         v0.8h, v0.8h, v6.8h
    smlal       v4.4s, v7.4h, v7.4h
    add         v1.8h, v1.8h, v7.8h
    smlal2      v5.4s, v7.8h, v7.8h
    saddlv      s0, v0.8h
    saddlv      s1, v1.8h
    add         v2.4s, v2.4s, v3.4s
    add         v4.4s, v4.4s, v5.4s
    mov         w0, v0.s[0]
    mov         w1, v1.s[0]
    addv        s2, v2.4s
    addv        s4, v4.4s
    mul         w0, w0, w0
    mul         w1, w1, w1
    mov         w3, v2.s[0]
    mov         w4, v4.s[0]
    sub         w0, w3, w0, lsr #6 + (\h >> 4)
    sub         w1, w4, w1, lsr #6 + (\h >> 4)
    str         w3, [x2]
    add         w0, w0, w1
    str         w4, [x2, #4]
    ret
endfunc
.endm

function pixel_satd_16x8_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc

function pixel_satd_16x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          satd_16x4_neon
    uaddl       v30.4s, v0.4h, v1.4h
    uaddl       v31.4s, v2.4h, v3.4h
    uaddl2      v28.4s, v0.8h, v1.8h
    uaddl2      v29.4s, v2.8h, v3.8h
    add         v30.4s, v30.4s, v28.4s
    add         v31.4s, v31.4s, v29.4s
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    uaddw       v30.4s, v30.4s, v0.4h
    uaddw2      v30.4s, v30.4s, v0.8h
    uaddw       v31.4s, v31.4s, v1.4h
    uaddw2      v31.4s, v31.4s, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    uaddw       v30.4s, v30.4s, v0.4h
    uaddw2      v30.4s, v30.4s, v0.8h
    uaddw       v31.4s, v31.4s, v1.4h
    uaddw2      v31.4s, v31.4s, v1.8h
    bl          satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    uaddw       v30.4s, v30.4s, v0.4h
    uaddw2      v30.4s, v30.4s, v0.8h
    uaddw       v31.4s, v31.4s, v1.4h
    uaddw2      v31.4s, v31.4s, v1.8h
    add         v0.4s, v30.4s, v31.4s
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    ret         x4
endfunc

function satd_16x4_neon
    ld1         {v0.8h, v1.8h}, [x2], x3
    ld1         {v2.8h, v3.8h}, [x0], x1
    sub         v16.8h, v2.8h, v0.8h
    sub         v20.8h, v3.8h, v1.8h
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x0], x1
    sub         v17.8h, v6.8h, v4.8h
    sub         v21.8h, v7.8h, v5.8h
    ld1         {v0.8h, v1.8h}, [x2], x3
    ld1         {v2.8h, v3.8h}, [x0], x1
    sub         v18.8h, v2.8h, v0.8h
    sub         v22.8h, v3.8h, v1.8h
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x0], x1
    sub         v19.8h, v6.8h, v4.8h
    sub         v23.8h, v7.8h, v5.8h
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    b           satd_8x4v_8x8h_neon
endfunc
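
// High-bit-depth SA8D: identical structure to the 8-bit version; the only
// wrinkle is that the 16x16 wrappers step 8 pixels to the right with
// "add x0, x0, #16" because each pixel is now two bytes.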

function pixel_sa8d_8x8_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          pixel_sa8d_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

function pixel_sa8d_16x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          pixel_sa8d_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #16
    add         x2, x2, #16
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    add         v0.4s, v30.4s, v31.4s
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc

.macro sa8d_satd_8x8 satd=
function pixel_sa8d_\satd\()8x8_neon
    load_diff_fly_8x8
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v24.8h, v25.8h, v0.8h, v1.8h
    SUMSUB_AB   v26.8h, v27.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h, v1.8h, v4.8h, v5.8h
    SUMSUB_AB   v2.8h, v3.8h, v6.8h, v7.8h
    transpose   v4.4s, v6.4s, v24.4s, v26.4s
    transpose   v5.4s, v7.4s, v25.4s, v27.4s
    transpose   v24.4s, v26.4s, v0.4s, v2.4s
    transpose   v25.4s, v27.4s, v1.4s, v3.4s
    abs         v0.8h, v4.8h
    abs         v1.8h, v5.8h
    abs         v2.8h, v6.8h
    abs         v3.8h, v7.8h
    abs         v4.8h, v24.8h
    abs         v5.8h, v25.8h
    abs         v6.8h, v26.8h
    abs         v7.8h, v27.8h
    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h
    add         v26.8h, v0.8h, v1.8h
    add         v27.8h, v2.8h, v3.8h
.endif
    SUMSUB_AB   v0.8h, v16.8h, v16.8h, v20.8h
    SUMSUB_AB   v1.8h, v17.8h, v17.8h, v21.8h
    SUMSUB_AB   v2.8h, v18.8h, v18.8h, v22.8h
    SUMSUB_AB   v3.8h, v19.8h, v19.8h, v23.8h
    transpose   v20.8h, v21.8h, v16.8h, v17.8h
    transpose   v4.8h, v5.8h, v0.8h, v1.8h
    transpose   v22.8h, v23.8h, v18.8h, v19.8h
    transpose   v6.8h, v7.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v20.8h, v21.8h
    SUMSUB_AB   v24.8h, v25.8h, v4.8h, v5.8h
    SUMSUB_AB   v0.8h, v1.8h, v22.8h, v23.8h
    SUMSUB_AB   v4.8h, v5.8h, v6.8h, v7.8h
    transpose   v20.4s, v22.4s, v2.4s, v0.4s
    transpose   v21.4s, v23.4s, v3.4s, v1.4s
    transpose   v16.4s, v18.4s, v24.4s, v4.4s
    transpose   v17.4s, v19.4s, v25.4s, v5.4s
    SUMSUB_AB   v0.8h, v2.8h, v20.8h, v22.8h
    SUMSUB_AB   v1.8h, v3.8h, v21.8h, v23.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h
    transpose   v16.2d, v20.2d, v0.2d, v4.2d
    transpose   v17.2d, v21.2d, v1.2d, v5.2d
    transpose   v18.2d, v22.2d, v2.2d, v6.2d
    transpose   v19.2d, v23.2d, v3.2d, v7.2d
    abs         v16.8h, v16.8h
    abs         v20.8h, v20.8h
    abs         v17.8h, v17.8h
    abs         v21.8h, v21.8h
    abs         v18.8h, v18.8h
    abs         v22.8h, v22.8h
    abs         v19.8h, v19.8h
    abs         v23.8h, v23.8h
    umax        v16.8h, v16.8h, v20.8h
    umax        v17.8h, v17.8h, v21.8h
    umax        v18.8h, v18.8h, v22.8h
    umax        v19.8h, v19.8h, v23.8h
    add         v0.8h, v16.8h, v17.8h
    add         v1.8h, v18.8h, v19.8h
    ret
endfunc
.endm

function pixel_sa8d_satd_16x16_neon, export=1
    mov         x4, x30
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    bl          pixel_sa8d_satd_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    uaddlp      v28.4s, v26.8h
    uaddlp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    sub         x0, x0, x1, lsl #4
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #16
    add         x2, x2, #16
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    add         v0.4s, v30.4s, v31.4s   // sa8d
    add         v1.4s, v28.4s, v29.4s   // satd
    addv        s0, v0.4s
    addv        s1, v1.4s
    urshr       v0.4s, v0.4s, #1
    fmov        w0, s0
    fmov        w1, s1
    add         x0, x0, x1, lsl #32
    ret         x4
endfunc

.macro HADAMARD_AC w h
function pixel_hadamard_ac_\w\()x\h\()_neon, export=1
    movrel      x5, mask_ac_4_8
    mov         x4, x30
    lsl         x1, x1, #1
    ld1         {v30.8h,v31.8h}, [x5]
    movi        v28.16b, #0
    movi        v29.16b, #0
    bl          hadamard_ac_8x8_neon
.if \h > 8
    bl          hadamard_ac_8x8_neon
.endif
.if \w > 8
    sub         x0, x0, x1, lsl #3
    add         x0, x0, #16
    bl          hadamard_ac_8x8_neon
.endif
.if \w * \h == 256
    sub         x0, x0, x1, lsl #4
    bl          hadamard_ac_8x8_neon
.endif
    addv        s1, v29.4s
    addv        s0, v28.4s
    mov         w1, v1.s[0]
    mov         w0, v0.s[0]
    lsr         w1, w1, #2
    lsr         w0, w0, #1
    orr         x0, x0, x1, lsl #32
    ret         x4
endfunc
.endm

// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
function hadamard_ac_8x8_neon
    ld1         {v16.8h}, [x0], x1
    ld1         {v17.8h}, [x0], x1
    ld1         {v18.8h}, [x0], x1
    ld1         {v19.8h}, [x0], x1
    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    ld1         {v20.8h}, [x0], x1
    ld1         {v21.8h}, [x0], x1
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
    ld1         {v22.8h}, [x0], x1
    ld1         {v23.8h}, [x0], x1
    SUMSUB_AB   v4.8h, v5.8h, v20.8h, v21.8h
    SUMSUB_AB   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h
    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    abs         v0.8h, v16.8h
    abs         v4.8h, v20.8h
    abs         v1.8h, v17.8h
    abs         v5.8h, v21.8h
    abs         v2.8h, v18.8h
    abs         v6.8h, v22.8h
    abs         v3.8h, v19.8h
    abs         v7.8h, v23.8h
    add         v0.8h, v0.8h, v4.8h
    add         v1.8h, v1.8h, v5.8h
    and         v0.16b, v0.16b, v30.16b
    add         v2.8h, v2.8h, v6.8h
    add         v3.8h, v3.8h, v7.8h
    add         v0.8h, v0.8h, v2.8h
    add         v1.8h, v1.8h, v3.8h
    uadalp      v28.4s, v0.8h
    uadalp      v28.4s, v1.8h
    SUMSUB_AB   v6.8h, v7.8h, v23.8h, v19.8h
    SUMSUB_AB   v4.8h, v5.8h, v22.8h, v18.8h
    SUMSUB_AB   v2.8h, v3.8h, v21.8h, v17.8h
    SUMSUB_AB   v1.8h, v0.8h, v16.8h, v20.8h
    transpose   v16.2d, v17.2d, v6.2d, v7.2d
    transpose   v18.2d, v19.2d, v4.2d, v5.2d
    transpose   v20.2d, v21.2d, v2.2d, v3.2d
    abs         v16.8h, v16.8h
    abs         v17.8h, v17.8h
    abs         v18.8h, v18.8h
    abs         v19.8h, v19.8h
    abs         v20.8h, v20.8h
    abs         v21.8h, v21.8h
    transpose   v7.2d, v6.2d, v1.2d, v0.2d
    umax        v3.8h, v16.8h, v17.8h
    umax        v2.8h, v18.8h, v19.8h
    umax        v1.8h, v20.8h, v21.8h
    SUMSUB_AB   v4.8h, v5.8h, v7.8h, v6.8h
    add         v2.8h, v2.8h, v3.8h
    add         v2.8h, v2.8h, v1.8h
    and         v4.16b, v4.16b, v31.16b
    add         v2.8h, v2.8h, v2.8h
    abs         v5.8h, v5.8h
    abs         v4.8h, v4.8h
    add         v2.8h, v2.8h, v5.8h
    add         v2.8h, v2.8h, v4.8h
    uadalp      v29.4s, v2.8h
    ret
endfunc

function pixel_ssim_4x4x2_core_neon, export=1
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v0.8h}, [x0], x1
    ld1         {v2.8h}, [x2], x3
    ld1         {v28.8h}, [x0], x1
    ld1         {v29.8h}, [x2], x3
    umull       v16.4s, v0.4h, v0.4h
    umull2      v17.4s, v0.8h, v0.8h
    umull       v18.4s, v0.4h, v2.4h
    umull2      v19.4s, v0.8h, v2.8h
    umlal       v16.4s, v2.4h, v2.4h
    umlal2      v17.4s, v2.8h, v2.8h
    ld1         {v26.8h}, [x0], x1
    ld1         {v27.8h}, [x2], x3
    umlal       v16.4s, v28.4h, v28.4h
    umlal2      v17.4s, v28.8h, v28.8h
    umlal       v18.4s, v28.4h, v29.4h
    umlal2      v19.4s, v28.8h, v29.8h
    umlal       v16.4s, v29.4h, v29.4h
    umlal2      v17.4s, v29.8h, v29.8h
    add         v0.8h, v0.8h, v28.8h
    add         v1.8h, v2.8h, v29.8h
    umlal       v16.4s, v26.4h, v26.4h
    umlal2      v17.4s, v26.8h, v26.8h
    umlal       v18.4s, v26.4h, v27.4h
    umlal2      v19.4s, v26.8h, v27.8h
    umlal       v16.4s, v27.4h, v27.4h
    umlal2      v17.4s, v27.8h, v27.8h
    ld1         {v28.8h}, [x0], x1
    ld1         {v29.8h}, [x2], x3
    add         v0.8h, v0.8h, v26.8h
    add         v1.8h, v1.8h, v27.8h
    umlal       v16.4s, v28.4h, v28.4h
    umlal2      v17.4s, v28.8h, v28.8h
    umlal       v18.4s, v28.4h, v29.4h
    umlal2      v19.4s, v28.8h, v29.8h
    umlal       v16.4s, v29.4h, v29.4h
    umlal2      v17.4s, v29.8h, v29.8h
    add         v0.8h, v0.8h, v28.8h
    add         v1.8h, v1.8h, v29.8h
    addp        v16.4s, v16.4s, v17.4s
    addp        v17.4s, v18.4s, v19.4s
    uaddlp      v0.4s, v0.8h
    uaddlp      v1.4s, v1.8h
    addp        v0.4s, v0.4s, v0.4s
    addp        v1.4s, v1.4s, v1.4s
    addp        v2.4s, v16.4s, v16.4s
    addp        v3.4s, v17.4s, v17.4s
    st4         {v0.2s, v1.2s, v2.2s, v3.2s}, [x4]
    ret
endfunc

function pixel_ssim_end4_neon, export=1
    mov         x5, #4
    ld1         {v16.4s, v17.4s}, [x0], #32
    ld1         {v18.4s, v19.4s}, [x1], #32
    subs        x2, x5, w2, uxtw
    // These values must be stored in float, since with 10 bit depth edge
    // cases may overflow. The hexadecimal values are the IEEE-754
    // representation of the floating point numbers.
    ldr         w3, =0x45d14e49         // ssim_c1 = .01*.01*1023*1023*64
    ldr         w4, =0x4a67ca32         // ssim_c2 = .03*.03*1023*1023*64*63
    add         v0.4s, v16.4s, v18.4s
    add         v1.4s, v17.4s, v19.4s
    add         v0.4s, v0.4s, v1.4s
    ld1         {v20.4s, v21.4s}, [x0], #32
    ld1         {v22.4s, v23.4s}, [x1], #32
    add         v2.4s, v20.4s, v22.4s
    add         v3.4s, v21.4s, v23.4s
    add         v1.4s, v1.4s, v2.4s
    ld1         {v16.4s}, [x0], #16
    ld1         {v18.4s}, [x1], #16
    add         v16.4s, v16.4s, v18.4s
    add         v2.4s, v2.4s, v3.4s
    add         v3.4s, v3.4s, v16.4s
    dup         v30.4s, w3
    dup         v31.4s, w4
    transpose   v4.4s, v5.4s, v0.4s, v1.4s
    transpose   v6.4s, v7.4s, v2.4s, v3.4s
    transpose   v0.2d, v2.2d, v4.2d, v6.2d
    transpose   v1.2d, v3.2d, v5.2d, v7.2d
    // Conversion to floating point must occur earlier than in the 8-bit
    // case because of the range overflow
    scvtf       v0.4s, v0.4s
    scvtf       v2.4s, v2.4s
    scvtf       v1.4s, v1.4s
    scvtf       v3.4s, v3.4s
    fmul        v16.4s, v0.4s, v1.4s    // s1*s2
    fmul        v0.4s, v0.4s, v0.4s
    fmla        v0.4s, v1.4s, v1.4s     // s1*s1 + s2*s2
    // IEEE-754 hexadecimal representation of multipliers
    ldr         w3, =0x42800000         // 64
    ldr         w4, =0x43000000         // 128
    dup         v28.4s, w3
    dup         v29.4s, w4
    fmul        v2.4s, v2.4s, v28.4s
    fmul        v3.4s, v3.4s, v29.4s
    fadd        v1.4s, v16.4s, v16.4s
    fsub        v2.4s, v2.4s, v0.4s     // vars
    fsub        v3.4s, v3.4s, v1.4s     // covar*2
    fadd        v0.4s, v0.4s, v30.4s
    fadd        v2.4s, v2.4s, v31.4s
    fadd        v1.4s, v1.4s, v30.4s
    fadd        v3.4s, v3.4s, v31.4s
    fmul        v0.4s, v0.4s, v2.4s
    fmul        v1.4s, v1.4s, v3.4s
    fdiv        v0.4s, v1.4s, v0.4s
    b.eq        1f
    movrel      x3, mask
    add         x3, x3, x2, lsl #2
    ld1         {v29.4s}, [x3]
    and         v0.16b, v0.16b, v29.16b
1:
    faddp       v0.4s, v0.4s, v0.4s
    faddp       s0, v0.2s
    ret
endfunc

#endif /* BIT_DEPTH == 8 */

SAD_FUNC  4,  4
SAD_FUNC  4,  8
SAD_FUNC  4,  16
SAD_FUNC  8,  4
SAD_FUNC  8,  8
SAD_FUNC  8,  16
SAD_FUNC  16, 8
SAD_FUNC  16, 16

SAD_X_FUNC 3, 4,  4
SAD_X_FUNC 3, 4,  8
SAD_X_FUNC 3, 8,  4
SAD_X_FUNC 3, 8,  8
SAD_X_FUNC 3, 8,  16
SAD_X_FUNC 3, 16, 8
SAD_X_FUNC 3, 16, 16
SAD_X_FUNC 4, 4,  4
SAD_X_FUNC 4, 4,  8
SAD_X_FUNC 4, 8,  4
SAD_X_FUNC 4, 8,  8
SAD_X_FUNC 4, 8,  16
SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16

SSD_FUNC  4, 4
SSD_FUNC  4, 8
SSD_FUNC  4, 16
SSD_FUNC  8, 4
SSD_FUNC  8, 8
SSD_FUNC  8, 16
SSD_FUNC  16, 8
SSD_FUNC  16, 16

pixel_var_8  8
pixel_var_8  16

pixel_var2_8 8
pixel_var2_8 16

sa8d_satd_8x8
sa8d_satd_8x8 satd_

HADAMARD_AC 8,  8
HADAMARD_AC 8,  16
HADAMARD_AC 16, 8
HADAMARD_AC 16, 16

#if BIT_DEPTH == 8 && HAVE_DOTPROD
ENABLE_DOTPROD
SAD_FUNC_DOTPROD 16, 8
SAD_FUNC_DOTPROD 16, 16
SAD_X_DOTPROD_FUNC 3, 16, 8
SAD_X_DOTPROD_FUNC 3, 16, 16
SAD_X_DOTPROD_FUNC 4, 16, 8
SAD_X_DOTPROD_FUNC 4, 16, 16
SSD_DOTPROD_FUNC 8, 4
SSD_DOTPROD_FUNC 8, 8
SSD_DOTPROD_FUNC 8, 16
SSD_DOTPROD_FUNC 16, 8
SSD_DOTPROD_FUNC 16, 16
DISABLE_DOTPROD
#endif // BIT_DEPTH == 8 && HAVE_DOTPROD