/*****************************************************************************
 * mc.S: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          Mans Rullgard <mans@mansr.com>
 *          Stefan Groenroos <stefan.gronroos@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"
#include "mc-a-common.S"

// note: prefetch stuff assumes 64-byte cacheline

// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
function prefetch_ref_aarch64, export=1
    cmp         w2,  #1
    csel        x2,  xzr, x1, eq
    add         x0,  x0,  #64
    add         x0,  x0,  x2,  lsl #3

    lsl         x2,  x1,  #1
    add         x3,  x1,  x1,  lsl #1
    add         x4,  x0,  x1,  lsl #2

    prfm        pldl1strm, [x0]
    prfm        pldl1strm, [x0, x1]
    prfm        pldl1strm, [x0, x2]
    prfm        pldl1strm, [x0, x3]
    prfm        pldl1strm, [x4]
    prfm        pldl1strm, [x4, x1]
    prfm        pldl1strm, [x4, x2]
    prfm        pldl1strm, [x4, x3]
    ret
endfunc
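
/* Rough C sketch of the kernel above, for reference only (assumption:
 * pldl1strm behaves like a streaming, low-temporal-locality prefetch;
 * the function name here is illustrative, not part of x264's API):
 *
 *   void prefetch_ref_c( uint8_t *pix, intptr_t stride, int parity )
 *   {
 *       pix += 64;                    // skip past the left edge
 *       if( parity != 1 )
 *           pix += 8 * stride;        // other field: start 8 rows down
 *       for( int i = 0; i < 8; i++ )
 *           __builtin_prefetch( pix + i * stride, 0, 0 );
 *   }
 */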

// void prefetch_fenc( uint8_t *pix_y,  intptr_t stride_y,
//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
.macro prefetch_fenc sub
function prefetch_fenc_\sub\()_aarch64, export=1
    and         w6,  w5,  #3
    and         w7,  w5,  #3
    mul         x6,  x6,  x1
    mul         x7,  x7,  x3
    add         x0,  x0,  #64
    add         x2,  x2,  #64

    add         x0,  x0,  x6,  lsl #2
    add         x6,  x0,  x1,  lsl #1
    prfm        pldl1strm, [x0]
    prfm        pldl1strm, [x0, x1]
    prfm        pldl1strm, [x6]
    prfm        pldl1strm, [x6, x1]

    add         x2,  x2,  x7,  lsl #1
    prfm        pldl1strm, [x2]
    prfm        pldl1strm, [x2, x3]
.ifc \sub, 422
    add         x7,  x2,  x3,  lsl #1
    prfm        pldl1strm, [x7]
    prfm        pldl1strm, [x7, x3]
.endif
    ret
endfunc
.endm

prefetch_fenc 420
prefetch_fenc 422

function mbtree_propagate_cost_neon, export=1
    ld1r        {v5.4s},  [x5]
8:
    subs        w6,  w6,  #8
    ld1         {v1.8h},  [x1], #16
    ld1         {v2.8h},  [x2], #16
    ld1         {v3.8h},  [x3], #16
    ld1         {v4.8h},  [x4], #16
    bic         v3.8h,  #0xc0, lsl #8
    umin        v3.8h,  v2.8h,  v3.8h
    umull       v20.4s, v2.4h,  v4.4h   // propagate_intra
    umull2      v21.4s, v2.8h,  v4.8h   // propagate_intra
    usubl       v22.4s, v2.4h,  v3.4h   // propagate_num
    usubl2      v23.4s, v2.8h,  v3.8h   // propagate_num
    uxtl        v26.4s, v2.4h           // propagate_denom
    uxtl2       v27.4s, v2.8h           // propagate_denom
    uxtl        v24.4s, v1.4h
    uxtl2       v25.4s, v1.8h
    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    ucvtf       v26.4s, v26.4s
    ucvtf       v27.4s, v27.4s
    ucvtf       v22.4s, v22.4s
    ucvtf       v23.4s, v23.4s
    frecpe      v28.4s, v26.4s
    frecpe      v29.4s, v27.4s
    ucvtf       v24.4s, v24.4s
    ucvtf       v25.4s, v25.4s
    frecps      v30.4s, v28.4s, v26.4s
    frecps      v31.4s, v29.4s, v27.4s
    fmla        v24.4s, v20.4s, v5.4s   // propagate_amount
    fmla        v25.4s, v21.4s, v5.4s   // propagate_amount
    fmul        v28.4s, v28.4s, v30.4s
    fmul        v29.4s, v29.4s, v31.4s
    fmul        v16.4s, v24.4s, v22.4s
    fmul        v17.4s, v25.4s, v23.4s
    fmul        v18.4s, v16.4s, v28.4s
    fmul        v19.4s, v17.4s, v29.4s
    fcvtns      v20.4s, v18.4s
    fcvtns      v21.4s, v19.4s
    sqxtn       v0.4h,  v20.4s
    sqxtn2      v0.8h,  v21.4s
    st1         {v0.8h},  [x0], #16
    b.gt        8b
    ret
endfunc
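
/* Scalar sketch of the loop above (argument names follow the C reference in
 * x264's rate control; this is a reading aid, not the exact reference):
 *
 *   float fps = *fps_factor;                     // ld1r {v5.4s}
 *   for( int i = 0; i < len; i++ )
 *   {
 *       int intra = intra_costs[i];
 *       int inter = MIN( intra, inter_costs[i] & 0x3fff );  // bic + umin
 *       float propagate_intra  = intra * inv_qscales[i];
 *       float propagate_amount = propagate_in[i] + propagate_intra * fps;
 *       float propagate_num    = intra - inter;
 *       float propagate_denom  = intra;
 *       dst[i] = saturate_s16( lrintf( propagate_amount * propagate_num
 *                                      / propagate_denom ) );
 *   }
 *
 * The division is realized with frecpe + one frecps Newton step, and the
 * round/saturate with fcvtns + sqxtn.
 */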

const pw_0to15, align=5
    .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
endconst

function mbtree_propagate_list_internal_neon, export=1
    movrel      x11, pw_0to15
    dup         v31.8h, w4              // bipred_weight
    movi        v30.8h, #0xc0, lsl #8
    ld1         {v29.8h}, [x11]         // h->mb.i_mb_x,h->mb.i_mb_y
    movi        v28.4s, #4
    movi        v27.8h, #31
    movi        v26.8h, #32
    dup         v24.8h, w5              // mb_y
    zip1        v29.8h, v29.8h, v24.8h
8:
    subs        w6,  w6,  #8
    ld1         {v1.8h},  [x1], #16     // propagate_amount
    ld1         {v2.8h},  [x2], #16     // lowres_cost
    and         v2.16b, v2.16b, v30.16b
    cmeq        v25.8h, v2.8h,  v30.8h
    umull       v16.4s, v1.4h,  v31.4h
    umull2      v17.4s, v1.8h,  v31.8h
    rshrn       v16.4h, v16.4s, #6
    rshrn2      v16.8h, v17.4s, #6
    bsl         v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
    // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    ld1         {v4.8h,v5.8h}, [x0], #32
    sshr        v6.8h,  v4.8h,  #5
    sshr        v7.8h,  v5.8h,  #5
    add         v6.8h,  v6.8h,  v29.8h
    add         v29.8h, v29.8h, v28.8h
    add         v7.8h,  v7.8h,  v29.8h
    add         v29.8h, v29.8h, v28.8h
    st1         {v6.8h,v7.8h}, [x3], #32
    and         v4.16b, v4.16b, v27.16b
    and         v5.16b, v5.16b, v27.16b
    uzp1        v6.8h,  v4.8h,  v5.8h   // x & 31
    uzp2        v7.8h,  v4.8h,  v5.8h   // y & 31
    sub         v4.8h,  v26.8h, v6.8h   // 32 - (x & 31)
    sub         v5.8h,  v26.8h, v7.8h   // 32 - (y & 31)
    mul         v19.8h, v6.8h,  v7.8h   // idx3weight = y*x
    mul         v18.8h, v4.8h,  v7.8h   // idx2weight = y*(32-x)
    mul         v17.8h, v6.8h,  v5.8h   // idx1weight = (32-y)*x
    mul         v16.8h, v4.8h,  v5.8h   // idx0weight = (32-y)*(32-x)
    umull       v6.4s,  v19.4h, v25.4h
    umull2      v7.4s,  v19.8h, v25.8h
    umull       v4.4s,  v18.4h, v25.4h
    umull2      v5.4s,  v18.8h, v25.8h
    umull       v2.4s,  v17.4h, v25.4h
    umull2      v3.4s,  v17.8h, v25.8h
    umull       v0.4s,  v16.4h, v25.4h
    umull2      v1.4s,  v16.8h, v25.8h
    rshrn       v19.4h, v6.4s,  #10
    rshrn2      v19.8h, v7.4s,  #10
    rshrn       v18.4h, v4.4s,  #10
    rshrn2      v18.8h, v5.4s,  #10
    rshrn       v17.4h, v2.4s,  #10
    rshrn2      v17.8h, v3.4s,  #10
    rshrn       v16.4h, v0.4s,  #10
    rshrn2      v16.8h, v1.4s,  #10
    zip1        v0.8h,  v16.8h, v17.8h
    zip2        v1.8h,  v16.8h, v17.8h
    zip1        v2.8h,  v18.8h, v19.8h
    zip2        v3.8h,  v18.8h, v19.8h
    st1         {v0.8h,v1.8h}, [x3], #32
    st1         {v2.8h,v3.8h}, [x3], #32
    b.ge        8b
    ret
endfunc
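
/* The four idx weights are the 1/32-pel bilinear footprint of one propagated
 * block.  Worked example (illustrative values): mvx & 31 == 5, mvy & 31 == 20:
 *   idx0weight = (32-20)*(32-5) = 324   // top-left
 *   idx1weight = (32-20)*5      =  60   // top-right
 *   idx2weight = 20*(32-5)      = 540   // bottom-left
 *   idx3weight = 20*5           = 100   // bottom-right
 * The four weights always sum to 32*32 = 1024, hence the rshrn #10 above.
 */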

function memcpy_aligned_neon, export=1
    tst         x2,  #16
    b.eq        32f
    sub         x2,  x2,  #16
    ldr         q0,  [x1], #16
    str         q0,  [x0], #16
32:
    tst         x2,  #32
    b.eq        640f
    sub         x2,  x2,  #32
    ldp         q0,  q1,  [x1], #32
    stp         q0,  q1,  [x0], #32
640:
    cbz         x2,  1f
64:
    subs        x2,  x2,  #64
    ldp         q0,  q1,  [x1, #32]
    ldp         q2,  q3,  [x1], #64
    stp         q0,  q1,  [x0, #32]
    stp         q2,  q3,  [x0], #64
    b.gt        64b
1:
    ret
endfunc
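
/* The (16-byte-multiple) size is decomposed as (n & 16) + (n & 32) + 64*k.
 * Example: n = 112 copies 16 bytes, then 32 bytes, then one 64-byte round.
 */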

function memzero_aligned_neon, export=1
    movi        v0.16b, #0
    movi        v1.16b, #0
1:
    subs        x1,  x1,  #128
    stp         q0,  q1,  [x0, #96]
    stp         q0,  q1,  [x0, #64]
    stp         q0,  q1,  [x0, #32]
    stp         q0,  q1,  [x0], #128
    b.gt        1b
    ret
endfunc

// void mbtree_fix8_pack( int16_t *dst, float *src, int count )
function mbtree_fix8_pack_neon, export=1
    subs        w3,  w2,  #8
    b.lt        2f
1:
    subs        w3,  w3,  #8
    ld1         {v0.4s,v1.4s}, [x1], #32
    fcvtzs      v0.4s,  v0.4s,  #8
    fcvtzs      v1.4s,  v1.4s,  #8
    sqxtn       v2.4h,  v0.4s
    sqxtn2      v2.8h,  v1.4s
    rev16       v3.16b, v2.16b
    st1         {v3.8h},  [x0], #16
    b.ge        1b
2:
    adds        w3,  w3,  #8
    b.eq        4f
3:
    subs        w3,  w3,  #1
    ldr         s0,  [x1], #4
    fcvtzs      w4,  s0,  #8
    rev16       w5,  w4
    strh        w5,  [x0], #2
    b.gt        3b
4:
    ret
endfunc
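
/* fix8 is byte-swapped Q8.8: fcvtzs #8 scales by 256 while converting, then
 * rev16 swaps the bytes of each 16-bit lane.  Example: 2.5f -> 0x0280, stored
 * as the byte sequence 02 80.
 */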

// void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
function mbtree_fix8_unpack_neon, export=1
    subs        w3,  w2,  #8
    b.lt        2f
1:
    subs        w3,  w3,  #8
    ld1         {v0.8h},  [x1], #16
    rev16       v1.16b, v0.16b
    sxtl        v2.4s,  v1.4h
    sxtl2       v3.4s,  v1.8h
    scvtf       v4.4s,  v2.4s,  #8
    scvtf       v5.4s,  v3.4s,  #8
    st1         {v4.4s,v5.4s}, [x0], #32
    b.ge        1b
2:
    adds        w3,  w3,  #8
    b.eq        4f
3:
    subs        w3,  w3,  #1
    ldrh        w4,  [x1], #2
    rev16       w5,  w4
    sxth        w6,  w5
    scvtf       s0,  w6,  #8
    str         s0,  [x0], #4
    b.gt        3b
4:
    ret
endfunc

#if BIT_DEPTH == 8

// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
//                 uint8_t *src1, intptr_t src1_stride,
//                 uint8_t *src2, intptr_t src2_stride, int weight );
.macro AVGH w h
function pixel_avg_\w\()x\h\()_neon, export=1
    mov         w10, #64
    cmp         w6,  #32
    mov         w9,  #\h
    b.eq        pixel_avg_w\w\()_neon
    subs        w7,  w10, w6
    b.lt        pixel_avg_weight_w\w\()_add_sub_neon    // weight > 64
    cmp         w6,  #0
    b.ge        pixel_avg_weight_w\w\()_add_add_neon
    b           pixel_avg_weight_w\w\()_sub_add_neon    // weight < 0
endfunc
.endm

AVGH  4, 2
AVGH  4, 4
AVGH  4, 8
AVGH  4, 16
AVGH  8, 4
AVGH  8, 8
AVGH  8, 16
AVGH 16, 8
AVGH 16, 16

// 0 < weight < 64
// (load_weights_* helpers for AVG_WEIGHT below, mirroring the 10-bit
// variants later in this file; they were missing from this copy)
.macro load_weights_add_add
    mov         w6,  w6
.endm

.macro weight_add_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.16b
    umlal2      \dst, \s2, v31.16b
.else
    umull       \dst, \s1, v30.8b
    umlal       \dst, \s2, v31.8b
.endif
.endm

// weight > 64
.macro load_weights_add_sub
    neg         w7,  w7
.endm

.macro weight_add_sub dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.16b
    umlsl2      \dst, \s2, v31.16b
.else
    umull       \dst, \s1, v30.8b
    umlsl       \dst, \s2, v31.8b
.endif
.endm

// weight < 0
.macro load_weights_sub_add
    neg         w6,  w6
.endm

.macro weight_sub_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s2, v31.16b
    umlsl2      \dst, \s1, v30.16b
.else
    umull       \dst, \s2, v31.8b
    umlsl       \dst, \s1, v30.8b
.endif
.endm
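
/* All three variants compute dst = clip( (s1*w1 + s2*w2 + 32) >> 6 ) with
 * w1 = weight and w2 = 64 - weight; the add/sub split just keeps both
 * operands of the unsigned widening multiply-accumulate non-negative.
 * Example: weight = 80 gives w2 = -16, so add_sub multiplies by 16 and
 * subtracts.  The +32 rounding and the clip come from sqrshrun #6.
 */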

.macro AVG_WEIGHT ext
function pixel_avg_weight_w4_\ext\()_neon
    load_weights_\ext
    dup         v30.8b, w6
    dup         v31.8b, w7
1:  // height loop
    subs        w9,  w9,  #2
    ld1         {v0.s}[0], [x2], x3
    ld1         {v1.s}[0], [x4], x5
    weight_\ext v4.8h,  v0.8b,  v1.8b
    ld1         {v2.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x5
    sqrshrun    v0.8b,  v4.8h,  #6
    weight_\ext v5.8h,  v2.8b,  v3.8b
    st1         {v0.s}[0], [x0], x1
    sqrshrun    v1.8b,  v5.8h,  #6
    st1         {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w8_\ext\()_neon
    load_weights_\ext
    dup         v30.8b, w6
    dup         v31.8b, w7
1:  // height loop
    subs        w9,  w9,  #4
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x4], x5
    weight_\ext v16.8h, v0.8b,  v1.8b
    ld1         {v2.8b}, [x2], x3
    ld1         {v3.8b}, [x4], x5
    weight_\ext v17.8h, v2.8b,  v3.8b
    ld1         {v4.8b}, [x2], x3
    ld1         {v5.8b}, [x4], x5
    weight_\ext v18.8h, v4.8b,  v5.8b
    ld1         {v6.8b}, [x2], x3
    ld1         {v7.8b}, [x4], x5
    weight_\ext v19.8h, v6.8b,  v7.8b
    sqrshrun    v0.8b,  v16.8h, #6
    sqrshrun    v1.8b,  v17.8h, #6
    sqrshrun    v2.8b,  v18.8h, #6
    sqrshrun    v3.8b,  v19.8h, #6
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w16_\ext\()_neon
    load_weights_\ext
    dup         v30.16b, w6
    dup         v31.16b, w7
1:  // height loop
    subs        w9,  w9,  #2
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x4], x5
    weight_\ext v16.8h, v0.8b,  v1.8b
    weight_\ext v17.8h, v0.16b, v1.16b, 2
    ld1         {v2.16b}, [x2], x3
    ld1         {v3.16b}, [x4], x5
    weight_\ext v18.8h, v2.8b,  v3.8b
    weight_\ext v19.8h, v2.16b, v3.16b, 2
    sqrshrun    v0.8b,  v16.8h, #6
    sqrshrun    v1.8b,  v18.8h, #6
    sqrshrun2   v0.16b, v17.8h, #6
    sqrshrun2   v1.16b, v19.8h, #6
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
    b.gt        1b
    ret
endfunc
.endm

AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add

// weight == 32 fast path for the AVGH 4,x entries above; missing from this
// copy, restored following the w8/w16 pattern below
function pixel_avg_w4_neon
1:  subs        w9,  w9,  #2
    ld1         {v0.s}[0], [x2], x3
    ld1         {v2.s}[0], [x4], x5
    urhadd      v0.8b,  v0.8b,  v2.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x5
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w8_neon
1:  subs        w9,  w9,  #4
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x4], x5
    ld1         {v2.8b}, [x2], x3
    urhadd      v0.8b,  v0.8b,  v1.8b
    ld1         {v3.8b}, [x4], x5
    st1         {v0.8b}, [x0], x1
    ld1         {v4.8b}, [x2], x3
    urhadd      v1.8b,  v2.8b,  v3.8b
    ld1         {v5.8b}, [x4], x5
    st1         {v1.8b}, [x0], x1
    ld1         {v6.8b}, [x2], x3
    ld1         {v7.8b}, [x4], x5
    urhadd      v2.8b,  v4.8b,  v5.8b
    urhadd      v3.8b,  v6.8b,  v7.8b
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w16_neon
1:  subs        w9,  w9,  #4
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x4], x5
    ld1         {v2.16b}, [x2], x3
    urhadd      v0.16b, v0.16b, v1.16b
    ld1         {v3.16b}, [x4], x5
    st1         {v0.16b}, [x0], x1
    ld1         {v4.16b}, [x2], x3
    urhadd      v1.16b, v2.16b, v3.16b
    ld1         {v5.16b}, [x4], x5
    st1         {v1.16b}, [x0], x1
    ld1         {v6.16b}, [x2], x3
    ld1         {v7.16b}, [x4], x5
    urhadd      v2.16b, v4.16b, v5.16b
    urhadd      v3.16b, v6.16b, v7.16b
    st1         {v2.16b}, [x0], x1
    st1         {v3.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg2_w4_neon, export=1
1:
    subs        w5,  w5,  #2
    ld1         {v0.s}[0], [x2], x3
    ld1         {v2.s}[0], [x4], x3
    urhadd      v0.8b,  v0.8b,  v2.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x3
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg2_w8_neon, export=1
1:
    subs        w5,  w5,  #2
    ld1         {v0.8b}, [x2], x3
    ld1         {v2.8b}, [x4], x3
    urhadd      v0.8b,  v0.8b,  v2.8b
    ld1         {v1.8b}, [x2], x3
    ld1         {v3.8b}, [x4], x3
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg2_w16_neon, export=1
1:
    subs        w5,  w5,  #2
    ld1         {v0.16b}, [x2], x3
    ld1         {v2.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    ld1         {v1.16b}, [x2], x3
    ld1         {v3.16b}, [x4], x3
    urhadd      v1.16b, v1.16b, v3.16b
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg2_w20_neon, export=1
    sub         x1,  x1,  #16
1:
    subs        w5,  w5,  #2
    ld1         {v0.16b,v1.16b}, [x2], x3
    ld1         {v2.16b,v3.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    urhadd      v1.8b,  v1.8b,  v3.8b
    ld1         {v4.16b,v5.16b}, [x2], x3
    ld1         {v6.16b,v7.16b}, [x4], x3
    urhadd      v4.16b, v4.16b, v6.16b
    urhadd      v5.8b,  v5.8b,  v7.8b
    st1         {v0.16b},  [x0], #16
    st1         {v1.s}[0], [x0], x1
    st1         {v4.16b},  [x0], #16
    st1         {v5.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

.macro weight_prologue type
    mov         w9,  w5                 // height
.ifc \type, full
    ldr         w12, [x4, #32]          // denom
.endif
    ldp         w4,  w5,  [x4, #32+4]   // scale, offset
    dup         v0.16b, w4
    dup         v1.8h,  w5
.ifc \type, full
    neg         w12, w12
    dup         v2.8h,  w12
.endif
.endm

// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
//                 intptr_t dst_stride, const x264_weight_t *weight, int h )
function mc_weight_w20_neon, export=1
    weight_prologue full
    sub         x1,  x1,  #16
1:
    subs        w9,  w9,  #2
    ld1         {v16.8b,v17.8b,v18.8b}, [x2], x3
    ld1         {v19.8b,v20.8b,v21.8b}, [x2], x3
    umull       v22.8h, v16.8b, v0.8b
    umull       v23.8h, v17.8b, v0.8b
    zip1        v18.2s, v18.2s, v21.2s
    umull       v25.8h, v19.8b, v0.8b
    umull       v26.8h, v20.8b, v0.8b
    umull       v24.8h, v18.8b, v0.8b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    srshl       v26.8h, v26.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h
    add         v26.8h, v26.8h, v1.8h
    sqxtun      v4.8b,  v22.8h
    sqxtun2     v4.16b, v23.8h
    sqxtun      v6.8b,  v24.8h
    sqxtun      v5.8b,  v25.8h
    sqxtun2     v5.16b, v26.8h
    st1         {v4.16b},  [x0], #16
    st1         {v6.s}[0], [x0], x1
    st1         {v5.16b},  [x0], #16
    st1         {v6.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w16_neon, export=1
    weight_prologue full
weight16_loop:
1:
    subs        w9,  w9,  #2
    ld1         {v4.16b}, [x2], x3
    ld1         {v5.16b}, [x2], x3
    umull       v22.8h, v4.8b,  v0.8b
    umull2      v23.8h, v4.16b, v0.16b
    umull       v24.8h, v5.8b,  v0.8b
    umull2      v25.8h, v5.16b, v0.16b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h
    sqxtun      v4.8b,  v22.8h
    sqxtun2     v4.16b, v23.8h
    sqxtun      v5.8b,  v24.8h
    sqxtun2     v5.16b, v25.8h
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w8_neon, export=1
    weight_prologue full
1:
    subs        w9,  w9,  #2
    ld1         {v16.8b}, [x2], x3
    ld1         {v17.8b}, [x2], x3
    umull       v4.8h,  v16.8b, v0.8b
    umull       v5.8h,  v17.8b, v0.8b
    srshl       v4.8h,  v4.8h,  v2.8h
    srshl       v5.8h,  v5.8h,  v2.8h
    add         v4.8h,  v4.8h,  v1.8h
    add         v5.8h,  v5.8h,  v1.8h
    sqxtun      v16.8b, v4.8h
    sqxtun      v17.8b, v5.8h
    st1         {v16.8b}, [x0], x1
    st1         {v17.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w4_neon, export=1
    weight_prologue full
1:
    subs        w9,  w9,  #2
    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    umull       v4.8h,  v16.8b, v0.8b
    srshl       v4.8h,  v4.8h,  v2.8h
    add         v4.8h,  v4.8h,  v1.8h
    sqxtun      v16.8b, v4.8h
    st1         {v16.s}[0], [x0], x1
    st1         {v16.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w20_nodenom_neon, export=1
    weight_prologue nodenom
    sub         x1,  x1,  #16
1:
    subs        w9,  w9,  #2
    ld1         {v16.8b,v17.8b,v18.8b}, [x2], x3
    mov         v27.16b, v1.16b
    mov         v28.16b, v1.16b
    ld1         {v19.8b,v20.8b,v21.8b}, [x2], x3
    mov         v31.16b, v1.16b
    mov         v29.16b, v1.16b
    mov         v30.16b, v1.16b
    zip1        v18.2s, v18.2s, v21.2s
    umlal       v27.8h, v16.8b, v0.8b
    umlal       v28.8h, v17.8b, v0.8b
    umlal       v31.8h, v18.8b, v0.8b
    umlal       v29.8h, v19.8b, v0.8b
    umlal       v30.8h, v20.8b, v0.8b
    sqxtun      v4.8b,  v27.8h
    sqxtun2     v4.16b, v28.8h
    sqxtun      v5.8b,  v29.8h
    sqxtun2     v5.16b, v30.8h
    sqxtun      v6.8b,  v31.8h
    st1         {v4.16b},  [x0], #16
    st1         {v6.s}[0], [x0], x1
    st1         {v5.16b},  [x0], #16
    st1         {v6.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w16_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9,  w9,  #2
    ld1         {v6.16b}, [x2], x3
    mov         v27.16b, v1.16b
    mov         v28.16b, v1.16b
    ld1         {v7.16b}, [x2], x3
    mov         v29.16b, v1.16b
    mov         v30.16b, v1.16b
    umlal       v27.8h, v6.8b,  v0.8b
    umlal2      v28.8h, v6.16b, v0.16b
    umlal       v29.8h, v7.8b,  v0.8b
    umlal2      v30.8h, v7.16b, v0.16b
    sqxtun      v4.8b,  v27.8h
    sqxtun2     v4.16b, v28.8h
    sqxtun      v5.8b,  v29.8h
    sqxtun2     v5.16b, v30.8h
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w8_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9,  w9,  #2
    ld1         {v16.8b}, [x2], x3
    mov         v27.16b, v1.16b
    ld1         {v17.8b}, [x2], x3
    mov         v29.16b, v1.16b
    umlal       v27.8h, v16.8b, v0.8b
    umlal       v29.8h, v17.8b, v0.8b
    sqxtun      v4.8b,  v27.8h
    sqxtun      v5.8b,  v29.8h
    st1         {v4.8b}, [x0], x1
    st1         {v5.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w4_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9,  w9,  #2
    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    mov         v27.16b, v1.16b
    umlal       v27.8h, v16.8b, v0.8b
    sqxtun      v4.8b,  v27.8h
    st1         {v4.s}[0], [x0], x1
    st1         {v4.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

.macro weight_simple_prologue
    ldr         w6,  [x4]               // offset
    dup         v1.16b, w6
.endm

.macro weight_simple name op
function mc_weight_w20_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ldr         s18, [x2, #16]
    ld1         {v16.16b}, [x2], x3
    ldr         s19, [x2, #16]
    ld1         {v17.16b}, [x2], x3
    \op         v18.8b,  v18.8b,  v1.8b
    \op         v16.16b, v16.16b, v1.16b
    \op         v19.8b,  v19.8b,  v1.8b
    \op         v17.16b, v17.16b, v1.16b
    str         s18, [x0, #16]
    st1         {v16.16b}, [x0], x1
    str         s19, [x0, #16]
    st1         {v17.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w16_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ld1         {v16.16b}, [x2], x3
    ld1         {v17.16b}, [x2], x3
    \op         v16.16b, v16.16b, v1.16b
    \op         v17.16b, v17.16b, v1.16b
    st1         {v16.16b}, [x0], x1
    st1         {v17.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w8_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ld1         {v16.8b}, [x2], x3
    ld1         {v17.8b}, [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    \op         v17.8b, v17.8b, v1.8b
    st1         {v16.8b}, [x0], x1
    st1         {v17.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_weight_w4_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    st1         {v16.s}[0], [x0], x1
    st1         {v16.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc
.endm

weight_simple offsetadd, uqadd
weight_simple offsetsub, uqsub


// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
function mc_copy_w4_neon, export=1
1:
    subs        w4,  w4,  #4
    ld1         {v0.s}[0], [x2], x3
    ld1         {v1.s}[0], [x2], x3
    ld1         {v2.s}[0], [x2], x3
    ld1         {v3.s}[0], [x2], x3
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    st1         {v2.s}[0], [x0], x1
    st1         {v3.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function mc_copy_w8_neon, export=1
1:  subs        w4,  w4,  #4
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x2], x3
    ld1         {v2.8b}, [x2], x3
    ld1         {v3.8b}, [x2], x3
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_copy_w16_neon, export=1
1:  subs        w4,  w4,  #4
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x2], x3
    ld1         {v2.16b}, [x2], x3
    ld1         {v3.16b}, [x2], x3
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
    st1         {v2.16b}, [x0], x1
    st1         {v3.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

// void mc_chroma( uint8_t *dst_u, uint8_t *dst_v,
//                 intptr_t i_dst_stride,
//                 uint8_t *src, intptr_t i_src_stride,
//                 int dx, int dy, int i_width, int i_height );
function mc_chroma_neon, export=1
    ldr         w15, [sp]               // height
    sbfx        x12, x6,  #3,  #29      // asr(3) and sign extend
    sbfx        x11, x5,  #3,  #29      // asr(3) and sign extend
    cmp         w7,  #4
    mul         x12, x12, x4
    add         x3,  x3,  x11, lsl #1

    and         w5,  w5,  #7
    and         w6,  w6,  #7

    add         x3,  x3,  x12

    //pld [x3]
    //pld [x3, x4]

    b.gt        mc_chroma_w8_neon
    b.eq        mc_chroma_w4_neon
endfunc
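
/* The MV is split above into an integer and a 1/8-pel fractional part:
 *   src += (dy >> 3) * stride + (dx >> 3) * 2;  // NV12: 2 bytes per chroma pel
 *   d8x = dx & 7;  d8y = dy & 7;
 * Widths below 4 fall through into mc_chroma_w2_neon below.
 */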

.macro CHROMA_MC_START r00, r01, r10, r11
    mul         w12, w5,  w6            // cD = d8x*d8y
    lsl         w13, w5,  #3
    add         w9,  w12, #64
    lsl         w14, w6,  #3
    tst         w12, w12
    sub         w9,  w9,  w13
    sub         w10, w13, w12           // cB = d8x*(8-d8y)
    sub         w11, w14, w12           // cC = (8-d8x)*d8y
    sub         w9,  w9,  w14           // cA = (8-d8x)*(8-d8y)
.endm
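
/* cA..cD are the 1/8-pel bilinear taps, expanded so that only one multiply
 * is needed: cA = 64 - 8*d8x - 8*d8y + d8x*d8y.  Worked example: d8x=3,
 * d8y=2 gives cA=30, cB=18, cC=10, cD=6 (sum 64, hence the rshrn #6 later).
 * The tst sets the flags consumed by the b.eq fast paths when d8x*d8y == 0.
 */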

.macro CHROMA_MC width, vsize
function mc_chroma_w\width\()_neon
// since the element size varies, there's a different index for the 2nd store
.if \width == 4
.set idx2, 1
.else
.set idx2, 2
.endif
    CHROMA_MC_START
    b.eq        2f

    ld2         {v28.8b,v29.8b}, [x3], x4
    dup         v0.8b,  w9              // cA
    dup         v1.8b,  w10             // cB

    ext         v6.8b,  v28.8b, v6.8b,  #1
    ext         v7.8b,  v29.8b, v7.8b,  #1

    ld2         {v30.8b,v31.8b}, [x3], x4
    dup         v2.8b,  w11             // cC
    dup         v3.8b,  w12             // cD

    ext         v22.8b, v30.8b, v22.8b, #1
    ext         v23.8b, v31.8b, v23.8b, #1

    trn1        v0.2s,  v0.2s,  v1.2s
    trn1        v2.2s,  v2.2s,  v3.2s

    trn1        v4.2s,  v28.2s, v6.2s
    trn1        v5.2s,  v29.2s, v7.2s
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s
1:  // height loop, interpolate xy
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    umlal       v16.8h, v20.8b, v2.8b
    umull       v17.8h, v5.8b,  v0.8b
    umlal       v17.8h, v21.8b, v2.8b

    ld2         {v28.8b,v29.8b}, [x3], x4
    transpose   v24.2d, v25.2d, v16.2d, v17.2d

    ext         v6.8b,  v28.8b, v6.8b,  #1
    ext         v7.8b,  v29.8b, v7.8b,  #1

    trn1        v4.2s,  v28.2s, v6.2s
    trn1        v5.2s,  v29.2s, v7.2s

    add         v16.8h, v24.8h, v25.8h

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v4.8b,  v2.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v5.8b,  v2.8b

    ld2         {v30.8b,v31.8b}, [x3], x4
    transpose   v26.2d, v27.2d, v18.2d, v19.2d

    ext         v22.8b, v30.8b, v22.8b, #1
    ext         v23.8b, v31.8b, v23.8b, #1
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s

    add         v17.8h, v26.8h, v27.8h

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.\vsize}[0],    [x0], x2
    st1         {v16.\vsize}[idx2], [x1], x2
    st1         {v17.\vsize}[0],    [x0], x2
    st1         {v17.\vsize}[idx2], [x1], x2
    b.gt        1b

    ret
2:  // dx or dy are 0
    tst         w11, w11
    add         w10, w10, w11
    dup         v0.8b,  w9
    dup         v1.8b,  w10

    b.eq        4f

    ld1         {v4.8b}, [x3], x4
    ld1         {v6.8b}, [x3], x4
3:  // vertical interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    ld1         {v4.8b}, [x3], x4
    umlal       v16.8h, v6.8b,  v1.8b
    umull       v17.8h, v6.8b,  v0.8b
    ld1         {v6.8b}, [x3], x4
    umlal       v17.8h, v4.8b,  v1.8b

    rshrn       v20.8b, v16.8h, #6      // uvuvuvuv
    rshrn       v21.8b, v17.8h, #6      // uvuvuvuv

    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.\vsize}[0],    [x0], x2
    st1         {v16.\vsize}[idx2], [x0], x2
    st1         {v17.\vsize}[0],    [x1], x2
    st1         {v17.\vsize}[idx2], [x1], x2
    b.gt        3b

    ret

4:  // dy is 0
    ld1         {v4.8b,v5.8b}, [x3], x4
    ld1         {v6.8b,v7.8b}, [x3], x4

    ext         v5.8b,  v4.8b,  v5.8b,  #2
    ext         v7.8b,  v6.8b,  v7.8b,  #2
5:  // horizontal interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    umlal       v16.8h, v5.8b,  v1.8b
    umull       v17.8h, v6.8b,  v0.8b
    umlal       v17.8h, v7.8b,  v1.8b

    ld1         {v4.8b,v5.8b}, [x3], x4
    ld1         {v6.8b,v7.8b}, [x3], x4
    rshrn       v20.8b, v16.8h, #6
    rshrn       v21.8b, v17.8h, #6
    ext         v5.8b,  v4.8b,  v5.8b,  #2
    ext         v7.8b,  v6.8b,  v7.8b,  #2
    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.\vsize}[0],    [x0], x2
    st1         {v16.\vsize}[idx2], [x0], x2
    st1         {v17.\vsize}[0],    [x1], x2
    st1         {v17.\vsize}[idx2], [x1], x2
    b.gt        5b

    ret
endfunc
.endm

CHROMA_MC 2, h
CHROMA_MC 4, s

function mc_chroma_w8_neon
    CHROMA_MC_START
    b.eq        2f
    ld2         {v4.16b,v5.16b},   [x3], x4
    ld2         {v20.16b,v21.16b}, [x3], x4
    dup         v0.8b,  w9              // cA
    dup         v1.8b,  w10             // cB

    ext         v6.16b,  v4.16b,  v4.16b,  #1
    ext         v7.16b,  v5.16b,  v5.16b,  #1

    dup         v2.8b,  w11             // cC
    dup         v3.8b,  w12             // cD

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

1:  // height loop, interpolate xy
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    umlal       v16.8h, v6.8b,  v1.8b
    umlal       v16.8h, v20.8b, v2.8b
    umlal       v16.8h, v22.8b, v3.8b

    umull       v17.8h, v5.8b,  v0.8b
    umlal       v17.8h, v7.8b,  v1.8b
    umlal       v17.8h, v21.8b, v2.8b
    umlal       v17.8h, v23.8b, v3.8b

    ld2         {v4.16b,v5.16b}, [x3], x4

    ext         v6.16b,  v4.16b,  v4.16b,  #1
    ext         v7.16b,  v5.16b,  v5.16b,  #1

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umlal       v18.8h, v4.8b,  v2.8b
    umlal       v18.8h, v6.8b,  v3.8b

    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b
    umlal       v19.8h, v5.8b,  v2.8b
    umlal       v19.8h, v7.8b,  v3.8b

    ld2         {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2
    b.gt        1b

    ret
2:  // dx or dy are 0
    tst         w11, w11
    add         w10, w10, w11
    dup         v0.8b,  w9
    dup         v1.8b,  w10

    b.eq        4f

    ld2         {v4.8b,v5.8b}, [x3], x4
    ld2         {v6.8b,v7.8b}, [x3], x4
3:  // vertical interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b   // U
    umlal       v16.8h, v6.8b,  v1.8b
    umull       v17.8h, v5.8b,  v0.8b   // V
    umlal       v17.8h, v7.8b,  v1.8b

    ld2         {v4.8b,v5.8b}, [x3], x4

    umull       v18.8h, v6.8b,  v0.8b
    umlal       v18.8h, v4.8b,  v1.8b
    umull       v19.8h, v7.8b,  v0.8b
    umlal       v19.8h, v5.8b,  v1.8b

    ld2         {v6.8b,v7.8b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2
    b.gt        3b

    ret
4:  // dy is 0
    ld2         {v4.16b,v5.16b},   [x3], x4
    ext         v6.16b,  v4.16b,  v4.16b,  #1
    ext         v7.16b,  v5.16b,  v5.16b,  #1
    ld2         {v20.16b,v21.16b}, [x3], x4
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1
5:  // horizontal interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b   // U
    umlal       v16.8h, v6.8b,  v1.8b
    umull       v17.8h, v5.8b,  v0.8b   // V
    umlal       v17.8h, v7.8b,  v1.8b

    ld2         {v4.16b,v5.16b}, [x3], x4

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b

    ld2         {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v6.16b,  v4.16b,  v4.16b,  #1
    ext         v7.16b,  v5.16b,  v5.16b,  #1
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    //pld [x3]
    //pld [x3, x4]

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2
    b.gt        5b

    ret
endfunc

// void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
//                   intptr_t stride, int width, int height, int16_t *buf )
function hpel_filter_neon, export=1
    ubfm        x9,  x3,  #0,  #3
    add         w15, w5,  w9
    sub         x13, x3,  x9            // align src
    sub         x10, x0,  x9
    sub         x11, x1,  x9
    sub         x12, x2,  x9
    movi        v30.16b, #5
    movi        v31.16b, #20
1:  // line start
    mov         x3,  x13
    mov         x2,  x12
    mov         x1,  x11
    mov         x0,  x10
    add         x7,  x3,  #16           // src pointer next 16b for horiz filter
    mov         x5,  x15                // restore width
    sub         x3,  x3,  x4,  lsl #1   // src - 2*stride
    ld1         {v28.16b}, [x7], #16    // src[16:31]

    add         x9,  x3,  x5            // holds src - 2*stride + width

    ld1         {v16.16b}, [x3], x4     // src-2*stride[0:15]
    ld1         {v17.16b}, [x3], x4     // src-1*stride[0:15]
    ld1         {v18.16b}, [x3], x4     // src+0*stride[0:15]
    ld1         {v19.16b}, [x3], x4     // src+1*stride[0:15]
    ld1         {v20.16b}, [x3], x4     // src+2*stride[0:15]
    ld1         {v21.16b}, [x3], x4     // src+3*stride[0:15]

    ext         v22.16b, v7.16b,  v18.16b, #14
    uaddl       v1.8h,   v16.8b,  v21.8b
    ext         v26.16b, v18.16b, v28.16b, #3
    umlsl       v1.8h,   v17.8b,  v30.8b
    ext         v23.16b, v7.16b,  v18.16b, #15
    umlal       v1.8h,   v18.8b,  v31.8b
    ext         v24.16b, v18.16b, v28.16b, #1
    umlal       v1.8h,   v19.8b,  v31.8b
    ext         v25.16b, v18.16b, v28.16b, #2
    umlsl       v1.8h,   v20.8b,  v30.8b
2:  // next 16 pixel of line
    subs        x5,  x5,  #16
    sub         x3,  x9,  x5            // src - 2*stride += 16

    uaddl       v4.8h,  v22.8b,  v26.8b
    uaddl2      v5.8h,  v22.16b, v26.16b
    sqrshrun    v6.8b,  v1.8h,   #5
    umlsl       v4.8h,  v23.8b,  v30.8b
    umlsl2      v5.8h,  v23.16b, v30.16b
    umlal       v4.8h,  v18.8b,  v31.8b
    umlal2      v5.8h,  v18.16b, v31.16b
    umlal       v4.8h,  v24.8b,  v31.8b
    umlal2      v5.8h,  v24.16b, v31.16b
    umlsl       v4.8h,  v25.8b,  v30.8b
    umlsl2      v5.8h,  v25.16b, v30.16b

    uaddl2      v2.8h,  v16.16b, v21.16b
    sqrshrun    v4.8b,  v4.8h,   #5
    mov         v7.16b, v18.16b
    sqrshrun2   v4.16b, v5.8h,   #5

    umlsl2      v2.8h,  v17.16b, v30.16b
    ld1         {v16.16b}, [x3], x4     // src-2*stride[0:15]
    umlal2      v2.8h,  v18.16b, v31.16b
    ld1         {v17.16b}, [x3], x4     // src-1*stride[0:15]
    umlal2      v2.8h,  v19.16b, v31.16b
    ld1         {v18.16b}, [x3], x4     // src+0*stride[0:15]
    umlsl2      v2.8h,  v20.16b, v30.16b
    ld1         {v19.16b}, [x3], x4     // src+1*stride[0:15]
    st1         {v4.16b}, [x0], #16
    sqrshrun2   v6.16b, v2.8h,   #5
    ld1         {v20.16b}, [x3], x4     // src+2*stride[0:15]
    ld1         {v21.16b}, [x3], x4     // src+3*stride[0:15]

    ext         v22.16b, v0.16b, v1.16b, #12
    ext         v26.16b, v1.16b, v2.16b, #6
    ext         v23.16b, v0.16b, v1.16b, #14
    st1         {v6.16b}, [x1], #16
    uaddl       v3.8h,  v16.8b,  v21.8b
    ext         v25.16b, v1.16b, v2.16b, #4
    umlsl       v3.8h,  v17.8b,  v30.8b
    ext         v24.16b, v1.16b, v2.16b, #2

    umlal       v3.8h,  v18.8b,  v31.8b
    add         v4.8h,  v22.8h,  v26.8h
    umlal       v3.8h,  v19.8b,  v31.8b
    add         v5.8h,  v23.8h,  v25.8h
    umlsl       v3.8h,  v20.8b,  v30.8b
    add         v6.8h,  v24.8h,  v1.8h

    ext         v22.16b, v1.16b, v2.16b, #12
    ext         v26.16b, v2.16b, v3.16b, #6
    ext         v23.16b, v1.16b, v2.16b, #14
    ext         v25.16b, v2.16b, v3.16b, #4
    ext         v24.16b, v2.16b, v3.16b, #2

    add         v22.8h, v22.8h, v26.8h
    add         v23.8h, v23.8h, v25.8h
    add         v24.8h, v24.8h, v2.8h

    sub         v4.8h,  v4.8h,  v5.8h   // a-b
    sub         v5.8h,  v5.8h,  v6.8h   // b-c

    sub         v22.8h, v22.8h, v23.8h  // a-b
    sub         v23.8h, v23.8h, v24.8h  // b-c

    sshr        v4.8h,  v4.8h,  #2      // (a-b)/4
    sshr        v22.8h, v22.8h, #2      // (a-b)/4
    sub         v4.8h,  v4.8h,  v5.8h   // (a-b)/4-b+c
    sub         v22.8h, v22.8h, v23.8h  // (a-b)/4-b+c
    sshr        v4.8h,  v4.8h,  #2      // ((a-b)/4-b+c)/4
    sshr        v22.8h, v22.8h, #2      // ((a-b)/4-b+c)/4
    add         v4.8h,  v4.8h,  v6.8h   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    add         v22.8h, v22.8h, v24.8h  // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16

    sqrshrun    v4.8b,  v4.8h,  #6
    ld1         {v28.16b}, [x7], #16    // src[16:31]
    mov         v0.16b, v2.16b
    ext         v23.16b, v7.16b, v18.16b, #15
    sqrshrun2   v4.16b, v22.8h, #6
    mov         v1.16b, v3.16b
    ext         v22.16b, v7.16b, v18.16b, #14
    ext         v24.16b, v18.16b, v28.16b, #1
    ext         v25.16b, v18.16b, v28.16b, #2
    ext         v26.16b, v18.16b, v28.16b, #3

    st1         {v4.16b}, [x2], #16
    b.gt        2b

    subs        w6,  w6,  #1
    add         x10, x10, x4
    add         x11, x11, x4
    add         x12, x12, x4
    add         x13, x13, x4
    b.gt        1b

    ret
endfunc
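
/* The strength reduction behind the comments in the center pass: with a, b, c
 * the 6-tap pairs at distance 3, 2, 1 from the interpolation point,
 *   ((a-b)/4 - b + c)/4 + c
 *     = a/16 - (1/16 + 4/16)*b + (4/16 + 16/16)*c
 *     = (a - 5*b + 20*c)/16
 * so the full filter needs only shifts, adds and subs in 16-bit lanes, and
 * never widens the vertical (a-5b+20c) intermediates to 32 bits.
 */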

// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
//                         uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
//                         intptr_t dst_stride, int width, int height )
function frame_init_lowres_core_neon, export=1
    ldr         w8,  [sp]
    sub         x10, x6,  w7, uxtw      // dst_stride - width
    and         x10, x10, #~15

1:
    mov         w9,  w7                 // width
    mov         x11, x0                 // src0
    add         x12, x0,  x5            // src1 = src0 + src_stride
    add         x13, x0,  x5,  lsl #1   // src2 = src1 + src_stride

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32

    urhadd      v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
    urhadd      v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]
2:
    subs        w9,  w9,  #16
    urhadd      v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32
    urhadd      v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
    urhadd      v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
    ext         v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v20.16b, v21.16b
    urhadd      v18.16b, v22.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1         {v16.16b}, [x1], #16
    st1         {v18.16b}, [x3], #16
    st1         {v17.16b}, [x2], #16
    st1         {v19.16b}, [x4], #16
    b.le        3f

    subs        w9,  w9,  #16
    urhadd      v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32
    urhadd      v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
    urhadd      v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
    ext         v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v30.16b, v21.16b
    urhadd      v18.16b, v31.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1         {v16.16b}, [x1], #16
    st1         {v18.16b}, [x3], #16
    st1         {v17.16b}, [x2], #16
    st1         {v19.16b}, [x4], #16
    b.gt        2b
3:
    subs        w8,  w8,  #1
    add         x0,  x0,  x5,  lsl #1
    add         x1,  x1,  x10
    add         x2,  x2,  x10
    add         x3,  x3,  x10
    add         x4,  x4,  x10
    b.gt        1b

    ret
endfunc
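
/* Per output pixel this is the usual 2x decimation by cascaded rounding
 * averages, e.g. for the first plane:
 *   dst0[x] = avg( avg(s0[2x], s1[2x]), avg(s0[2x+1], s1[2x+1]) )
 * with dsth/dstv/dstc shifted by one input pixel in x, y, and both.
 * urhadd computes (a + b + 1) >> 1 at each stage.
 */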

function load_deinterleave_chroma_fenc_neon, export=1
    mov         x4,  #FENC_STRIDE/2
    b           load_deinterleave_chroma
endfunc

function load_deinterleave_chroma_fdec_neon, export=1
    mov         x4,  #FDEC_STRIDE/2
load_deinterleave_chroma:
    ld2         {v0.8b,v1.8b}, [x1], x2
    ld2         {v2.8b,v3.8b}, [x1], x2
    subs        w3,  w3,  #2
    st1         {v0.8b}, [x0], x4
    st1         {v1.8b}, [x0], x4
    st1         {v2.8b}, [x0], x4
    st1         {v3.8b}, [x0], x4
    b.gt        load_deinterleave_chroma

    ret
endfunc

function plane_copy_core_neon, export=1
    add         w8,  w4,  #15           // a 32-bit write clears the upper 32 bits of the register
    and         w4,  w8,  #~15
    // safe use of the full reg since negative width makes no sense
    sub         x1,  x1,  x4
    sub         x3,  x3,  x4
1:
    mov         w8,  w4
16:
    tst         w8,  #16
    b.eq        32f
    subs        w8,  w8,  #16
    ldr         q0,  [x2], #16
    str         q0,  [x0], #16
    b.eq        0f
32:
    subs        w8,  w8,  #32
    ldp         q0,  q1,  [x2], #32
    stp         q0,  q1,  [x0], #32
    b.gt        32b
0:
    subs        w5,  w5,  #1
    add         x2,  x2,  x3
    add         x0,  x0,  x1
    b.gt        1b

    ret
endfunc
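
/* plane_copy_core rounds the width up to a multiple of 16 and so may read and
 * write up to 15 pixels past it; the _core wrappers are only used on buffers
 * with enough padding for that.  Example: width 37 is processed as 48 = 16+32.
 */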

function plane_copy_swap_core_neon, export=1
    lsl         w4,  w4,  #1
    sub         x1,  x1,  x4
    sub         x3,  x3,  x4
1:
    mov         w8,  w4
    tbz         w4,  #4,  32f
    subs        w8,  w8,  #16
    ld1         {v0.16b}, [x2], #16
    rev16       v0.16b, v0.16b
    st1         {v0.16b}, [x0], #16
    b.eq        0f
32:
    subs        w8,  w8,  #32
    ld1         {v0.16b,v1.16b}, [x2], #32
    rev16       v0.16b, v0.16b
    rev16       v1.16b, v1.16b
    st1         {v0.16b,v1.16b}, [x0], #32
    b.gt        32b
0:
    subs        w5,  w5,  #1
    add         x2,  x2,  x3
    add         x0,  x0,  x1
    b.gt        1b

    ret
endfunc

function plane_copy_deinterleave_neon, export=1
    add         w9,  w6,  #15
    and         w9,  w9,  #0xfffffff0
    sub         x1,  x1,  x9
    sub         x3,  x3,  x9
    sub         x5,  x5,  x9,  lsl #1
1:
    ld2         {v0.16b,v1.16b}, [x4], #32
    subs        w9,  w9,  #16
    st1         {v0.16b}, [x0], #16
    st1         {v1.16b}, [x2], #16
    b.gt        1b

    add         x4,  x4,  x5
    subs        w7,  w7,  #1
    add         x0,  x0,  x1
    add         x2,  x2,  x3
    mov         w9,  w6
    b.gt        1b

    ret
endfunc

.macro deinterleave_rgb
    subs        x11, x11, #8
    st1         {v0.8b}, [x0], #8
    st1         {v1.8b}, [x2], #8
    st1         {v2.8b}, [x4], #8
    b.gt        1b

    subs        w10, w10, #1
    add         x0,  x0,  x1
    add         x2,  x2,  x3
    add         x4,  x4,  x5
    add         x6,  x6,  x7
    mov         x11, x9
    b.gt        1b
.endm

function plane_copy_deinterleave_rgb_neon, export=1
#if SYS_MACOSX
    ldr         w8,  [sp]
    ldp         w9,  w10, [sp, #4]
#else
    ldr         x8,  [sp]
    ldp         x9,  x10, [sp, #8]
#endif
    cmp         w8,  #3
    uxtw        x9,  w9
    add         x11, x9,  #7
    and         x11, x11, #~7
    sub         x1,  x1,  x11
    sub         x3,  x3,  x11
    sub         x5,  x5,  x11
    b.ne        4f
    sub         x7,  x7,  x11, lsl #1
    sub         x7,  x7,  x11
1:
    ld3         {v0.8b,v1.8b,v2.8b}, [x6], #24
    deinterleave_rgb

    ret
4:
    sub         x7,  x7,  x11, lsl #2
1:
    ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
    deinterleave_rgb

    ret
endfunc

function plane_copy_interleave_core_neon, export=1
    add         w9,  w6,  #15
    and         w9,  w9,  #0xfffffff0
    sub         x1,  x1,  x9,  lsl #1
    sub         x3,  x3,  x9
    sub         x5,  x5,  x9
1:
    ld1         {v0.16b}, [x2], #16
    ld1         {v1.16b}, [x4], #16
    subs        w9,  w9,  #16
    st2         {v0.16b,v1.16b}, [x0], #32
    b.gt        1b

    subs        w7,  w7,  #1
    add         x0,  x0,  x1
    add         x2,  x2,  x3
    add         x4,  x4,  x5
    mov         w9,  w6
    b.gt        1b

    ret
endfunc

function store_interleave_chroma_neon, export=1
    mov         x5,  #FDEC_STRIDE
1:
    ld1         {v0.8b}, [x2], x5
    ld1         {v1.8b}, [x3], x5
    ld1         {v2.8b}, [x2], x5
    ld1         {v3.8b}, [x3], x5
    subs        w4,  w4,  #2
    zip1        v4.16b, v0.16b, v1.16b
    zip1        v5.16b, v2.16b, v3.16b
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1
    b.gt        1b

    ret
endfunc

.macro integral4h p1, p2
    ext         v1.8b,  \p1\().8b,  \p2\().8b,  #1
    ext         v2.8b,  \p1\().8b,  \p2\().8b,  #2
    ext         v3.8b,  \p1\().8b,  \p2\().8b,  #3
    uaddl       v0.8h,  \p1\().8b,  v1.8b
    uaddl       v4.8h,  v2.8b,  v3.8b
    add         v0.8h,  v0.8h,  v4.8h
    add         v0.8h,  v0.8h,  v5.8h
.endm

function integral_init4h_neon, export=1
    sub         x3,  x0,  x2,  lsl #1
    ld1         {v6.8b,v7.8b}, [x1], #16
1:
    subs        x2,  x2,  #16
    ld1         {v5.8h},  [x3], #16
    integral4h  v6, v7
    ld1         {v6.8b},  [x1], #8
    ld1         {v5.8h},  [x3], #16
    st1         {v0.8h},  [x0], #16
    integral4h  v7, v6
    ld1         {v7.8b},  [x1], #8
    st1         {v0.8h},  [x0], #16
    b.gt        1b
    ret
endfunc

.macro integral8h p1, p2, s
    ext         v1.8b,  \p1\().8b,  \p2\().8b,  #1
    ext         v2.8b,  \p1\().8b,  \p2\().8b,  #2
    ext         v3.8b,  \p1\().8b,  \p2\().8b,  #3
    ext         v4.8b,  \p1\().8b,  \p2\().8b,  #4
    ext         v5.8b,  \p1\().8b,  \p2\().8b,  #5
    ext         v6.8b,  \p1\().8b,  \p2\().8b,  #6
    ext         v7.8b,  \p1\().8b,  \p2\().8b,  #7
    uaddl       v0.8h,  \p1\().8b,  v1.8b
    uaddl       v2.8h,  v2.8b,  v3.8b
    uaddl       v4.8h,  v4.8b,  v5.8b
    uaddl       v6.8h,  v6.8b,  v7.8b
    add         v0.8h,  v0.8h,  v2.8h
    add         v4.8h,  v4.8h,  v6.8h
    add         v0.8h,  v0.8h,  v4.8h
    add         v0.8h,  v0.8h,  \s\().8h
.endm

function integral_init8h_neon, export=1
    sub         x3,  x0,  x2,  lsl #1
    ld1         {v16.8b,v17.8b}, [x1], #16
1:
    subs        x2,  x2,  #16
    ld1         {v18.8h}, [x3], #16
    integral8h  v16, v17, v18
    ld1         {v16.8b}, [x1], #8
    ld1         {v18.8h}, [x3], #16
    st1         {v0.8h},  [x0], #16
    integral8h  v17, v16, v18
    ld1         {v17.8b}, [x1], #8
    st1         {v0.8h},  [x0], #16
    b.gt        1b
    ret
endfunc

function integral_init4v_neon, export=1
    mov         x3,  x0
    add         x4,  x0,  x2,  lsl #3
    add         x8,  x0,  x2,  lsl #4
    sub         x2,  x2,  #8
    ld1         {v20.8h,v21.8h,v22.8h}, [x3], #48
    ld1         {v16.8h,v17.8h,v18.8h}, [x8], #48
1:
    subs        x2,  x2,  #16
    ld1         {v24.8h,v25.8h}, [x4], #32
    ext         v0.16b, v20.16b, v21.16b, #8
    ext         v1.16b, v21.16b, v22.16b, #8
    ext         v2.16b, v16.16b, v17.16b, #8
    ext         v3.16b, v17.16b, v18.16b, #8
    sub         v24.8h, v24.8h, v20.8h
    sub         v25.8h, v25.8h, v21.8h
    add         v0.8h,  v0.8h,  v20.8h
    add         v1.8h,  v1.8h,  v21.8h
    add         v2.8h,  v2.8h,  v16.8h
    add         v3.8h,  v3.8h,  v17.8h
    st1         {v24.8h}, [x1], #16
    st1         {v25.8h}, [x1], #16
    mov         v20.16b, v22.16b
    mov         v16.16b, v18.16b
    sub         v0.8h,  v2.8h,  v0.8h
    sub         v1.8h,  v3.8h,  v1.8h
    ld1         {v21.8h,v22.8h}, [x3], #32
    ld1         {v17.8h,v18.8h}, [x8], #32
    st1         {v0.8h},  [x0], #16
    st1         {v1.8h},  [x0], #16
    b.gt        1b
2:
    ret
endfunc

function integral_init8v_neon, export=1
    add         x2,  x0,  x1,  lsl #4
    sub         x1,  x1,  #8
    ands        x3,  x1,  #16 - 1
    b.eq        1f
    subs        x1,  x1,  #8
    ld1         {v0.8h},  [x0]
    ld1         {v2.8h},  [x2], #16
    sub         v4.8h,  v2.8h,  v0.8h
    st1         {v4.8h},  [x0], #16
    b.le        2f
1:
    subs        x1,  x1,  #16
    ld1         {v0.8h,v1.8h}, [x0]
    ld1         {v2.8h,v3.8h}, [x2], #32
    sub         v4.8h,  v2.8h,  v0.8h
    sub         v5.8h,  v3.8h,  v1.8h
    st1         {v4.8h},  [x0], #16
    st1         {v5.8h},  [x0], #16
    b.gt        1b
2:
    ret
endfunc

#else // BIT_DEPTH == 8

// void pixel_avg( pixel *dst,  intptr_t dst_stride,
//                 pixel *src1, intptr_t src1_stride,
//                 pixel *src2, intptr_t src2_stride, int weight );
.macro AVGH w h
function pixel_avg_\w\()x\h\()_neon, export=1
    mov         w10, #64
    cmp         w6,  #32
    mov         w9,  #\h
    b.eq        pixel_avg_w\w\()_neon
    subs        w7,  w10, w6
    b.lt        pixel_avg_weight_w\w\()_add_sub_neon    // weight > 64
    cmp         w6,  #0
    b.ge        pixel_avg_weight_w\w\()_add_add_neon
    b           pixel_avg_weight_w\w\()_sub_add_neon    // weight < 0
endfunc
.endm

AVGH  4, 2
AVGH  4, 4
AVGH  4, 8
AVGH  4, 16
AVGH  8, 4
AVGH  8, 8
AVGH  8, 16
AVGH 16, 8
AVGH 16, 16

// 0 < weight < 64
.macro load_weights_add_add
    mov         w6,  w6
.endm

.macro weight_add_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.8h
    umlal2      \dst, \s2, v31.8h
.else
    umull       \dst, \s1, v30.4h
    umlal       \dst, \s2, v31.4h
.endif
.endm

// weight > 64
.macro load_weights_add_sub
    neg         w7,  w7
.endm

.macro weight_add_sub dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.8h
    umlsl2      \dst, \s2, v31.8h
.else
    umull       \dst, \s1, v30.4h
    umlsl       \dst, \s2, v31.4h
.endif
.endm

// weight < 0
.macro load_weights_sub_add
    neg         w6,  w6
.endm

.macro weight_sub_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s2, v31.8h
    umlsl2      \dst, \s1, v30.8h
.else
    umull       \dst, \s2, v31.4h
    umlsl       \dst, \s1, v30.4h
.endif
.endm

.macro AVG_WEIGHT ext
function pixel_avg_weight_w4_\ext\()_neon
    load_weights_\ext
    dup         v30.8h, w6
    dup         v31.8h, w7
    lsl         x3,  x3,  #1
    lsl         x5,  x5,  #1
    lsl         x1,  x1,  #1
1:  // height loop
    subs        w9,  w9,  #2
    ld1         {v0.d}[0], [x2], x3
    ld1         {v1.d}[0], [x4], x5
    weight_\ext v4.4s,  v0.4h,  v1.4h
    ld1         {v2.d}[0], [x2], x3
    ld1         {v3.d}[0], [x4], x5

    mvni        v28.8h, #0xfc, lsl #8

    sqrshrun    v4.4h,  v4.4s,  #6
    weight_\ext v5.4s,  v2.4h,  v3.4h
    smin        v4.4h,  v4.4h,  v28.4h
    sqrshrun    v5.4h,  v5.4s,  #6

    st1         {v4.d}[0], [x0], x1

    smin        v5.4h,  v5.4h,  v28.4h

    st1         {v5.d}[0], [x0], x1

    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w8_\ext\()_neon
    load_weights_\ext
    dup         v30.8h, w6
    dup         v31.8h, w7
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
    lsl         x5,  x5,  #1
1:  // height loop
    subs        w9,  w9,  #4
    ld1         {v0.8h}, [x2], x3
    ld1         {v1.8h}, [x4], x5
    weight_\ext v16.4s, v0.4h,  v1.4h
    weight_\ext v17.4s, v0.8h,  v1.8h,  2
    ld1         {v2.8h}, [x2], x3
    ld1         {v3.8h}, [x4], x5
    weight_\ext v18.4s, v2.4h,  v3.4h
    weight_\ext v19.4s, v2.8h,  v3.8h,  2
    ld1         {v4.8h}, [x2], x3
    ld1         {v5.8h}, [x4], x5
    weight_\ext v20.4s, v4.4h,  v5.4h
    weight_\ext v21.4s, v4.8h,  v5.8h,  2
    ld1         {v6.8h}, [x2], x3
    ld1         {v7.8h}, [x4], x5
    weight_\ext v22.4s, v6.4h,  v7.4h
    weight_\ext v23.4s, v6.8h,  v7.8h,  2

    mvni        v28.8h, #0xfc, lsl #8

    sqrshrun    v0.4h,  v16.4s, #6
    sqrshrun    v2.4h,  v18.4s, #6
    sqrshrun    v4.4h,  v20.4s, #6
    sqrshrun2   v0.8h,  v17.4s, #6
    sqrshrun    v6.4h,  v22.4s, #6
    sqrshrun2   v2.8h,  v19.4s, #6
    sqrshrun2   v4.8h,  v21.4s, #6
    smin        v0.8h,  v0.8h,  v28.8h
    smin        v2.8h,  v2.8h,  v28.8h
    sqrshrun2   v6.8h,  v23.4s, #6
    smin        v4.8h,  v4.8h,  v28.8h
    smin        v6.8h,  v6.8h,  v28.8h

    st1         {v0.8h}, [x0], x1
    st1         {v2.8h}, [x0], x1
    st1         {v4.8h}, [x0], x1
    st1         {v6.8h}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w16_\ext\()_neon
    load_weights_\ext
    dup         v30.8h, w6
    dup         v31.8h, w7
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
    lsl         x5,  x5,  #1
1:  // height loop
    subs        w9,  w9,  #2

    ld1         {v0.8h, v1.8h}, [x2], x3
    ld1         {v2.8h, v3.8h}, [x4], x5
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x4], x5

    weight_\ext v16.4s, v0.4h,  v2.4h
    weight_\ext v17.4s, v0.8h,  v2.8h,  2
    weight_\ext v18.4s, v1.4h,  v3.4h
    weight_\ext v19.4s, v1.8h,  v3.8h,  2
    weight_\ext v20.4s, v4.4h,  v6.4h
    weight_\ext v21.4s, v4.8h,  v6.8h,  2
    weight_\ext v22.4s, v5.4h,  v7.4h
    weight_\ext v23.4s, v5.8h,  v7.8h,  2

    mvni        v28.8h, #0xfc, lsl #8

    sqrshrun    v0.4h,  v16.4s, #6
    sqrshrun    v1.4h,  v18.4s, #6
    sqrshrun    v2.4h,  v20.4s, #6
    sqrshrun2   v0.8h,  v17.4s, #6
    sqrshrun2   v1.8h,  v19.4s, #6
    sqrshrun2   v2.8h,  v21.4s, #6
    smin        v0.8h,  v0.8h,  v28.8h
    smin        v1.8h,  v1.8h,  v28.8h
    sqrshrun    v3.4h,  v22.4s, #6
    smin        v2.8h,  v2.8h,  v28.8h
    sqrshrun2   v3.8h,  v23.4s, #6
    smin        v3.8h,  v3.8h,  v28.8h

    st1         {v0.8h, v1.8h}, [x0], x1
    st1         {v2.8h, v3.8h}, [x0], x1
    b.gt        1b
    ret
endfunc
.endm

AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add

function pixel_avg_w4_neon
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
    lsl         x5,  x5,  #1

1:  subs        w9,  w9,  #2
    ld1         {v0.d}[0], [x2], x3
    ld1         {v2.d}[0], [x4], x5
    ld1         {v0.d}[1], [x2], x3
    ld1         {v2.d}[1], [x4], x5
    urhadd      v0.8h,  v0.8h,  v2.8h
    st1         {v0.d}[0], [x0], x1
    st1         {v0.d}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w8_neon
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
    lsl         x5,  x5,  #1
1:  subs        w9,  w9,  #4
    ld1         {v0.8h}, [x2], x3
    ld1         {v1.8h}, [x4], x5
    ld1         {v2.8h}, [x2], x3
    urhadd      v0.8h,  v0.8h,  v1.8h
    ld1         {v3.8h}, [x4], x5
    st1         {v0.8h}, [x0], x1
    ld1         {v4.8h}, [x2], x3
    urhadd      v1.8h,  v2.8h,  v3.8h
    ld1         {v5.8h}, [x4], x5
    st1         {v1.8h}, [x0], x1
    ld1         {v6.8h}, [x2], x3
    ld1         {v7.8h}, [x4], x5
    urhadd      v2.8h,  v4.8h,  v5.8h
    urhadd      v3.8h,  v6.8h,  v7.8h
    st1         {v2.8h}, [x0], x1
    st1         {v3.8h}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w16_neon
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
    lsl         x5,  x5,  #1

1:  subs        w9,  w9,  #4

    ld1         {v0.8h, v1.8h},   [x2], x3
    ld1         {v2.8h, v3.8h},   [x4], x5
    ld1         {v4.8h, v5.8h},   [x2], x3
    urhadd      v0.8h,  v0.8h,  v2.8h
    urhadd      v1.8h,  v1.8h,  v3.8h
    ld1         {v6.8h, v7.8h},   [x4], x5
    ld1         {v20.8h, v21.8h}, [x2], x3
    st1         {v0.8h, v1.8h},   [x0], x1
    urhadd      v4.8h,  v4.8h,  v6.8h
    urhadd      v5.8h,  v5.8h,  v7.8h
    ld1         {v22.8h, v23.8h}, [x4], x5
    ld1         {v24.8h, v25.8h}, [x2], x3
    st1         {v4.8h, v5.8h},   [x0], x1
    ld1         {v26.8h, v27.8h}, [x4], x5
    urhadd      v20.8h, v20.8h, v22.8h
    urhadd      v21.8h, v21.8h, v23.8h
    urhadd      v24.8h, v24.8h, v26.8h
    urhadd      v25.8h, v25.8h, v27.8h
    st1         {v20.8h, v21.8h}, [x0], x1
    st1         {v24.8h, v25.8h}, [x0], x1

    b.gt        1b
    ret
endfunc

function pixel_avg2_w4_neon, export=1
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
1:
    subs        w5,  w5,  #2
    ld1         {v0.4h}, [x2], x3
    ld1         {v2.4h}, [x4], x3
    ld1         {v1.4h}, [x2], x3
    ld1         {v3.4h}, [x4], x3
    urhadd      v0.4h,  v0.4h,  v2.4h
    urhadd      v1.4h,  v1.4h,  v3.4h

    st1         {v0.4h}, [x0], x1
    st1         {v1.4h}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg2_w8_neon, export=1
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
1:
    subs        w5,  w5,  #2
    ld1         {v0.8h}, [x2], x3
    ld1         {v2.8h}, [x4], x3
    ld1         {v1.8h}, [x2], x3
    ld1         {v3.8h}, [x4], x3
    urhadd      v0.8h,  v0.8h,  v2.8h
    urhadd      v1.8h,  v1.8h,  v3.8h

    st1         {v0.8h}, [x0], x1
    st1         {v1.8h}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg2_w16_neon, export=1
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
1:
    subs        w5,  w5,  #2
    ld1         {v0.8h, v1.8h}, [x2], x3
    ld1         {v2.8h, v3.8h}, [x4], x3
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x4], x3
    urhadd      v0.8h,  v0.8h,  v2.8h
    urhadd      v1.8h,  v1.8h,  v3.8h
    urhadd      v4.8h,  v4.8h,  v6.8h
    urhadd      v5.8h,  v5.8h,  v7.8h

    st1         {v0.8h, v1.8h}, [x0], x1
    st1         {v4.8h, v5.8h}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg2_w20_neon, export=1
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
    sub         x1,  x1,  #32
1:
    subs        w5,  w5,  #2

    ld1         {v0.8h, v1.8h, v2.8h},    [x2], x3
    ld1         {v3.8h, v4.8h, v5.8h},    [x4], x3
    ld1         {v20.8h, v21.8h, v22.8h}, [x2], x3
    ld1         {v23.8h, v24.8h, v25.8h}, [x4], x3

    urhadd      v0.8h,  v0.8h,  v3.8h
    urhadd      v1.8h,  v1.8h,  v4.8h
    urhadd      v2.4h,  v2.4h,  v5.4h
    urhadd      v20.8h, v20.8h, v23.8h
    urhadd      v21.8h, v21.8h, v24.8h
    urhadd      v22.4h, v22.4h, v25.4h

    st1         {v0.8h, v1.8h},   [x0], #32
    st1         {v2.4h},          [x0], x1
    st1         {v20.8h, v21.8h}, [x0], #32
    st1         {v22.4h},         [x0], x1
    b.gt        1b
    ret
endfunc

// void mc_copy( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int height )
function mc_copy_w4_neon, export=1
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
1:
    subs        w4,  w4,  #4
    ld1         {v0.d}[0], [x2], x3
    ld1         {v1.d}[0], [x2], x3
    ld1         {v2.d}[0], [x2], x3
    ld1         {v3.d}[0], [x2], x3
    st1         {v0.d}[0], [x0], x1
    st1         {v1.d}[0], [x0], x1
    st1         {v2.d}[0], [x0], x1
    st1         {v3.d}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function mc_copy_w8_neon, export=1
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
1:  subs        w4,  w4,  #4
    ld1         {v0.8h}, [x2], x3
    ld1         {v1.8h}, [x2], x3
    ld1         {v2.8h}, [x2], x3
    ld1         {v3.8h}, [x2], x3
    st1         {v0.8h}, [x0], x1
    st1         {v1.8h}, [x0], x1
    st1         {v2.8h}, [x0], x1
    st1         {v3.8h}, [x0], x1
    b.gt        1b
    ret
endfunc

function mc_copy_w16_neon, export=1
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
1:  subs        w4,  w4,  #4
    ld1         {v0.8h, v1.8h}, [x2], x3
    ld1         {v2.8h, v3.8h}, [x2], x3
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x2], x3
    st1         {v0.8h, v1.8h}, [x0], x1
    st1         {v2.8h, v3.8h}, [x0], x1
    st1         {v4.8h, v5.8h}, [x0], x1
    st1         {v6.8h, v7.8h}, [x0], x1
    b.gt        1b
    ret
endfunc

.macro weight_prologue type
    mov         w9,  w5                 // height
.ifc \type, full
    ldr         w12, [x4, #32]          // denom
.endif
    ldp         w4,  w5,  [x4, #32+4]   // scale, offset
    dup         v0.8h,  w4
    lsl         w5,  w5,  #2            // rescale the 8-bit-range offset to 10-bit
    dup         v1.4s,  w5
.ifc \type, full
    neg         w12, w12
    dup         v2.4s,  w12
.endif
.endm

// void mc_weight( pixel *src, intptr_t src_stride, pixel *dst,
//                 intptr_t dst_stride, const x264_weight_t *weight, int h )
function mc_weight_w20_neon, export=1
    weight_prologue full
    lsl         x3,  x3,  #1
    lsl         x1,  x1,  #1
    sub         x1,  x1,  #32
1:
    subs        w9,  w9,  #2
    ld1         {v16.8h, v17.8h, v18.8h}, [x2], x3
    ld1         {v19.8h, v20.8h, v21.8h}, [x2], x3

    umull       v22.4s, v16.4h, v0.4h
    umull2      v23.4s, v16.8h, v0.8h
    umull       v24.4s, v17.4h, v0.4h
    umull2      v25.4s, v17.8h, v0.8h
    umull       v26.4s, v18.4h, v0.4h
    umull       v27.4s, v21.4h, v0.4h

    srshl       v22.4s, v22.4s, v2.4s
    srshl       v23.4s, v23.4s, v2.4s
    srshl       v24.4s, v24.4s, v2.4s
    srshl       v25.4s, v25.4s, v2.4s
    srshl       v26.4s, v26.4s, v2.4s
    srshl       v27.4s, v27.4s, v2.4s
    add         v22.4s, v22.4s, v1.4s
    add         v23.4s, v23.4s, v1.4s
    add         v24.4s, v24.4s, v1.4s
    add         v25.4s, v25.4s, v1.4s
    add         v26.4s, v26.4s, v1.4s
    add         v27.4s, v27.4s, v1.4s

    sqxtun      v22.4h, v22.4s
    sqxtun2     v22.8h, v23.4s
    sqxtun      v23.4h, v24.4s
    sqxtun2     v23.8h, v25.4s
    sqxtun      v24.4h, v26.4s
    sqxtun2     v24.8h, v27.4s

    umull       v16.4s, v19.4h, v0.4h
    umull2      v17.4s, v19.8h, v0.8h
    umull       v18.4s, v20.4h, v0.4h
    umull2      v19.4s, v20.8h, v0.8h

    srshl       v16.4s, v16.4s, v2.4s
    srshl       v17.4s, v17.4s, v2.4s
    srshl       v18.4s, v18.4s, v2.4s
    srshl       v19.4s, v19.4s, v2.4s
    add         v16.4s, v16.4s, v1.4s
    add         v17.4s, v17.4s, v1.4s
    add         v18.4s, v18.4s, v1.4s
    add         v19.4s, v19.4s, v1.4s

    sqxtun      v16.4h, v16.4s
    sqxtun2     v16.8h, v17.4s
    sqxtun      v17.4h, v18.4s
    sqxtun2     v17.8h, v19.4s

    mvni        v31.8h, #0xfc, lsl #8

    umin        v22.8h, v22.8h, v31.8h
    umin        v23.8h, v23.8h, v31.8h
    umin        v24.8h, v24.8h, v31.8h
    umin        v16.8h, v16.8h, v31.8h
    umin        v17.8h, v17.8h, v31.8h

    st1         {v22.8h, v23.8h}, [x0], #32
    st1         {v24.d}[0],       [x0], x1
    st1         {v16.8h, v17.8h}, [x0], #32
    st1         {v24.d}[1],       [x0], x1

    b.gt        1b
    ret
endfunc

function mc_weight_w16_neon, export=1
    weight_prologue full
    lsl         x1,  x1,  #1
    lsl         x3,  x3,  #1
1:
    subs        w9,  w9,  #2
    ld1         {v4.8h, v5.8h}, [x2], x3
    ld1         {v6.8h, v7.8h}, [x2], x3

    umull       v22.4s, v4.4h,  v0.4h
    umull2      v23.4s, v4.8h,  v0.8h
    umull       v24.4s, v5.4h,  v0.4h
    umull2      v25.4s, v5.8h,  v0.8h

    srshl       v22.4s, v22.4s, v2.4s
    srshl       v23.4s, v23.4s, v2.4s
    srshl       v24.4s, v24.4s, v2.4s
    srshl       v25.4s, v25.4s, v2.4s

    add         v22.4s, v22.4s, v1.4s
    add         v23.4s, v23.4s, v1.4s
    add         v24.4s, v24.4s, v1.4s
    add         v25.4s, v25.4s, v1.4s

    sqxtun      v22.4h, v22.4s
    sqxtun2     v22.8h, v23.4s
    sqxtun      v23.4h, v24.4s
    sqxtun2     v23.8h, v25.4s

    umull       v26.4s, v6.4h,  v0.4h
    umull2      v27.4s, v6.8h,  v0.8h
    umull       v28.4s, v7.4h,  v0.4h
    umull2      v29.4s, v7.8h,  v0.8h

    srshl       v26.4s, v26.4s, v2.4s
    srshl       v27.4s, v27.4s, v2.4s
    srshl       v28.4s, v28.4s, v2.4s
    srshl       v29.4s, v29.4s, v2.4s

    add         v26.4s, v26.4s, v1.4s
    add         v27.4s, v27.4s, v1.4s
    add         v28.4s, v28.4s, v1.4s
    add         v29.4s, v29.4s, v1.4s

    sqxtun      v26.4h, v26.4s
    sqxtun2     v26.8h, v27.4s
    sqxtun      v27.4h, v28.4s
    sqxtun2     v27.8h, v29.4s

    mvni        v31.8h, #0xfc, lsl #8

    umin        v22.8h, v22.8h, v31.8h
    umin        v23.8h, v23.8h, v31.8h
    umin        v26.8h, v26.8h, v31.8h
    umin        v27.8h, v27.8h, v31.8h

    st1         {v22.8h, v23.8h}, [x0], x1
    st1         {v26.8h, v27.8h}, [x0], x1

    b.gt        1b
    ret
endfunc
|
|
|
|
function mc_weight_w8_neon, export=1
    weight_prologue full
    lsl             x3, x3, #1
    lsl             x1, x1, #1
1:
    subs            w9, w9, #2
    ld1             {v16.8h}, [x2], x3
    ld1             {v17.8h}, [x2], x3

    umull           v4.4s, v16.4h, v0.4h
    umull2          v5.4s, v16.8h, v0.8h
    umull           v6.4s, v17.4h, v0.4h
    umull2          v7.4s, v17.8h, v0.8h

    srshl           v4.4s, v4.4s, v2.4s
    srshl           v5.4s, v5.4s, v2.4s
    srshl           v6.4s, v6.4s, v2.4s
    srshl           v7.4s, v7.4s, v2.4s

    add             v4.4s, v4.4s, v1.4s
    add             v5.4s, v5.4s, v1.4s
    add             v6.4s, v6.4s, v1.4s
    add             v7.4s, v7.4s, v1.4s

    sqxtun          v16.4h, v4.4s
    sqxtun2         v16.8h, v5.4s
    sqxtun          v17.4h, v6.4s
    sqxtun2         v17.8h, v7.4s

    mvni            v28.8h, #0xfc, lsl #8

    umin            v16.8h, v16.8h, v28.8h
    umin            v17.8h, v17.8h, v28.8h

    st1             {v16.8h}, [x0], x1
    st1             {v17.8h}, [x0], x1
    b.gt            1b
    ret
endfunc

function mc_weight_w4_neon, export=1
    weight_prologue full
    lsl             x3, x3, #1
    lsl             x1, x1, #1
1:
    subs            w9, w9, #2
    ld1             {v16.d}[0], [x2], x3
    ld1             {v16.d}[1], [x2], x3
    umull           v4.4s, v16.4h, v0.4h
    umull2          v5.4s, v16.8h, v0.8h
    srshl           v4.4s, v4.4s, v2.4s
    srshl           v5.4s, v5.4s, v2.4s
    add             v4.4s, v4.4s, v1.4s
    add             v5.4s, v5.4s, v1.4s

    sqxtun          v16.4h, v4.4s
    sqxtun2         v16.8h, v5.4s

    mvni            v28.8h, #0xfc, lsl #8

    umin            v16.8h, v16.8h, v28.8h

    st1             {v16.d}[0], [x0], x1
    st1             {v16.d}[1], [x0], x1
    b.gt            1b
    ret
endfunc

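// The nodenom variants drop the denom shift entirely: the pre-scaled
// offset is seeded into the accumulators (the mov/umlal pairs below),
// so roughly
//     dst[x] = min( max( src[x]*scale + (offset << 2), 0 ), (1 << 10) - 1 );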
function mc_weight_w20_nodenom_neon, export=1
    weight_prologue nodenom
    lsl             x3, x3, #1
    lsl             x1, x1, #1
    sub             x1, x1, #32
1:
    subs            w9, w9, #2
    ld1             {v16.8h, v17.8h, v18.8h}, [x2], x3
    mov             v20.16b, v1.16b
    mov             v21.16b, v1.16b
    mov             v22.16b, v1.16b
    mov             v23.16b, v1.16b
    mov             v24.16b, v1.16b
    mov             v25.16b, v1.16b
    ld1             {v2.8h, v3.8h, v4.8h}, [x2], x3
    mov             v26.16b, v1.16b
    mov             v27.16b, v1.16b
    mov             v28.16b, v1.16b
    mov             v29.16b, v1.16b

    umlal           v20.4s, v16.4h, v0.4h
    umlal2          v21.4s, v16.8h, v0.8h
    umlal           v22.4s, v17.4h, v0.4h
    umlal2          v23.4s, v17.8h, v0.8h
    umlal           v24.4s, v18.4h, v0.4h
    umlal           v25.4s, v4.4h, v0.4h
    umlal           v26.4s, v2.4h, v0.4h
    umlal2          v27.4s, v2.8h, v0.8h
    umlal           v28.4s, v3.4h, v0.4h
    umlal2          v29.4s, v3.8h, v0.8h

    sqxtun          v2.4h, v20.4s
    sqxtun2         v2.8h, v21.4s
    sqxtun          v3.4h, v22.4s
    sqxtun2         v3.8h, v23.4s
    sqxtun          v4.4h, v24.4s
    sqxtun2         v4.8h, v25.4s
    sqxtun          v5.4h, v26.4s
    sqxtun2         v5.8h, v27.4s
    sqxtun          v6.4h, v28.4s
    sqxtun2         v6.8h, v29.4s

    mvni            v31.8h, #0xfc, lsl #8

    umin            v2.8h, v2.8h, v31.8h
    umin            v3.8h, v3.8h, v31.8h
    umin            v4.8h, v4.8h, v31.8h
    umin            v5.8h, v5.8h, v31.8h
    umin            v6.8h, v6.8h, v31.8h

    st1             {v2.8h, v3.8h}, [x0], #32
    st1             {v4.d}[0], [x0], x1
    st1             {v5.8h, v6.8h}, [x0], #32
    st1             {v4.d}[1], [x0], x1

    b.gt            1b
    ret
endfunc

function mc_weight_w16_nodenom_neon, export=1
    weight_prologue nodenom
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    subs            w9, w9, #2
    ld1             {v2.8h, v3.8h}, [x2], x3
    mov             v27.16b, v1.16b
    mov             v28.16b, v1.16b
    mov             v29.16b, v1.16b
    mov             v30.16b, v1.16b
    ld1             {v4.8h, v5.8h}, [x2], x3
    mov             v20.16b, v1.16b
    mov             v21.16b, v1.16b
    mov             v22.16b, v1.16b
    mov             v23.16b, v1.16b

    umlal           v27.4s, v2.4h, v0.4h
    umlal2          v28.4s, v2.8h, v0.8h
    umlal           v29.4s, v3.4h, v0.4h
    umlal2          v30.4s, v3.8h, v0.8h

    umlal           v20.4s, v4.4h, v0.4h
    umlal2          v21.4s, v4.8h, v0.8h
    umlal           v22.4s, v5.4h, v0.4h
    umlal2          v23.4s, v5.8h, v0.8h

    sqxtun          v2.4h, v27.4s
    sqxtun2         v2.8h, v28.4s
    sqxtun          v3.4h, v29.4s
    sqxtun2         v3.8h, v30.4s

    sqxtun          v4.4h, v20.4s
    sqxtun2         v4.8h, v21.4s
    sqxtun          v5.4h, v22.4s
    sqxtun2         v5.8h, v23.4s

    mvni            v31.8h, #0xfc, lsl #8

    umin            v2.8h, v2.8h, v31.8h
    umin            v3.8h, v3.8h, v31.8h
    umin            v4.8h, v4.8h, v31.8h
    umin            v5.8h, v5.8h, v31.8h

    st1             {v2.8h, v3.8h}, [x0], x1
    st1             {v4.8h, v5.8h}, [x0], x1
    b.gt            1b
    ret
endfunc

function mc_weight_w8_nodenom_neon, export=1
    weight_prologue nodenom
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    subs            w9, w9, #2
    ld1             {v16.8h}, [x2], x3
    mov             v27.16b, v1.16b
    ld1             {v17.8h}, [x2], x3
    mov             v28.16b, v1.16b
    mov             v29.16b, v1.16b
    mov             v30.16b, v1.16b

    umlal           v27.4s, v16.4h, v0.4h
    umlal2          v28.4s, v16.8h, v0.8h
    umlal           v29.4s, v17.4h, v0.4h
    umlal2          v30.4s, v17.8h, v0.8h

    sqxtun          v4.4h, v27.4s
    sqxtun2         v4.8h, v28.4s
    sqxtun          v5.4h, v29.4s
    sqxtun2         v5.8h, v30.4s

    mvni            v31.8h, #0xfc, lsl #8

    umin            v4.8h, v4.8h, v31.8h
    umin            v5.8h, v5.8h, v31.8h

    st1             {v4.8h}, [x0], x1
    st1             {v5.8h}, [x0], x1
    b.gt            1b
    ret
endfunc

function mc_weight_w4_nodenom_neon, export=1
    weight_prologue nodenom
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    subs            w9, w9, #2
    ld1             {v16.d}[0], [x2], x3
    ld1             {v16.d}[1], [x2], x3
    mov             v27.16b, v1.16b
    mov             v28.16b, v1.16b
    umlal           v27.4s, v16.4h, v0.4h
    umlal2          v28.4s, v16.8h, v0.8h

    sqxtun          v4.4h, v27.4s
    sqxtun2         v4.8h, v28.4s

    mvni            v31.8h, #0xfc, lsl #8

    umin            v4.8h, v4.8h, v31.8h

    st1             {v4.d}[0], [x0], x1
    st1             {v4.d}[1], [x0], x1
    b.gt            1b
    ret
endfunc

.macro weight_simple_prologue
    ldr             w6, [x4]                // offset
    lsl             w6, w6, #2
    dup             v1.8h, w6
.endm

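// When scale == 1 << denom the weight reduces to a pure offset, so the
// offsetadd/offsetsub variants are just a saturating add/sub of
// (offset << 2) followed by the usual clamp to (1 << 10) - 1.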
.macro weight_simple name op
function mc_weight_w20_\name\()_neon, export=1
    weight_simple_prologue
    lsl             x1, x1, #1
    lsl             x3, x3, #1
    sub             x1, x1, #32
1:
    subs            w5, w5, #2
    ld1             {v2.8h, v3.8h, v4.8h}, [x2], x3
    ld1             {v5.8h, v6.8h, v7.8h}, [x2], x3

    zip1            v4.2d, v4.2d, v7.2d

    \op             v2.8h, v2.8h, v1.8h
    \op             v3.8h, v3.8h, v1.8h
    \op             v4.8h, v4.8h, v1.8h
    \op             v5.8h, v5.8h, v1.8h
    \op             v6.8h, v6.8h, v1.8h

    mvni            v28.8h, #0xfc, lsl #8

    umin            v2.8h, v2.8h, v28.8h
    umin            v3.8h, v3.8h, v28.8h
    umin            v4.8h, v4.8h, v28.8h
    umin            v5.8h, v5.8h, v28.8h
    umin            v6.8h, v6.8h, v28.8h

    st1             {v2.8h, v3.8h}, [x0], #32
    st1             {v4.d}[0], [x0], x1
    st1             {v5.8h, v6.8h}, [x0], #32
    st1             {v4.d}[1], [x0], x1

    b.gt            1b
    ret
endfunc

function mc_weight_w16_\name\()_neon, export=1
    weight_simple_prologue
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    subs            w5, w5, #2
    ld1             {v16.8h, v17.8h}, [x2], x3
    ld1             {v18.8h, v19.8h}, [x2], x3

    \op             v16.8h, v16.8h, v1.8h
    \op             v17.8h, v17.8h, v1.8h
    \op             v18.8h, v18.8h, v1.8h
    \op             v19.8h, v19.8h, v1.8h

    mvni            v28.8h, #0xfc, lsl #8

    umin            v16.8h, v16.8h, v28.8h
    umin            v17.8h, v17.8h, v28.8h
    umin            v18.8h, v18.8h, v28.8h
    umin            v19.8h, v19.8h, v28.8h

    st1             {v16.8h, v17.8h}, [x0], x1
    st1             {v18.8h, v19.8h}, [x0], x1
    b.gt            1b
    ret
endfunc

function mc_weight_w8_\name\()_neon, export=1
    weight_simple_prologue
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    subs            w5, w5, #2
    ld1             {v16.8h}, [x2], x3
    ld1             {v17.8h}, [x2], x3
    \op             v16.8h, v16.8h, v1.8h
    \op             v17.8h, v17.8h, v1.8h

    mvni            v28.8h, #0xfc, lsl #8

    umin            v16.8h, v16.8h, v28.8h
    umin            v17.8h, v17.8h, v28.8h

    st1             {v16.8h}, [x0], x1
    st1             {v17.8h}, [x0], x1
    b.gt            1b
    ret
endfunc

function mc_weight_w4_\name\()_neon, export=1
    weight_simple_prologue
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    subs            w5, w5, #2
    ld1             {v16.d}[0], [x2], x3
    ld1             {v16.d}[1], [x2], x3
    \op             v16.8h, v16.8h, v1.8h
    mvni            v28.8h, #0xfc, lsl #8

    umin            v16.8h, v16.8h, v28.8h

    st1             {v16.d}[0], [x0], x1
    st1             {v16.d}[1], [x0], x1
    b.gt            1b
    ret
endfunc
.endm

weight_simple offsetadd, uqadd
weight_simple offsetsub, uqsub

// void mc_chroma( pixel *dst_u, pixel *dst_v,
//                 intptr_t i_dst_stride,
//                 pixel *src, intptr_t i_src_stride,
//                 int dx, int dy, int i_width, int i_height );
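// A sketch of the bilinear interpolation every width variant implements
// (src2 = src + i_src_stride; the weight names match the comments in
// CHROMA_MC_START below):
//     cA = (8-d8x)*(8-d8y);  cB = d8x*(8-d8y);
//     cC = (8-d8x)*d8y;      cD = d8x*d8y;
//     dst[x] = ( cA*src[x]  + cB*src[x+1]
//              + cC*src2[x] + cD*src2[x+1] + 32 ) >> 6;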
function mc_chroma_neon, export=1
    ldr             w15, [sp]               // height
    sbfx            x12, x6, #3, #29        // asr(3) and sign extend
    sbfx            x11, x5, #3, #29        // asr(3) and sign extend
    cmp             w7, #4
    lsl             x4, x4, #1
    mul             x12, x12, x4
    add             x3, x3, x11, lsl #2

    and             w5, w5, #7
    and             w6, w6, #7

    add             x3, x3, x12

    b.gt            mc_chroma_w8_neon
    b.eq            mc_chroma_w4_neon
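    // otherwise w == 2: fall through to mc_chroma_w2_neon below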
endfunc

.macro CHROMA_MC_START r00, r01, r10, r11
    mul             w12, w5, w6             // cD = d8x * d8y
    lsl             w13, w5, #3
    add             w9, w12, #64
    lsl             w14, w6, #3
    tst             w12, w12
    sub             w9, w9, w13
    sub             w10, w13, w12           // cB = d8x * (8-d8y)
    sub             w11, w14, w12           // cC = (8-d8x) * d8y
    sub             w9, w9, w14             // cA = (8-d8x) * (8-d8y)
.endm

.macro CHROMA_MC width, vsize
function mc_chroma_w\width\()_neon
    lsl             x2, x2, #1
// since the element size varies, there's a different index for the 2nd store
.if \width == 4
    .set            idx2, 1
.else
    .set            idx2, 2
.endif
    CHROMA_MC_START
    b.eq            2f

    ld2             {v28.8h, v29.8h}, [x3], x4
    dup             v0.8h, w9               // cA
    dup             v1.8h, w10              // cB

    ext             v6.16b, v28.16b, v28.16b, #2
    ext             v7.16b, v29.16b, v29.16b, #2

    ld2             {v30.8h, v31.8h}, [x3], x4
    dup             v2.8h, w11              // cC
    dup             v3.8h, w12              // cD

    ext             v22.16b, v30.16b, v30.16b, #2
    ext             v23.16b, v31.16b, v31.16b, #2

    trn1            v0.2d, v0.2d, v1.2d
    trn1            v2.2d, v2.2d, v3.2d

    trn1            v4.2d, v28.2d, v6.2d
    trn1            v5.2d, v29.2d, v7.2d
    trn1            v20.2d, v30.2d, v22.2d
    trn1            v21.2d, v31.2d, v23.2d
1:  // height loop, interpolate xy
    subs            w15, w15, #2

    mul             v16.8h, v4.8h, v0.8h
    mul             v17.8h, v5.8h, v0.8h
    mla             v16.8h, v20.8h, v2.8h
    mla             v17.8h, v21.8h, v2.8h

    ld2             {v28.8h, v29.8h}, [x3], x4
    transpose       v24.2d, v25.2d, v16.2d, v17.2d

    ext             v6.16b, v28.16b, v28.16b, #2
    ext             v7.16b, v29.16b, v29.16b, #2
    trn1            v4.2d, v28.2d, v6.2d
    trn1            v5.2d, v29.2d, v7.2d

    add             v16.8h, v24.8h, v25.8h
    urshr           v16.8h, v16.8h, #6

    mul             v18.8h, v20.8h, v0.8h
    mul             v19.8h, v21.8h, v0.8h
    mla             v18.8h, v4.8h, v2.8h
    mla             v19.8h, v5.8h, v2.8h

    ld2             {v30.8h, v31.8h}, [x3], x4

    transpose       v26.2d, v27.2d, v18.2d, v19.2d
    add             v18.8h, v26.8h, v27.8h
    urshr           v18.8h, v18.8h, #6

    ext             v22.16b, v30.16b, v30.16b, #2
    ext             v23.16b, v31.16b, v31.16b, #2
    trn1            v20.2d, v30.2d, v22.2d
    trn1            v21.2d, v31.2d, v23.2d

    st1             {v16.\vsize}[0], [x0], x2
    st1             {v16.\vsize}[idx2], [x1], x2
    st1             {v18.\vsize}[0], [x0], x2
    st1             {v18.\vsize}[idx2], [x1], x2
    b.gt            1b

    ret
2:  // dx or dy are 0
    tst             w11, w11
    add             w10, w10, w11
    dup             v0.8h, w9
    dup             v1.8h, w10

    b.eq            4f

    ld1             {v4.8h}, [x3], x4
    ld1             {v6.8h}, [x3], x4
3:  // vertical interpolation loop
    subs            w15, w15, #2

    mul             v16.8h, v4.8h, v0.8h
    mla             v16.8h, v6.8h, v1.8h
    ld1             {v4.8h}, [x3], x4
    mul             v17.8h, v6.8h, v0.8h
    mla             v17.8h, v4.8h, v1.8h
    ld1             {v6.8h}, [x3], x4

    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    uzp1            v18.8h, v16.8h, v17.8h  // v18 = uuuuuuuu
    uzp2            v19.8h, v16.8h, v17.8h  // v19 = vvvvvvvv

    st1             {v18.\vsize}[0], [x0], x2
    st1             {v18.\vsize}[idx2], [x0], x2
    st1             {v19.\vsize}[0], [x1], x2
    st1             {v19.\vsize}[idx2], [x1], x2
    b.gt            3b

    ret

4:  // dy is 0
    ld1             {v4.8h, v5.8h}, [x3], x4
    ld1             {v6.8h, v7.8h}, [x3], x4

    ext             v5.16b, v4.16b, v5.16b, #4
    ext             v7.16b, v6.16b, v7.16b, #4
5:  // horizontal interpolation loop
    subs            w15, w15, #2

    mul             v16.8h, v4.8h, v0.8h
    mla             v16.8h, v5.8h, v1.8h
    mul             v17.8h, v6.8h, v0.8h
    mla             v17.8h, v7.8h, v1.8h

    ld1             {v4.8h, v5.8h}, [x3], x4
    ld1             {v6.8h, v7.8h}, [x3], x4

    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    ext             v5.16b, v4.16b, v5.16b, #4
    ext             v7.16b, v6.16b, v7.16b, #4
    uzp1            v18.8h, v16.8h, v17.8h  // v18 = uuuuuuuu
    uzp2            v19.8h, v16.8h, v17.8h  // v19 = vvvvvvvv

    st1             {v18.\vsize}[0], [x0], x2
    st1             {v18.\vsize}[idx2], [x0], x2
    st1             {v19.\vsize}[0], [x1], x2
    st1             {v19.\vsize}[idx2], [x1], x2
    b.gt            5b

    ret
endfunc
.endm

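// w2 stores 32-bit lanes (two 16-bit pixels per row), w4 stores 64-bit lanes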
CHROMA_MC 2, s
CHROMA_MC 4, d

function mc_chroma_w8_neon
    lsl             x2, x2, #1
    CHROMA_MC_START

    b.eq            2f
    sub             x4, x4, #32
    ld2             {v4.8h, v5.8h}, [x3], #32
    ld2             {v6.8h, v7.8h}, [x3], x4

    ld2             {v20.8h, v21.8h}, [x3], #32
    ld2             {v22.8h, v23.8h}, [x3], x4

    dup             v0.8h, w9               // cA
    dup             v1.8h, w10              // cB

    ext             v24.16b, v4.16b, v6.16b, #2
    ext             v26.16b, v6.16b, v4.16b, #2
    ext             v28.16b, v20.16b, v22.16b, #2
    ext             v30.16b, v22.16b, v20.16b, #2

    ext             v25.16b, v5.16b, v7.16b, #2
    ext             v27.16b, v7.16b, v5.16b, #2
    ext             v29.16b, v21.16b, v23.16b, #2
    ext             v31.16b, v23.16b, v21.16b, #2

    dup             v2.8h, w11              // cC
    dup             v3.8h, w12              // cD

1:  // height loop, interpolate xy
    subs            w15, w15, #2

    mul             v16.8h, v4.8h, v0.8h
    mul             v17.8h, v5.8h, v0.8h
    mla             v16.8h, v24.8h, v1.8h
    mla             v17.8h, v25.8h, v1.8h
    mla             v16.8h, v20.8h, v2.8h
    mla             v17.8h, v21.8h, v2.8h
    mla             v16.8h, v28.8h, v3.8h
    mla             v17.8h, v29.8h, v3.8h

    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    st1             {v16.8h}, [x0], x2
    st1             {v17.8h}, [x1], x2

    ld2             {v4.8h, v5.8h}, [x3], #32
    ld2             {v6.8h, v7.8h}, [x3], x4

    mul             v16.8h, v20.8h, v0.8h
    mul             v17.8h, v21.8h, v0.8h
    ext             v24.16b, v4.16b, v6.16b, #2
    ext             v26.16b, v6.16b, v4.16b, #2
    mla             v16.8h, v28.8h, v1.8h
    mla             v17.8h, v29.8h, v1.8h
    ext             v25.16b, v5.16b, v7.16b, #2
    ext             v27.16b, v7.16b, v5.16b, #2
    mla             v16.8h, v4.8h, v2.8h
    mla             v17.8h, v5.8h, v2.8h
    mla             v16.8h, v24.8h, v3.8h
    mla             v17.8h, v25.8h, v3.8h

    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    ld2             {v20.8h, v21.8h}, [x3], #32
    ld2             {v22.8h, v23.8h}, [x3], x4
    ext             v28.16b, v20.16b, v22.16b, #2
    ext             v30.16b, v22.16b, v20.16b, #2
    ext             v29.16b, v21.16b, v23.16b, #2
    ext             v31.16b, v23.16b, v21.16b, #2

    st1             {v16.8h}, [x0], x2
    st1             {v17.8h}, [x1], x2
    b.gt            1b

    ret
2:  // dx or dy are 0
    tst             w11, w11
    add             w10, w10, w11
    dup             v0.8h, w9
    dup             v1.8h, w10

    b.eq            4f

    ld2             {v4.8h, v5.8h}, [x3], x4
    ld2             {v6.8h, v7.8h}, [x3], x4
3:  // vertical interpolation loop
    subs            w15, w15, #2

    mul             v16.8h, v4.8h, v0.8h
    mul             v17.8h, v5.8h, v0.8h
    mla             v16.8h, v6.8h, v1.8h
    mla             v17.8h, v7.8h, v1.8h
    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    st1             {v16.8h}, [x0], x2
    st1             {v17.8h}, [x1], x2

    ld2             {v4.8h, v5.8h}, [x3], x4

    mul             v16.8h, v6.8h, v0.8h
    mul             v17.8h, v7.8h, v0.8h
    ld2             {v6.8h, v7.8h}, [x3], x4
    mla             v16.8h, v4.8h, v1.8h
    mla             v17.8h, v5.8h, v1.8h
    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    st1             {v16.8h}, [x0], x2
    st1             {v17.8h}, [x1], x2
    b.gt            3b

    ret
4:  // dy is 0
    sub             x4, x4, #32

    ld2             {v4.8h, v5.8h}, [x3], #32
    ld2             {v6.8h, v7.8h}, [x3], x4
    ext             v24.16b, v4.16b, v6.16b, #2
    ext             v26.16b, v6.16b, v4.16b, #2
    ld2             {v20.8h, v21.8h}, [x3], #32
    ld2             {v22.8h, v23.8h}, [x3], x4
    ext             v28.16b, v20.16b, v22.16b, #2
    ext             v30.16b, v22.16b, v20.16b, #2

    ext             v25.16b, v5.16b, v7.16b, #2
    ext             v27.16b, v7.16b, v5.16b, #2
    ext             v29.16b, v21.16b, v23.16b, #2
    ext             v31.16b, v23.16b, v21.16b, #2

5:  // horizontal interpolation loop
    subs            w15, w15, #2

    mul             v16.8h, v4.8h, v0.8h
    mul             v17.8h, v5.8h, v0.8h
    mla             v16.8h, v24.8h, v1.8h
    mla             v17.8h, v25.8h, v1.8h

    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    st1             {v16.8h}, [x0], x2
    st1             {v17.8h}, [x1], x2

    mul             v16.8h, v20.8h, v0.8h
    mul             v17.8h, v21.8h, v0.8h
    ld2             {v4.8h, v5.8h}, [x3], #32
    ld2             {v6.8h, v7.8h}, [x3], x4
    mla             v16.8h, v28.8h, v1.8h
    mla             v17.8h, v29.8h, v1.8h
    ld2             {v20.8h, v21.8h}, [x3], #32
    ld2             {v22.8h, v23.8h}, [x3], x4

    urshr           v16.8h, v16.8h, #6
    urshr           v17.8h, v17.8h, #6

    ext             v24.16b, v4.16b, v6.16b, #2
    ext             v26.16b, v6.16b, v4.16b, #2
    ext             v28.16b, v20.16b, v22.16b, #2
    ext             v30.16b, v22.16b, v20.16b, #2
    ext             v29.16b, v21.16b, v23.16b, #2
    ext             v31.16b, v23.16b, v21.16b, #2
    ext             v25.16b, v5.16b, v7.16b, #2
    ext             v27.16b, v7.16b, v5.16b, #2

    st1             {v16.8h}, [x0], x2
    st1             {v17.8h}, [x1], x2
    b.gt            5b

    ret
endfunc

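// integral4h: one row of the 4-wide horizontal integral; roughly
//     sum4h[x] = src[x] + src[x+1] + src[x+2] + src[x+3] + sum4h_above[x]
// where v5 holds the row above, loaded by the caller.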
.macro integral4h p1, p2
    ext             v1.16b, \p1\().16b, \p2\().16b, #2
    ext             v2.16b, \p1\().16b, \p2\().16b, #4
    ext             v3.16b, \p1\().16b, \p2\().16b, #6
    add             v0.8h, \p1\().8h, v1.8h
    add             v4.8h, v2.8h, v3.8h
    add             v0.8h, v0.8h, v4.8h
    add             v0.8h, v0.8h, v5.8h
.endm

function integral_init4h_neon, export=1
    sub             x3, x0, x2, lsl #1
    lsl             x2, x2, #1
    ld1             {v6.8h, v7.8h}, [x1], #32
1:
    subs            x2, x2, #32
    ld1             {v5.8h}, [x3], #16
    integral4h      v6, v7
    ld1             {v6.8h}, [x1], #16
    ld1             {v5.8h}, [x3], #16
    st1             {v0.8h}, [x0], #16
    integral4h      v7, v6
    ld1             {v7.8h}, [x1], #16
    st1             {v0.8h}, [x0], #16
    b.gt            1b
    ret
endfunc

.macro integral8h p1, p2, s
    ext             v1.16b, \p1\().16b, \p2\().16b, #2
    ext             v2.16b, \p1\().16b, \p2\().16b, #4
    ext             v3.16b, \p1\().16b, \p2\().16b, #6
    ext             v4.16b, \p1\().16b, \p2\().16b, #8
    ext             v5.16b, \p1\().16b, \p2\().16b, #10
    ext             v6.16b, \p1\().16b, \p2\().16b, #12
    ext             v7.16b, \p1\().16b, \p2\().16b, #14
    add             v0.8h, \p1\().8h, v1.8h
    add             v2.8h, v2.8h, v3.8h
    add             v4.8h, v4.8h, v5.8h
    add             v6.8h, v6.8h, v7.8h
    add             v0.8h, v0.8h, v2.8h
    add             v4.8h, v4.8h, v6.8h
    add             v0.8h, v0.8h, v4.8h
    add             v0.8h, v0.8h, \s\().8h
.endm

function integral_init8h_neon, export=1
    sub             x3, x0, x2, lsl #1
    lsl             x2, x2, #1

    ld1             {v16.8h, v17.8h}, [x1], #32
1:
    subs            x2, x2, #32
    ld1             {v18.8h}, [x3], #16
    integral8h      v16, v17, v18
    ld1             {v16.8h}, [x1], #16
    ld1             {v18.8h}, [x3], #16
    st1             {v0.8h}, [x0], #16
    integral8h      v17, v16, v18
    ld1             {v17.8h}, [x1], #16
    st1             {v0.8h}, [x0], #16
    b.gt            1b
    ret
endfunc

function integral_init4v_neon, export=1
    mov             x3, x0
    add             x4, x0, x2, lsl #3
    add             x8, x0, x2, lsl #4
    lsl             x2, x2, #1
    sub             x2, x2, #16
    ld1             {v20.8h, v21.8h, v22.8h}, [x3], #48
    ld1             {v16.8h, v17.8h, v18.8h}, [x8], #48
1:
    subs            x2, x2, #32
    ld1             {v24.8h, v25.8h}, [x4], #32
    ext             v0.16b, v20.16b, v21.16b, #8
    ext             v1.16b, v21.16b, v22.16b, #8
    ext             v2.16b, v16.16b, v17.16b, #8
    ext             v3.16b, v17.16b, v18.16b, #8
    sub             v24.8h, v24.8h, v20.8h
    sub             v25.8h, v25.8h, v21.8h
    add             v0.8h, v0.8h, v20.8h
    add             v1.8h, v1.8h, v21.8h
    add             v2.8h, v2.8h, v16.8h
    add             v3.8h, v3.8h, v17.8h
    st1             {v24.8h}, [x1], #16
    st1             {v25.8h}, [x1], #16
    mov             v20.16b, v22.16b
    mov             v16.16b, v18.16b
    sub             v0.8h, v2.8h, v0.8h
    sub             v1.8h, v3.8h, v1.8h
    ld1             {v21.8h, v22.8h}, [x3], #32
    ld1             {v17.8h, v18.8h}, [x8], #32
    st1             {v0.8h}, [x0], #16
    st1             {v1.8h}, [x0], #16
    b.gt            1b
2:
    ret
endfunc

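// The 8-row vertical sums are a plain difference of horizontal-integral
// rows eight lines apart; roughly:
//     sum8v[x] = sum8h[x + 8*stride] - sum8h[x]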
function integral_init8v_neon, export=1
    add             x2, x0, x1, lsl #4
    sub             x1, x1, #8
    ands            x3, x1, #16 - 1
    b.eq            1f
    subs            x1, x1, #8
    ld1             {v0.8h}, [x0]
    ld1             {v2.8h}, [x2], #16
    sub             v4.8h, v2.8h, v0.8h
    st1             {v4.8h}, [x0], #16
    b.le            2f
1:
    subs            x1, x1, #16
    ld1             {v0.8h, v1.8h}, [x0]
    ld1             {v2.8h, v3.8h}, [x2], #32
    sub             v4.8h, v2.8h, v0.8h
    sub             v5.8h, v3.8h, v1.8h
    st1             {v4.8h}, [x0], #16
    st1             {v5.8h}, [x0], #16
    b.gt            1b
2:
    ret
endfunc

// frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth,
//                         pixel *dstv, pixel *dstc, intptr_t src_stride,
//                         intptr_t dst_stride, int width, int height )
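// Each output plane is a half-resolution plane at one of the four half-pel
// offsets, built from rounding averages; roughly, with src1 = src0 + stride:
//     dst0[x] = avg( avg(src0[2x], src1[2x]), avg(src0[2x+1], src1[2x+1]) )
// dsth/dstv/dstc shift the 2x2 window right/down/both by one input pixel.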
function frame_init_lowres_core_neon, export=1
    ldr             w8, [sp]
    lsl             x5, x5, #1
    sub             x10, x6, w7, uxtw       // dst_stride - width
    lsl             x10, x10, #1
    and             x10, x10, #~31

    stp             d8, d9, [sp, #-0x40]!
    stp             d10, d11, [sp, #0x10]
    stp             d12, d13, [sp, #0x20]
    stp             d14, d15, [sp, #0x30]

1:
    mov             w9, w7                  // width
    mov             x11, x0                 // src0
    add             x12, x0, x5             // src1 = src0 + src_stride
    add             x13, x0, x5, lsl #1     // src2 = src1 + src_stride

    ld2             {v0.8h, v1.8h}, [x11], #32
    ld2             {v2.8h, v3.8h}, [x11], #32
    ld2             {v4.8h, v5.8h}, [x12], #32
    ld2             {v6.8h, v7.8h}, [x12], #32
    ld2             {v28.8h, v29.8h}, [x13], #32
    ld2             {v30.8h, v31.8h}, [x13], #32

    urhadd          v20.8h, v0.8h, v4.8h
    urhadd          v21.8h, v2.8h, v6.8h
    urhadd          v22.8h, v4.8h, v28.8h
    urhadd          v23.8h, v6.8h, v30.8h
2:
    subs            w9, w9, #16

    urhadd          v24.8h, v1.8h, v5.8h
    urhadd          v25.8h, v3.8h, v7.8h
    urhadd          v26.8h, v5.8h, v29.8h
    urhadd          v27.8h, v7.8h, v31.8h

    ld2             {v0.8h, v1.8h}, [x11], #32
    ld2             {v2.8h, v3.8h}, [x11], #32
    ld2             {v4.8h, v5.8h}, [x12], #32
    ld2             {v6.8h, v7.8h}, [x12], #32
    ld2             {v28.8h, v29.8h}, [x13], #32
    ld2             {v30.8h, v31.8h}, [x13], #32

    urhadd          v16.8h, v0.8h, v4.8h
    urhadd          v17.8h, v2.8h, v6.8h
    urhadd          v18.8h, v4.8h, v28.8h
    urhadd          v19.8h, v6.8h, v30.8h

    ext             v8.16b, v20.16b, v21.16b, #2
    ext             v9.16b, v21.16b, v16.16b, #2
    ext             v10.16b, v22.16b, v23.16b, #2
    ext             v11.16b, v23.16b, v18.16b, #2

    urhadd          v12.8h, v20.8h, v24.8h
    urhadd          v8.8h, v24.8h, v8.8h

    urhadd          v24.8h, v21.8h, v25.8h
    urhadd          v22.8h, v22.8h, v26.8h
    urhadd          v10.8h, v26.8h, v10.8h
    urhadd          v26.8h, v23.8h, v27.8h
    urhadd          v9.8h, v25.8h, v9.8h
    urhadd          v11.8h, v27.8h, v11.8h

    st1             {v12.8h}, [x1], #16
    st1             {v24.8h}, [x1], #16
    st1             {v22.8h}, [x3], #16
    st1             {v26.8h}, [x3], #16
    st1             {v8.8h, v9.8h}, [x2], #32
    st1             {v10.8h, v11.8h}, [x4], #32

    b.le            3f

    subs            w9, w9, #16

    urhadd          v24.8h, v1.8h, v5.8h
    urhadd          v25.8h, v3.8h, v7.8h
    urhadd          v26.8h, v5.8h, v29.8h
    urhadd          v27.8h, v7.8h, v31.8h

    ld2             {v0.8h, v1.8h}, [x11], #32
    ld2             {v2.8h, v3.8h}, [x11], #32
    ld2             {v4.8h, v5.8h}, [x12], #32
    ld2             {v6.8h, v7.8h}, [x12], #32
    ld2             {v28.8h, v29.8h}, [x13], #32
    ld2             {v30.8h, v31.8h}, [x13], #32

    urhadd          v20.8h, v0.8h, v4.8h
    urhadd          v21.8h, v2.8h, v6.8h
    urhadd          v22.8h, v4.8h, v28.8h
    urhadd          v23.8h, v6.8h, v30.8h

    ext             v8.16b, v16.16b, v17.16b, #2
    ext             v9.16b, v17.16b, v20.16b, #2
    ext             v10.16b, v18.16b, v19.16b, #2
    ext             v11.16b, v19.16b, v22.16b, #2

    urhadd          v12.8h, v16.8h, v24.8h
    urhadd          v13.8h, v17.8h, v25.8h

    urhadd          v14.8h, v18.8h, v26.8h
    urhadd          v15.8h, v19.8h, v27.8h

    urhadd          v16.8h, v24.8h, v8.8h
    urhadd          v17.8h, v25.8h, v9.8h

    urhadd          v18.8h, v26.8h, v10.8h
    urhadd          v19.8h, v27.8h, v11.8h

    st1             {v12.8h, v13.8h}, [x1], #32
    st1             {v14.8h, v15.8h}, [x3], #32
    st1             {v16.8h, v17.8h}, [x2], #32
    st1             {v18.8h, v19.8h}, [x4], #32
    b.gt            2b
3:
    subs            w8, w8, #1
    add             x0, x0, x5, lsl #1
    add             x1, x1, x10
    add             x2, x2, x10
    add             x3, x3, x10
    add             x4, x4, x10
    b.gt            1b

    ldp             d8, d9, [sp]
    ldp             d10, d11, [sp, #0x10]
    ldp             d12, d13, [sp, #0x20]
    ldp             d14, d15, [sp, #0x30]

    add             sp, sp, #0x40

    ret
endfunc

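// Split interleaved CbCr into planar u/v rows in the encoder's
// fixed-stride fenc/fdec buffers: each ld2 deinterleaves one row and the
// stores below alternate u row / v row in steps of x4 bytes.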
function load_deinterleave_chroma_fenc_neon, export=1
    mov             x4, #FENC_STRIDE/2
    lsl             x4, x4, #1
    lsl             x2, x2, #1
    b               load_deinterleave_chroma
endfunc

function load_deinterleave_chroma_fdec_neon, export=1
    mov             x4, #FDEC_STRIDE/2
    lsl             x4, x4, #1
    lsl             x2, x2, #1
load_deinterleave_chroma:
    ld2             {v0.8h, v1.8h}, [x1], x2
    ld2             {v2.8h, v3.8h}, [x1], x2
    subs            w3, w3, #2
    st1             {v0.8h}, [x0], x4
    st1             {v1.8h}, [x0], x4
    st1             {v2.8h}, [x0], x4
    st1             {v3.8h}, [x0], x4
    b.gt            load_deinterleave_chroma

    ret
endfunc

function store_interleave_chroma_neon, export=1
    mov             x5, #FDEC_STRIDE
    lsl             x5, x5, #1
    lsl             x1, x1, #1
1:
    ld1             {v0.8h}, [x2], x5
    ld1             {v1.8h}, [x3], x5
    ld1             {v2.8h}, [x2], x5
    ld1             {v3.8h}, [x3], x5
    subs            w4, w4, #2
    zip1            v4.8h, v0.8h, v1.8h
    zip1            v6.8h, v2.8h, v3.8h
    zip2            v5.8h, v0.8h, v1.8h
    zip2            v7.8h, v2.8h, v3.8h

    st1             {v4.8h, v5.8h}, [x0], x1
    st1             {v6.8h, v7.8h}, [x0], x1
    b.gt            1b

    ret
endfunc

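// _core: the width is rounded up to a multiple of 32 pixels, so each row
// may be over-read and over-written by up to 31 pixels; the C-level
// wrapper is presumably responsible for providing enough padding.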
function plane_copy_core_neon, export=1
    add             w8, w4, #31             // a 32-bit write clears the upper 32 bits of the register
    and             w4, w8, #~31
    // safe to use the full register since a negative width makes no sense
    sub             x1, x1, x4
    sub             x3, x3, x4
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    mov             w8, w4
16:
    tst             w8, #16
    b.eq            32f
    subs            w8, w8, #16
    ldp             q0, q1, [x2], #32
    stp             q0, q1, [x0], #32
    b.eq            0f
32:
    subs            w8, w8, #32
    ldp             q0, q1, [x2], #32
    ldp             q2, q3, [x2], #32
    stp             q0, q1, [x0], #32
    stp             q2, q3, [x0], #32
    b.gt            32b
0:
    subs            w5, w5, #1
    add             x2, x2, x3
    add             x0, x0, x1
    b.gt            1b

    ret
endfunc

function plane_copy_swap_core_neon, export=1
    lsl             w4, w4, #1
    add             w8, w4, #31             // a 32-bit write clears the upper 32 bits of the register
    and             w4, w8, #~31
    sub             x1, x1, x4
    sub             x3, x3, x4
    lsl             x1, x1, #1
    lsl             x3, x3, #1
1:
    mov             w8, w4
    tbz             w4, #4, 32f
    subs            w8, w8, #16
    ld1             {v0.8h, v1.8h}, [x2], #32
    rev32           v0.8h, v0.8h
    rev32           v1.8h, v1.8h
    st1             {v0.8h, v1.8h}, [x0], #32
    b.eq            0f
32:
    subs            w8, w8, #32
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
    rev32           v20.8h, v0.8h
    rev32           v21.8h, v1.8h
    rev32           v22.8h, v2.8h
    rev32           v23.8h, v3.8h
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
    b.gt            32b
0:
    subs            w5, w5, #1
    add             x2, x2, x3
    add             x0, x0, x1
    b.gt            1b

    ret
endfunc

function plane_copy_deinterleave_neon, export=1
    add             w9, w6, #15
    and             w9, w9, #~15
    sub             x1, x1, x9
    sub             x3, x3, x9
    sub             x5, x5, x9, lsl #1
    lsl             x1, x1, #1
    lsl             x3, x3, #1
    lsl             x5, x5, #1
1:
    ld2             {v0.8h, v1.8h}, [x4], #32
    ld2             {v2.8h, v3.8h}, [x4], #32
    subs            w9, w9, #16
    st1             {v0.8h}, [x0], #16
    st1             {v2.8h}, [x0], #16
    st1             {v1.8h}, [x2], #16
    st1             {v3.8h}, [x2], #16
    b.gt            1b

    add             x4, x4, x5
    subs            w7, w7, #1
    add             x0, x0, x1
    add             x2, x2, x3
    mov             w9, w6
    b.gt            1b

    ret
endfunc

function plane_copy_interleave_core_neon, export=1
    add             w9, w6, #15
    and             w9, w9, #0xfffffff0
    sub             x1, x1, x9, lsl #1
    sub             x3, x3, x9
    sub             x5, x5, x9
    lsl             x1, x1, #1
    lsl             x3, x3, #1
    lsl             x5, x5, #1
1:
    ld1             {v0.8h}, [x2], #16
    ld1             {v1.8h}, [x4], #16
    ld1             {v2.8h}, [x2], #16
    ld1             {v3.8h}, [x4], #16
    subs            w9, w9, #16
    st2             {v0.8h, v1.8h}, [x0], #32
    st2             {v2.8h, v3.8h}, [x0], #32
    b.gt            1b

    subs            w7, w7, #1
    add             x0, x0, x1
    add             x2, x2, x3
    add             x4, x4, x5
    mov             w9, w6
    b.gt            1b

    ret
endfunc

.macro deinterleave_rgb
    subs            x11, x11, #8
    st1             {v0.8h}, [x0], #16
    st1             {v1.8h}, [x2], #16
    st1             {v2.8h}, [x4], #16
    b.gt            1b

    subs            w10, w10, #1
    add             x0, x0, x1
    add             x2, x2, x3
    add             x4, x4, x5
    add             x6, x6, x7
    mov             x11, x9
    b.gt            1b
.endm

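// Apple's arm64 ABI packs 32-bit stack arguments at 4-byte offsets, hence
// the narrower loads below; elsewhere each argument takes an 8-byte slot.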
function plane_copy_deinterleave_rgb_neon, export=1
#if SYS_MACOSX
    ldr             w8, [sp]
    ldp             w9, w10, [sp, #4]
#else
    ldr             x8, [sp]
    ldp             x9, x10, [sp, #8]
#endif
    cmp             w8, #3
    uxtw            x9, w9
    add             x11, x9, #7
    and             x11, x11, #~7
    sub             x1, x1, x11
    sub             x3, x3, x11
    sub             x5, x5, x11
    lsl             x1, x1, #1
    lsl             x3, x3, #1
    lsl             x5, x5, #1
    b.ne            4f
    sub             x7, x7, x11, lsl #1
    sub             x7, x7, x11
    lsl             x7, x7, #1
1:
    ld3             {v0.8h, v1.8h, v2.8h}, [x6], #48
    deinterleave_rgb

    ret
4:
    sub             x7, x7, x11, lsl #2
    lsl             x7, x7, #1
1:
    ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    deinterleave_rgb

    ret
endfunc

// void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
//                   intptr_t stride, int width, int height, int16_t *buf )
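// The 6-tap H.264 half-pel filter with coefficients (1,-5,20,20,-5,1).
// dsth/dstv are a single filter pass with a rounding shift by 5. dstc
// refilters the 32-bit vertical intermediates horizontally using the
// usual x264 shift trick, roughly
//     dstc[x] = ( ((((a-b) >> 2) - (b-c)) >> 2) + c + 32 ) >> 6
// where a/b/c are the outer/middle/inner tap-pair sums, which keeps the
// intermediates narrow and the final rounding shift at 6 instead of 10.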
function hpel_filter_neon, export=1
    lsl             x5, x5, #1
    ubfm            x9, x3, #3, #7
    add             w15, w5, w9
    sub             x13, x3, x9             // align src
    sub             x10, x0, x9
    sub             x11, x1, x9
    sub             x12, x2, x9
    movi            v30.8h, #5
    movi            v31.8h, #20

    lsl             x4, x4, #1
    stp             d8, d9, [sp, #-0x40]!
    stp             d10, d11, [sp, #0x10]
    stp             d12, d13, [sp, #0x20]
    stp             d14, d15, [sp, #0x30]

    str             q0, [sp, #-0x50]!

1:  // line start
    mov             x3, x13
    mov             x2, x12
    mov             x1, x11
    mov             x0, x10
    add             x7, x3, #32             // src pointer next 16b for horiz filter
    mov             x5, x15                 // restore width
    sub             x3, x3, x4, lsl #1      // src - 2*stride
    ld1             {v28.8h, v29.8h}, [x7], #32 // src[16:31]
    add             x9, x3, x5              // holds src - 2*stride + width

    ld1             {v8.8h, v9.8h}, [x3], x4    // src-2*stride[0:15]
    ld1             {v10.8h, v11.8h}, [x3], x4  // src-1*stride[0:15]
    ld1             {v12.8h, v13.8h}, [x3], x4  // src-0*stride[0:15]
    ld1             {v14.8h, v15.8h}, [x3], x4  // src+1*stride[0:15]
    ld1             {v16.8h, v17.8h}, [x3], x4  // src+2*stride[0:15]
    ld1             {v18.8h, v19.8h}, [x3], x4  // src+3*stride[0:15]

    ext             v22.16b, v7.16b, v12.16b, #12
    ext             v23.16b, v12.16b, v13.16b, #12
    uaddl           v1.4s, v8.4h, v18.4h
    uaddl2          v20.4s, v8.8h, v18.8h
    ext             v24.16b, v12.16b, v13.16b, #6
    ext             v25.16b, v13.16b, v28.16b, #6
    umlsl           v1.4s, v10.4h, v30.4h
    umlsl2          v20.4s, v10.8h, v30.8h
    ext             v26.16b, v7.16b, v12.16b, #14
    ext             v27.16b, v12.16b, v13.16b, #14
    umlal           v1.4s, v12.4h, v31.4h
    umlal2          v20.4s, v12.8h, v31.8h
    ext             v3.16b, v12.16b, v13.16b, #2
    ext             v4.16b, v13.16b, v28.16b, #2
    umlal           v1.4s, v14.4h, v31.4h
    umlal2          v20.4s, v14.8h, v31.8h
    ext             v21.16b, v12.16b, v13.16b, #4
    ext             v5.16b, v13.16b, v28.16b, #4
    umlsl           v1.4s, v16.4h, v30.4h
    umlsl2          v20.4s, v16.8h, v30.8h

2:  // next 16 pixels of the line
    subs            x5, x5, #32
    sub             x3, x9, x5              // src - 2*stride += 16

    uaddl           v8.4s, v22.4h, v24.4h
    uaddl2          v22.4s, v22.8h, v24.8h
    uaddl           v10.4s, v23.4h, v25.4h
    uaddl2          v23.4s, v23.8h, v25.8h

    umlsl           v8.4s, v26.4h, v30.4h
    umlsl2          v22.4s, v26.8h, v30.8h
    umlsl           v10.4s, v27.4h, v30.4h
    umlsl2          v23.4s, v27.8h, v30.8h

    umlal           v8.4s, v12.4h, v31.4h
    umlal2          v22.4s, v12.8h, v31.8h
    umlal           v10.4s, v13.4h, v31.4h
    umlal2          v23.4s, v13.8h, v31.8h

    umlal           v8.4s, v3.4h, v31.4h
    umlal2          v22.4s, v3.8h, v31.8h
    umlal           v10.4s, v4.4h, v31.4h
    umlal2          v23.4s, v4.8h, v31.8h

    umlsl           v8.4s, v21.4h, v30.4h
    umlsl2          v22.4s, v21.8h, v30.8h
    umlsl           v10.4s, v5.4h, v30.4h
    umlsl2          v23.4s, v5.8h, v30.8h

    uaddl           v5.4s, v9.4h, v19.4h
    uaddl2          v2.4s, v9.8h, v19.8h

    sqrshrun        v8.4h, v8.4s, #5
    sqrshrun2       v8.8h, v22.4s, #5
    sqrshrun        v10.4h, v10.4s, #5
    sqrshrun2       v10.8h, v23.4s, #5

    mov             v6.16b, v12.16b
    mov             v7.16b, v13.16b

    mvni            v23.8h, #0xfc, lsl #8

    umin            v8.8h, v8.8h, v23.8h
    umin            v10.8h, v10.8h, v23.8h

    st1             {v8.8h}, [x0], #16
    st1             {v10.8h}, [x0], #16

    umlsl           v5.4s, v11.4h, v30.4h
    umlsl2          v2.4s, v11.8h, v30.8h

    ld1             {v8.8h, v9.8h}, [x3], x4
    umlal           v5.4s, v13.4h, v31.4h
    umlal2          v2.4s, v13.8h, v31.8h
    ld1             {v10.8h, v11.8h}, [x3], x4
    umlal           v5.4s, v15.4h, v31.4h
    umlal2          v2.4s, v15.8h, v31.8h
    ld1             {v12.8h, v13.8h}, [x3], x4
    umlsl           v5.4s, v17.4h, v30.4h
    umlsl2          v2.4s, v17.8h, v30.8h
    ld1             {v14.8h, v15.8h}, [x3], x4

    sqrshrun        v4.4h, v5.4s, #5
    sqrshrun2       v4.8h, v2.4s, #5
    sqrshrun        v18.4h, v1.4s, #5
    sqrshrun2       v18.8h, v20.4s, #5

    mvni            v17.8h, #0xfc, lsl #8

    smin            v4.8h, v4.8h, v17.8h
    smin            v18.8h, v18.8h, v17.8h

    st1             {v18.8h}, [x1], #16
    st1             {v4.8h}, [x1], #16

    ld1             {v16.8h, v17.8h}, [x3], x4  // src+2*stride[0:15]
    ld1             {v18.8h, v19.8h}, [x3], x4  // src+3*stride[0:15]

    str             q9, [sp, #0x10]
    str             q15, [sp, #0x20]
    str             q17, [sp, #0x30]
    str             q19, [sp, #0x40]

    ldr             q28, [sp]

    ext             v22.16b, v28.16b, v1.16b, #8
    ext             v9.16b, v1.16b, v20.16b, #8
    ext             v26.16b, v1.16b, v20.16b, #12
    ext             v17.16b, v20.16b, v5.16b, #12
    ext             v23.16b, v28.16b, v1.16b, #12
    ext             v19.16b, v1.16b, v20.16b, #12

    uaddl           v3.4s, v8.4h, v18.4h
    uaddl2          v15.4s, v8.8h, v18.8h
    umlsl           v3.4s, v10.4h, v30.4h
    umlsl2          v15.4s, v10.8h, v30.8h
    umlal           v3.4s, v12.4h, v31.4h
    umlal2          v15.4s, v12.8h, v31.8h
    umlal           v3.4s, v14.4h, v31.4h
    umlal2          v15.4s, v14.8h, v31.8h
    umlsl           v3.4s, v16.4h, v30.4h
    umlsl2          v15.4s, v16.8h, v30.8h

    add             v4.4s, v22.4s, v26.4s
    add             v26.4s, v9.4s, v17.4s

    ext             v25.16b, v1.16b, v20.16b, #8
    ext             v22.16b, v20.16b, v5.16b, #8
    ext             v24.16b, v1.16b, v20.16b, #4
    ext             v9.16b, v20.16b, v5.16b, #4

    add             v31.4s, v23.4s, v25.4s
    add             v19.4s, v19.4s, v22.4s
    add             v6.4s, v24.4s, v1.4s
    add             v17.4s, v9.4s, v20.4s
    sub             v4.4s, v4.4s, v31.4s    // a-b
    sub             v26.4s, v26.4s, v19.4s  // a-b
    sub             v31.4s, v31.4s, v6.4s   // b-c
    sub             v19.4s, v19.4s, v17.4s  // b-c

    ext             v22.16b, v20.16b, v5.16b, #8
    ext             v9.16b, v5.16b, v2.16b, #8
    ext             v24.16b, v5.16b, v2.16b, #12
    ext             v28.16b, v2.16b, v3.16b, #12
    ext             v23.16b, v20.16b, v5.16b, #12
    ext             v30.16b, v5.16b, v2.16b, #12
    ext             v25.16b, v5.16b, v2.16b, #8
    ext             v29.16b, v2.16b, v3.16b, #8

    add             v22.4s, v22.4s, v24.4s
    add             v9.4s, v9.4s, v28.4s
    add             v23.4s, v23.4s, v25.4s
    add             v29.4s, v29.4s, v30.4s

    ext             v24.16b, v5.16b, v2.16b, #4
    ext             v28.16b, v2.16b, v3.16b, #4

    add             v24.4s, v24.4s, v5.4s
    add             v28.4s, v28.4s, v2.4s

    sub             v22.4s, v22.4s, v23.4s
    sub             v9.4s, v9.4s, v29.4s
    sub             v23.4s, v23.4s, v24.4s
    sub             v29.4s, v29.4s, v28.4s

    sshr            v4.4s, v4.4s, #2
    sshr            v0.4s, v26.4s, #2
    sshr            v22.4s, v22.4s, #2
    sshr            v9.4s, v9.4s, #2

    sub             v4.4s, v4.4s, v31.4s
    sub             v0.4s, v0.4s, v19.4s
    sub             v22.4s, v22.4s, v23.4s
    sub             v9.4s, v9.4s, v29.4s

    sshr            v4.4s, v4.4s, #2
    sshr            v0.4s, v0.4s, #2
    sshr            v22.4s, v22.4s, #2
    sshr            v9.4s, v9.4s, #2

    add             v4.4s, v4.4s, v6.4s
    add             v0.4s, v0.4s, v17.4s
    add             v22.4s, v22.4s, v24.4s
    add             v9.4s, v9.4s, v28.4s

    str             q2, [sp]

    sqrshrun        v4.4h, v4.4s, #6
    sqrshrun2       v4.8h, v0.4s, #6
    sqrshrun        v22.4h, v22.4s, #6
    sqrshrun2       v22.8h, v9.4s, #6

    mov             v0.16b, v5.16b

    ld1             {v28.8h, v29.8h}, [x7], #32 // src[16:31]

    ldr             q9, [sp, #0x10]
    ldr             q17, [sp, #0x30]
    ldr             q19, [sp, #0x40]

    ext             v26.16b, v7.16b, v12.16b, #14
    ext             v27.16b, v12.16b, v13.16b, #14

    mvni            v25.8h, #0xfc, lsl #8

    smin            v22.8h, v22.8h, v25.8h
    smin            v4.8h, v4.8h, v25.8h

    st1             {v4.8h}, [x2], #16
    st1             {v22.8h}, [x2], #16

    mov             v1.16b, v3.16b
    mov             v20.16b, v15.16b

    ldr             q15, [sp, #0x20]

    ext             v22.16b, v7.16b, v12.16b, #12
    ext             v23.16b, v12.16b, v13.16b, #12
    ext             v3.16b, v12.16b, v13.16b, #2
    ext             v4.16b, v13.16b, v28.16b, #2
    ext             v21.16b, v12.16b, v13.16b, #4
    ext             v5.16b, v13.16b, v28.16b, #4
    ext             v24.16b, v12.16b, v13.16b, #6
    ext             v25.16b, v13.16b, v28.16b, #6

    movi            v30.8h, #5
    movi            v31.8h, #20

    b.gt            2b

    subs            w6, w6, #1
    add             x10, x10, x4
    add             x11, x11, x4
    add             x12, x12, x4
    add             x13, x13, x4
    b.gt            1b

    add             sp, sp, #0x50

    ldp             d8, d9, [sp]
    ldp             d10, d11, [sp, #0x10]
    ldp             d12, d13, [sp, #0x20]
    ldp             d14, d15, [sp, #0x30]
    add             sp, sp, #0x40

    ret
endfunc

#endif