; NOTE: web-page extraction residue, not part of the source: "Spaces: Runtime error / Runtime error"
;******************************************************************************
;* x86-optimized yuv2yuvX
;* Copyright 2020 Google LLC
;* Copyright (C) 2001-2011 Michael Niedermayer <[email protected]>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text
;-----------------------------------------------------------------------------
; yuv2yuvX
;
; void ff_yuv2yuvX_<opt>(const int16_t *filter, int filterSize,
;                        int srcOffset, uint8_t *dest, int dstW,
;                        const uint8_t *dither, int offset);
;
; Vertical multi-tap filter producing 8-bit output.
;
; Arguments (names as given to cglobal):
;   filter     - list of 16-byte entries: a pointer-sized word at +0 holding
;                the address of a row of 16-bit samples, and 8 bytes of
;                16-bit coefficient data at +8. The list is walked until an
;                entry whose pointer word is zero is reached.
;   filterSize - tap count; only its low 16 bits are used, to build the
;                rounding bias. The register is then reused as the cursor
;                into the filter list.
;   src        - srcOffset; added to offset once, then the register is
;                reused to hold the current row pointer.
;   dest       - output bytes, indexed by the running offset.
;   dstW       - loop bound: the outer loop repeats while offset < dstW
;                (unsigned compare).
;   dither     - 8 dither bytes are read from here.
;   offset     - when non-zero, the 8 dither bytes are rotated by 3 bytes
;                before use.
;
; Per output sample:
;   bias = (dither + filterSize * 8) >> 4
;   out  = clip_uint8((bias + sum(pmulhw(coeff, sample))) >> 3)
;
; Vector register roles:
;   m7              bias, constant for the whole call
;   m3, m4          accumulators (all builds)
;   m6, m1          extra accumulators (sse3 and later: 2x unroll)
;   m0              current coefficient vector
;   m2, m5          product temporaries
;
; Each outer iteration stores mmsize * unroll bytes.
; NOTE(review): the outer loop is a do-while that always stores whole chunks
; and has no scalar tail; presumably the caller guarantees a suitable width
; or destination padding - confirm against the C wrapper.
; NOTE(review): in the non-VEX builds the pmulhw memory operands take the
; row data directly; presumably the rows are suitably aligned - confirm.
;-----------------------------------------------------------------------------
%macro YUV2YUVX_FUNC 0
cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
; Pick the store instruction and the unroll factor for this instruction set.
%if notcpuflag(sse3)
%define movr mova
%define unroll 1
%else
%define movr movdqu
%define unroll 2
%endif
    ; Widen the int arguments that are later used as 64-bit quantities.
    movsxdifnidn         dstWq, dstWd
    movsxdifnidn         offsetq, offsetd
    movsxdifnidn         srcq, srcd

    ; Load the 8 dither bytes (replicated into every qword on AVX2).
%if cpuflag(avx2)
    vpbroadcastq         m3, [ditherq]
%else
    movq                 xm3, [ditherq]
%endif ; avx2
    test                 offsetd, offsetd
    jz                   .offset

    ; offset != 0 path: rotate each 64-bit dither group right by 24 bits
    ; (24 + 40 = 64), so byte i becomes the former byte (i + 3) & 7.
    psrlq                m5, m3, 24
    psllq                m3, m3, 40
    por                  m3, m3, m5

.offset:
    add                  offsetq, srcq          ; offset += srcOffset
    movd                 xm1, filterSized
    SPLATW               m1, xm1, 0             ; every word = filterSize
    pxor                 m0, m0, m0
    mov                  filterSizeq, filterq   ; filterSizeq now walks the filter list
    mov                  srcq, [filterSizeq]    ; srcq = row pointer of the first entry
    punpcklbw            m3, m0                 ; dither bytes -> zero-extended words
    psllw                m1, m1, 3              ; filterSize * 8
    paddw                m3, m3, m1
    psraw                m7, m3, 4              ; m7 = (dither + filterSize * 8) >> 4

.outerloop:
    ; Start every accumulator from the bias.
    mova                 m4, m7
    mova                 m3, m7
%if cpuflag(sse3)
    mova                 m6, m7
    mova                 m1, m7
%endif

.loop:
    ; Fetch the 8 bytes of coefficient data of the current entry.
%if cpuflag(avx2)
    vpbroadcastq         m0, [filterSizeq + 8]
%elif cpuflag(sse3)
    movddup              m0, [filterSizeq + 8]
%else
    mova                 m0, [filterSizeq + 8]
%endif
    ; Samples are 16-bit, hence the * 2 on the sample index.
    pmulhw               m2, m0, [srcq + offsetq * 2]
    pmulhw               m5, m0, [srcq + offsetq * 2 + mmsize]
    paddw                m3, m3, m2
    paddw                m4, m4, m5
%if cpuflag(sse3)
    pmulhw               m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
    pmulhw               m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
    paddw                m6, m6, m2
    paddw                m1, m1, m5
%endif
    add                  filterSizeq, 16        ; advance to the next 16-byte entry
    mov                  srcq, [filterSizeq]
    test                 srcq, srcq             ; zero pointer word ends the list
    jnz                  .loop

    ; Scale down and narrow to unsigned bytes with saturation.
    psraw                m3, m3, 3
    psraw                m4, m4, 3
%if cpuflag(sse3)
    psraw                m6, m6, 3
    psraw                m1, m1, 3
%endif
    packuswb             m3, m3, m4
%if cpuflag(sse3)
    packuswb             m6, m6, m1
%endif
    mov                  srcq, [filterq]        ; row pointer of the first entry, for the next chunk
%if cpuflag(avx2)
    ; 216 = 0xD8 selects qwords 0,2,1,3: undoes the per-lane interleave
    ; left by the 256-bit pack.
    vpermq               m3, m3, 216
    vpermq               m6, m6, 216
%endif
    movr                 [destq + offsetq], m3
%if cpuflag(sse3)
    movr                 [destq + offsetq + mmsize], m6
%endif
    add                  offsetq, mmsize * unroll
    mov                  filterSizeq, filterq   ; rewind the filter-list cursor
    cmp                  offsetq, dstWq
    jb                   .outerloop
    RET
%endmacro
; Instantiate the routine once per supported instruction set. The prototype
; comment on the macro names the results ff_yuv2yuvX_<opt>.

; MMXEXT build: aligned-move stores (movr = mova), unroll 1.
INIT_MMX mmxext
YUV2YUVX_FUNC

; SSE3 build: movdqu stores, unroll 2, coefficients loaded with movddup.
INIT_XMM sse3
YUV2YUVX_FUNC

; AVX2 build: only assembled when HAVE_AVX2_EXTERNAL is non-zero
; (presumably set by the build configuration - confirm).
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
YUV2YUVX_FUNC
%endif