Spaces:
Runtime error
Runtime error
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <[email protected]>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
void ff_simple_idct_lasx(int16_t *block) | |
{ | |
int32_t const_val = 1 << 10; | |
__m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF, | |
0x4B42539F58C50000, 0x11A822A332493FFF}; | |
__m256i in0, in1, in2, in3; | |
__m256i w2, w3, w4, w5, w6, w7; | |
__m256i a0, a1, a2, a3; | |
__m256i b0, b1, b2, b3; | |
__m256i temp0, temp1, temp2, temp3; | |
__m256i const_val0 = __lasx_xvreplgr2vr_w(const_val); | |
__m256i const_val1, select_vec, temp; | |
LASX_IDCTROWCONDDC | |
LASX_IDCTCOLS | |
DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, | |
in0, in1, in2, in3); | |
__lasx_xvst(in0, block, 0); | |
__lasx_xvst(in1, block, 32); | |
__lasx_xvst(in2, block, 64); | |
__lasx_xvst(in3, block, 96); | |
} | |
void ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride, | |
int16_t *block) | |
{ | |
int32_t const_val = 1 << 10; | |
ptrdiff_t dst_stride_2x = dst_stride << 1; | |
ptrdiff_t dst_stride_4x = dst_stride << 2; | |
ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride; | |
__m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF, | |
0x4B42539F58C50000, 0x11A822A332493FFF}; | |
__m256i in0, in1, in2, in3; | |
__m256i w2, w3, w4, w5, w6, w7; | |
__m256i a0, a1, a2, a3; | |
__m256i b0, b1, b2, b3; | |
__m256i temp0, temp1, temp2, temp3; | |
__m256i const_val0 = __lasx_xvreplgr2vr_w(const_val); | |
__m256i const_val1, select_vec, temp; | |
LASX_IDCTROWCONDDC | |
LASX_IDCTCOLS | |
DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, | |
in0, in1, in2, in3); | |
DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3); | |
DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1); | |
__lasx_xvstelm_d(in0, dst, 0, 0); | |
__lasx_xvstelm_d(in0, dst + dst_stride, 0, 2); | |
__lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1); | |
__lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3); | |
dst += dst_stride_4x; | |
__lasx_xvstelm_d(in1, dst, 0, 0); | |
__lasx_xvstelm_d(in1, dst + dst_stride, 0, 2); | |
__lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1); | |
__lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3); | |
} | |
void ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride, | |
int16_t *block) | |
{ | |
int32_t const_val = 1 << 10; | |
uint8_t *dst1 = dst; | |
ptrdiff_t dst_stride_2x = dst_stride << 1; | |
ptrdiff_t dst_stride_4x = dst_stride << 2; | |
ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride; | |
__m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF, | |
0x4B42539F58C50000, 0x11A822A332493FFF}; | |
__m256i sh = {0x0003000200010000, 0x000B000A00090008, | |
0x0007000600050004, 0x000F000E000D000C}; | |
__m256i in0, in1, in2, in3; | |
__m256i w2, w3, w4, w5, w6, w7; | |
__m256i a0, a1, a2, a3; | |
__m256i b0, b1, b2, b3; | |
__m256i temp0, temp1, temp2, temp3; | |
__m256i const_val0 = __lasx_xvreplgr2vr_w(const_val); | |
__m256i const_val1, select_vec, temp; | |
LASX_IDCTROWCONDDC | |
LASX_IDCTCOLS | |
a0 = __lasx_xvldrepl_d(dst1, 0); | |
a0 = __lasx_vext2xv_hu_bu(a0); | |
dst1 += dst_stride; | |
a1 = __lasx_xvldrepl_d(dst1, 0); | |
a1 = __lasx_vext2xv_hu_bu(a1); | |
dst1 += dst_stride; | |
a2 = __lasx_xvldrepl_d(dst1, 0); | |
a2 = __lasx_vext2xv_hu_bu(a2); | |
dst1 += dst_stride; | |
a3 = __lasx_xvldrepl_d(dst1, 0); | |
a3 = __lasx_vext2xv_hu_bu(a3); | |
dst1 += dst_stride; | |
b0 = __lasx_xvldrepl_d(dst1, 0); | |
b0 = __lasx_vext2xv_hu_bu(b0); | |
dst1 += dst_stride; | |
b1 = __lasx_xvldrepl_d(dst1, 0); | |
b1 = __lasx_vext2xv_hu_bu(b1); | |
dst1 += dst_stride; | |
b2 = __lasx_xvldrepl_d(dst1, 0); | |
b2 = __lasx_vext2xv_hu_bu(b2); | |
dst1 += dst_stride; | |
b3 = __lasx_xvldrepl_d(dst1, 0); | |
b3 = __lasx_vext2xv_hu_bu(b3); | |
DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2, | |
temp0, temp1, temp2, temp3); | |
DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3, | |
in0, in1, in2, in3); | |
DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, | |
in0, in1, in2, in3); | |
DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3); | |
DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1); | |
__lasx_xvstelm_d(in0, dst, 0, 0); | |
__lasx_xvstelm_d(in0, dst + dst_stride, 0, 2); | |
__lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1); | |
__lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3); | |
dst += dst_stride_4x; | |
__lasx_xvstelm_d(in1, dst, 0, 0); | |
__lasx_xvstelm_d(in1, dst + dst_stride, 0, 2); | |
__lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1); | |
__lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3); | |
} | |