Spaces:
Runtime error
Runtime error
File size: 13,242 Bytes
b86f76f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 |
/*
* Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
* Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ compute_premult
@ Converts the widened chroma vectors into the three per-channel chroma
@ terms that are later added to every luma vector sharing these samples.
@   in:       q14 = U * (1 << 3), q15 = V * (1 << 3), q11 = chroma bias,
@             d1 lanes 0..3 = v2r, u2g, v2g, u2b
@   out:      q8 = red term, q9 = green term, q10 = blue term
@   clobbers: q5, q14, q15
.macro compute_premult
    @ Remove the bias from both chroma vectors.
    vsub.u16            q14, q14, q11       @ q14 = U * (1 << 3) - bias
    vsub.u16            q15, q15, q11       @ q15 = V * (1 << 3) - bias

    @ Saturating doubling multiplies by one coefficient lane, high half kept.
    vqdmulh.s16         q8,  q15, d1[0]     @ q8  = V * v2r
    vqdmulh.s16         q9,  q14, d1[1]     @ q9  = U * u2g
    vqdmulh.s16         q5,  q15, d1[2]     @ q5  = V * v2g (scratch)
    vqdmulh.s16         q10, q14, d1[3]     @ q10 = U * u2b

    @ Green receives a contribution from both chroma planes.
    vadd.s16            q9,  q9,  q5        @ q9  = U * u2g + V * v2g
.endm
@ compute_color dst_comp1, dst_comp2, pre
@ Produces one 8-bit colour channel for both luma vectors.
@   pre       = chroma term for this channel (a q register)
@   dst_comp1 = narrowed result for the luma vector in q14 (a d register)
@   dst_comp2 = narrowed result for the luma vector in q15 (a d register)
@ Clobbers q1 and q2.
.macro compute_color dst_comp1 dst_comp2 pre
    vadd.s16            q1, q14, \pre               @ q1 = luma (q14) + chroma term
    vadd.s16            q2, q15, \pre               @ q2 = luma (q15) + chroma term
    @ Rounding shift right by one, then saturate the signed 16-bit value
    @ into an unsigned 8-bit lane.
    vqrshrun.s16        \dst_comp1, q1, #1
    vqrshrun.s16        \dst_comp2, q2, #1
.endm
@ compute_rgba r1, g1, b1, a1, r2, g2, b2, a2
@ Fills eight d registers with the R, G, B and A planes of two pixel groups:
@ the *1 registers come from the luma vector in q14, the *2 registers from
@ the one in q15. The chroma terms are read from q8 (red), q9 (green) and
@ q10 (blue). Alpha is always fully opaque.
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
    @ Alpha does not depend on the colour terms, so it is set up front.
    vmov.u8             \a1, #255
    vmov.u8             \a2, #255

    compute_color       \r1, \r2, q8
    compute_color       \g1, \g2, q9
    compute_color       \b1, \b2, q10
.endm
@ compute dst, ofmt
@ Converts the 16 luma samples held in d14/d15 into 16 packed 32-bit pixels
@ and stores them (64 bytes) through dst, which is post-incremented.
@   dst  = core register holding the output pointer; the :128 qualifier on
@          the stores asserts that it is 16-byte aligned
@   ofmt = one of argb, rgba, abgr, bgra; selects which of d6..d9 (and
@          d10..d13) receives which channel, i.e. the byte order in memory
@ Expects q8/q9/q10 from compute_premult, q12 = y_offset, q13 = y_coeff.
@ Clobbers q1, q2, q3-q6, q14, q15.
.macro compute dst ofmt
    @ Luma: widen to 16 bits with 3 fractional bits, remove the offset and
    @ scale (saturating doubling multiply, high half).
    vshll.u8            q14, d14, #3                @ q14 = Y * (1 << 3)
    vshll.u8            q15, d15, #3                @ q15 = Y * (1 << 3)
    vsub.s16            q14, q14, q12               @ q14 = Y - y_offset
    vsub.s16            q15, q15, q12               @ q15 = Y - y_offset
    vqdmulh.s16         q14, q14, q13               @ q14 = (Y - y_offset) * y_coeff
    vqdmulh.s16         q15, q15, q13               @ q15 = (Y - y_offset) * y_coeff

    @ Channel planes: d6..d9 are byte 0..3 of each pixel built from q14,
    @ d10..d13 the same bytes for the pixels built from q15.
.ifc \ofmt,argb
    compute_rgba        d7, d8, d9, d6, d11, d12, d13, d10
.endif
.ifc \ofmt,rgba
    compute_rgba        d6, d7, d8, d9, d10, d11, d12, d13
.endif
.ifc \ofmt,abgr
    compute_rgba        d9, d8, d7, d6, d13, d12, d11, d10
.endif
.ifc \ofmt,bgra
    compute_rgba        d8, d7, d6, d9, d12, d11, d10, d13
.endif

    @ Re-interleave the two pixel groups so that each plane holds pixels
    @ 1-8 in its first register and pixels 9-16 in its second.
    vzip.8              d6, d10                     @ byte 0 plane
    vzip.8              d7, d11                     @ byte 1 plane
    vzip.8              d8, d12                     @ byte 2 plane
    vzip.8              d9, d13                     @ byte 3 plane

    @ Four-way interleaved stores: 8 pixels (32 bytes) each.
    vst4.8              {d6, d7, d8, d9}, [\dst,:128]!
    vst4.8              {d10, d11, d12, d13}, [\dst,:128]!
.endm
@ process_1l_internal dst, src, ofmt
@ Loads 16 luma bytes through src (post-incremented) and hands them to the
@ compute macro, which converts them and stores the pixels through dst.
.macro process_1l_internal dst src ofmt
vld2.8 {d14, d15}, [\src]! @ q7 = Y, de-interleaved: d14 = even samples, d15 = odd samples
compute \dst, \ofmt
.endm
@ process_1l ofmt
@ Converts 16 pixels of a single line: derives the chroma terms from
@ q14/q15, then converts luma read through r4 into pixels written through r2.
.macro process_1l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
.endm
@ process_2l ofmt
@ Converts 16 pixels on each of two lines that share the same chroma
@ samples. The chroma terms are computed once and reused for both lines:
@ first line reads r4 and writes r2, second line reads r12 and writes r11.
.macro process_2l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
process_1l_internal r11,r12,\ofmt
.endm
@ load_args_nv12
@ Function prologue for the semi-planar inputs. Saves r4-r12, lr and q4-q7,
@ then loads the stack arguments and derives the per-row pointer steps.
@ The register pushes take 40 + 64 = 104 bytes, so the first stack argument
@ is found at sp + 104.
@ On exit (r0 = width, r1 = height, r2 = dst are used as passed in):
@   r3  = dst step applied after each pair of lines
@   r4  = srcY,  r5 = srcY step after each pair of lines
@   r6  = srcC,  r7 = srcC step after each chroma line
@   r9  = y_offset, r11 = dst of the second line, r12 = srcY of the second line
@   d0  = y_coeff in every 16-bit lane, d1 = four 16-bit coefficients from table
.macro load_args_nv12
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcC
ldr r7, [sp, #116] @ r7 = linesizeC
ldr r8, [sp, #120] @ r8 = table
ldr r9, [sp, #124] @ r9 = y_offset
ldr r10,[sp, #128] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
@ Two lines are produced per outer iteration, hence the doubled strides.
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
.endm
@ load_args_nv21
@ Same prologue as nv12; the two formats only differ in which byte of each
@ chroma pair is U, which is handled when the chroma is loaded.
.macro load_args_nv21
load_args_nv12
.endm
@ load_args_yuv420p
@ Function prologue for planar input processed two lines at a time. Saves
@ r4-r12, lr and q4-q7 (104 bytes, so stack arguments start at sp + 104).
@ On exit (r0 = width, r1 = height, r2 = dst are used as passed in):
@   r3  = dst step applied after each pair of lines
@   r4  = srcY,  r5 = srcY step after each pair of lines
@   r6  = srcU,  r10 = srcV
@   r9  = y_offset, r11 = dst of the second line, r12 = srcY of the second line
@   d0  = y_coeff in every 16-bit lane, d1 = four 16-bit coefficients from table
@ The chroma line sizes are not kept in registers here; they are re-read
@ from the stack by increment_and_test_yuv420p.
.macro load_args_yuv420p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
@ Two lines are produced per outer iteration, hence the doubled strides.
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
@ r10 is free again once y_coeff has been copied into d0.
ldr r10,[sp, #120] @ r10 = srcV
.endm
@ load_args_yuv422p
@ Function prologue for planar input processed one line at a time. Saves
@ r4-r12, lr and q4-q7 (104 bytes, so stack arguments start at sp + 104).
@ On exit (r0 = width, r1 = height, r2 = dst are used as passed in):
@   r3  = dst step applied after each line
@   r4  = srcY,  r5  = srcY step after each line
@   r6  = srcU,  r7  = srcU step after each line
@   r10 = srcV,  r12 = srcV step after each line
@   r9  = y_offset
@   d0  = y_coeff in every 16-bit lane, d1 = four 16-bit coefficients from table
.macro load_args_yuv422p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r7, [sp, #116] @ r7 = linesizeU
ldr r12,[sp, #124] @ r12 = linesizeV
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
@ r10 is free again once y_coeff has been copied into d0.
ldr r10,[sp, #120] @ r10 = srcV
.endm
@ load_chroma_nv12
@ Reads 16 bytes of interleaved chroma through r6 (post-incremented) and
@ widens them to 16 bits with 3 fractional bits. For this format the first
@ byte of every pair is U and the second is V.
@   out: q14 = U * (1 << 3), q15 = V * (1 << 3); clobbers q1
.macro load_chroma_nv12
    pld                 [r12, #64*3]        @ prefetch ahead on the second luma line
    vld2.8              {d2, d3}, [r6]!     @ d2 = first bytes of each pair, d3 = second bytes
    vshll.u8            q15, d3, #3         @ q15 = V * (1 << 3)
    vshll.u8            q14, d2, #3         @ q14 = U * (1 << 3)
.endm
@ load_chroma_nv21
@ Reads 16 bytes of interleaved chroma through r6 (post-incremented) and
@ widens them to 16 bits with 3 fractional bits. For this format the first
@ byte of every pair is V and the second is U.
@   out: q14 = U * (1 << 3), q15 = V * (1 << 3); clobbers q1
.macro load_chroma_nv21
    pld                 [r12, #64*3]        @ prefetch ahead on the second luma line
    vld2.8              {d2, d3}, [r6]!     @ d2 = first bytes of each pair, d3 = second bytes
    vshll.u8            q15, d2, #3         @ q15 = V * (1 << 3)
    vshll.u8            q14, d3, #3         @ q14 = U * (1 << 3)
.endm
@ load_chroma_yuv420p
@ Reads 8 U bytes through r6 and 8 V bytes through r10 (both
@ post-incremented) and widens them to 16 bits with 3 fractional bits.
@   out: q14 = U * (1 << 3), q15 = V * (1 << 3); clobbers q1
.macro load_chroma_yuv420p
pld [r10, #64*3]
pld [r12, #64*3]
vld1.8 d2, [r6]! @ d2: U (blue-difference) chroma line
vld1.8 d3, [r10]! @ d3: V (red-difference) chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
@ load_chroma_yuv422p
@ Reads 8 U bytes through r6 and 8 V bytes through r10 (both
@ post-incremented) and widens them to 16 bits with 3 fractional bits.
@ Unlike the 4:2:0 variant there is no prefetch through r12, which holds a
@ row step rather than a pointer for this format.
@   out: q14 = U * (1 << 3), q15 = V * (1 << 3); clobbers q1
.macro load_chroma_yuv422p
pld [r10, #64*3]
vld1.8 d2, [r6]! @ d2: U (blue-difference) chroma line
vld1.8 d3, [r10]! @ d3: V (red-difference) chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
@ increment_and_test_nv12
@ End-of-row bookkeeping for the two-line formats with interleaved chroma:
@ moves the second-line and chroma pointers to the next row pair and
@ decrements the remaining height. Only the final subs sets the flags, so
@ the caller can branch on them directly.
.macro increment_and_test_nv12
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
add r6, r6, r7 @ srcC += paddingC
subs r1, r1, #2 @ height -= 2
.endm
@ increment_and_test_nv21
@ Row bookkeeping is identical to nv12.
.macro increment_and_test_nv21
increment_and_test_nv12
.endm
@ increment_and_test_yuv420p
@ End-of-row bookkeeping for planar 4:2:0: moves the second-line, U and V
@ pointers to the next row pair and decrements the remaining height.
@ The chroma line sizes are re-read from the stack on every call and r7 is
@ used as scratch for both paddings. The offsets rely on sp being unchanged
@ since the prologue. Only the final subs sets the flags.
.macro increment_and_test_yuv420p
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
ldr r7, [sp, #116] @ r7 = linesizeU
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
add r6, r6, r7 @ srcU += paddingU
ldr r7, [sp, #124] @ r7 = linesizeV
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
add r10, r10, r7 @ srcV += paddingV
subs r1, r1, #2 @ height -= 2
.endm
@ increment_and_test_yuv422p
@ End-of-row bookkeeping for planar 4:2:2: one line is produced per outer
@ iteration, so both chroma pointers advance every row and the height is
@ decremented by one. Only the final subs sets the flags.
.macro increment_and_test_yuv422p
add r6, r6, r7 @ srcU += paddingU
add r10,r10,r12 @ srcV += paddingV
subs r1, r1, #1 @ height -= 1
.endm
@ process_nv12 ofmt: one chroma line serves two luma lines.
.macro process_nv12 ofmt
process_2l \ofmt
.endm
@ process_nv21 ofmt: one chroma line serves two luma lines.
.macro process_nv21 ofmt
process_2l \ofmt
.endm
@ process_yuv420p ofmt: one chroma line serves two luma lines.
.macro process_yuv420p ofmt
process_2l \ofmt
.endm
@ process_yuv422p ofmt: every luma line has its own chroma line.
.macro process_yuv422p ofmt
process_1l \ofmt
.endm
@ declare_func ifmt, ofmt
@ Emits the exported function ff_<ifmt>_to_<ofmt>_neon.
@   ifmt = nv12, nv21, yuv420p or yuv422p (selects the load_args_,
@          load_chroma_, process_ and increment_and_test_ macros)
@   ofmt = argb, rgba, abgr or bgra (byte order of the stored pixels)
@ Register arguments: r0 = width, r1 = height, r2 = dst, r3 = dst linesize.
@ The remaining arguments are read from the stack by the prologue macro.
@ The inner loop converts 16 pixels per pass and runs while the remaining
@ width is positive; the row steps are derived from the exact width, so
@ callers are presumably expected to pass a width that is a multiple of 16
@ (and an even height for the two-line formats) - TODO confirm with callers.
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    load_args_\ifmt
    vmov.u16            q11, #1024          @ q11 = 128 * (1 << 3)
    vdup.16             q12, r9             @ q12 = y_offset
    vmov                d26, d0             @ q13 (low half)  = y_coeff
    vmov                d27, d0             @ q13 (high half) = y_coeff
1:                                          @ once per row (or row pair)
    mov                 r8, r0              @ r8 = width
2:                                          @ once per 16 pixels
    pld                 [r6, #64*3]
    pld                 [r4, #64*3]
    @ No constant is loaded into d10 here: compute_premult overwrites q5
    @ (d10/d11) before anything reads it.
    load_chroma_\ifmt
    process_\ifmt \ofmt
    subs                r8, r8, #16         @ width -= 16
    bgt                 2b
    add                 r2, r2, r3          @ dst += padding
    add                 r4, r4, r5          @ srcY += paddingY
    increment_and_test_\ifmt
    bgt                 1b
    vpop                {q4-q7}
    @ Return by loading the saved lr straight into pc; this is an
    @ interworking return in both ARM and Thumb state.
    pop                 {r4-r12, pc}
endfunc
.endm
@ declare_rgb_funcs ifmt
@ Emits the four converters from one input format, one per supported
@ output byte order.
.macro declare_rgb_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
@ Instantiate the converters: 4 input formats x 4 output byte orders.
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
|