You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
232 lines
5.2 KiB
232 lines
5.2 KiB
@*****************************************************************************
@ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@                    Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
|
|
|
|
#include "asm.S"
|
|
|
|
.syntax unified
|
|
#if HAVE_AS_FPU_DIRECTIVE
|
|
.fpu neon
|
|
#endif
|
|
.text
|
|
|
|
/* ARM */
|
|
#define O1 r0
|
|
#define O2 r1
|
|
#define WIDTH r2
|
|
#define HEIGHT r3
|
|
#define Y1 r4
|
|
#define Y2 r5
|
|
#define U r6
|
|
#define V r7
|
|
#define YPITCH r8
|
|
#define OPAD r10
|
|
#define YPAD r11
|
|
#define COUNT ip
|
|
#define OPITCH lr
|
|
|
|
/* NEON */
|
|
#define coefY D0
|
|
#define coefRV D1
|
|
#define coefGU D2
|
|
#define coefGV D3
|
|
#define coefBU D4
|
|
#define Rc Q3
|
|
#define Gc Q4
|
|
#define Bc Q5
|
|
|
|
#define u D24
|
|
#define v D25
|
|
#define y1 D18
|
|
#define y2 D19
|
|
|
|
#define chro_r Q6
|
|
#define chro_g Q7
|
|
#define chro_b Q8
|
|
#define lumi1 Q15
|
|
#define lumi2 Q10
|
|
#define red16_1 Q9
|
|
#define green16_1 Q10
|
|
#define blue16_1 Q11
|
|
#define red16_2 Q12
|
|
#define green16_2 Q13
|
|
#define blue16_2 Q14
|
|
|
|
#define red1 D25
|
|
#define green1 D26
|
|
#define blue1 D27
|
|
#define red2 D29
|
|
#define green2 D30
|
|
#define blue2 D31
|
|
|
|
#define out1l D24
|
|
#define out1h D25
|
|
#define out2l D28
|
|
#define out2h D29
|
|
|
|
coefficients:
|
|
.short -15872
|
|
.short 4992
|
|
.short -18432
|
|
|
|
.align 2
|
|
function i420_rv16_neon
|
|
push {r4-r8,r10-r11,lr}
|
|
vpush {q4-q7}
|
|
|
|
/* load arguments */
|
|
ldmia r0, {O1, OPITCH}
|
|
ldmia r1, {Y1, U, V, YPITCH}
|
|
|
|
/* round the width to be a multiple of 16 */
|
|
ands OPAD, WIDTH, #15
|
|
sub WIDTH, WIDTH, OPAD
|
|
it ne
|
|
addne WIDTH, WIDTH, #16
|
|
|
|
/* init constants (scale value by 64) */
|
|
vmov.u8 coefY, #74
|
|
vmov.u8 coefRV, #115
|
|
vmov.u8 coefGU, #14
|
|
vmov.u8 coefGV, #34
|
|
vmov.u8 coefBU, #135
|
|
adr OPAD, coefficients
|
|
vld1.s16 {d6[], d7[]}, [OPAD]!
|
|
vld1.s16 {d8[], d9[]}, [OPAD]!
|
|
vld1.s16 {d10[], d11[]}, [OPAD]!
|
|
|
|
/* init padding */
|
|
cmp HEIGHT, #0
|
|
sub OPAD, OPITCH, WIDTH, lsl #1
|
|
sub YPAD, YPITCH, WIDTH
|
|
|
|
loop_row:
|
|
it gt
|
|
movsgt COUNT, WIDTH
|
|
add O2, O1, OPITCH
|
|
add Y2, Y1, YPITCH
|
|
/* exit if all rows have been processed */
|
|
itt le
|
|
vpople {q4-q7}
|
|
pople {r4-r8,r10-r11,pc}
|
|
|
|
loop_col:
|
|
|
|
/* Common U & V */
|
|
|
|
vld1.u8 {u}, [U,:64]!
|
|
vld1.u8 {v}, [V,:64]!
|
|
|
|
/* Y Top Row */
|
|
vld2.u8 {y1,y2}, [Y1,:128]!
|
|
|
|
vmull.u8 Q14, v, coefRV
|
|
vmull.u8 Q11, u, coefGU
|
|
vmull.u8 Q13, u, coefBU
|
|
vmlal.u8 Q11, v, coefGV
|
|
|
|
vmull.u8 lumi2, y2, coefY
|
|
vmull.u8 lumi1, y1, coefY
|
|
vadd.s16 chro_r, Rc, Q14
|
|
vadd.s16 chro_b, Bc, Q13
|
|
vsub.s16 chro_g, Gc, Q11
|
|
|
|
pld [U]
|
|
pld [V]
|
|
|
|
/* chrominance + luminance */
|
|
vqadd.s16 red16_2, lumi2, chro_r
|
|
vqadd.s16 green16_2, lumi2, chro_g
|
|
vqadd.s16 blue16_2, lumi2, chro_b
|
|
vqadd.s16 red16_1, lumi1, chro_r
|
|
vqadd.s16 green16_1, lumi1, chro_g
|
|
vqadd.s16 blue16_1, lumi1, chro_b
|
|
|
|
/* clamp (divide by 64) */
|
|
vqrshrun.s16 green2, green16_2, #6
|
|
vqrshrun.s16 blue2, blue16_2, #6
|
|
vqrshrun.s16 red2, red16_2, #6
|
|
vqrshrun.s16 green1, green16_1, #6
|
|
vqrshrun.s16 red1, red16_1, #6
|
|
vqrshrun.s16 blue1, blue16_1, #6
|
|
|
|
pld [Y1]
|
|
|
|
/* pack into RGB565 */
|
|
vshl.u8 out2l, green2, #3 // low 2a
|
|
vsri.u8 out2h, green2, #5 // high 2
|
|
vshl.u8 out1l, green1, #3 // low 1a
|
|
vsri.u8 out1h, green1, #5 // high 1
|
|
vsri.u8 out2l, blue2, #3 // low 2b
|
|
vsri.u8 out1l, blue1, #3 // low 1b
|
|
|
|
/* Y Bottom Row */
|
|
vld2.u8 {y1,y2}, [Y2,:128]!
|
|
|
|
/* Top Row output */
|
|
vzip.u8 out1h, out2h
|
|
vmull.u8 lumi2, y2, coefY
|
|
vzip.u8 out1l, out2l
|
|
vmull.u8 lumi1, y1, coefY
|
|
vst2.u8 {out1l, out1h}, [O1,:128]!
|
|
vst2.u8 {out2l, out2h}, [O1,:128]!
|
|
|
|
/* chrominance + luminance */
|
|
vqadd.s16 green16_2, lumi2, chro_g
|
|
vqadd.s16 red16_2, lumi2, chro_r
|
|
vqadd.s16 blue16_2, lumi2, chro_b
|
|
vqadd.s16 red16_1, lumi1, chro_r
|
|
vqadd.s16 green16_1, lumi1, chro_g
|
|
vqadd.s16 blue16_1, lumi1, chro_b
|
|
|
|
/* clamp (divide by 64) */
|
|
vqrshrun.s16 green2, green16_2, #6
|
|
vqrshrun.s16 blue2, blue16_2, #6
|
|
vqrshrun.s16 red2, red16_2, #6
|
|
vqrshrun.s16 green1, green16_1, #6
|
|
vqrshrun.s16 red1, red16_1, #6
|
|
vqrshrun.s16 blue1, blue16_1, #6
|
|
|
|
pld [Y1]
|
|
|
|
/* pack into RGB565 */
|
|
vshl.u8 out2l, green2, #3 // low 2a
|
|
vsri.u8 out2h, green2, #5 // high 2
|
|
vshl.u8 out1l, green1, #3 // low 1a
|
|
vsri.u8 out1h, green1, #5 // high 1
|
|
vsri.u8 out2l, blue2, #3 // low 2b
|
|
vsri.u8 out1l, blue1, #3 // low 1b
|
|
|
|
vzip.u8 out1h, out2h
|
|
vzip.u8 out1l, out2l
|
|
vst2.u8 {out1l, out1h}, [O2,:128]!
|
|
vst2.u8 {out2l, out2h}, [O2,:128]!
|
|
|
|
/* next columns (x16) */
|
|
subs COUNT, COUNT, #16
|
|
bgt loop_col
|
|
|
|
/* next rows (x2) */
|
|
subs HEIGHT, #2
|
|
add O1, O2, OPAD
|
|
add Y1, Y2, YPAD
|
|
add U, U, YPAD, lsr #1
|
|
add V, V, YPAD, lsr #1
|
|
b loop_row
|
|
|