You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

232 lines
5.2 KiB

@*****************************************************************************
@ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@ Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
#include "asm.S"
/* Assemble using the unified ARM/Thumb (UAL) instruction syntax. */
.syntax unified
/*
 * Select the NEON instruction set for the assembler, but only when the
 * build configuration reports that the .fpu directive is usable
 * (HAVE_AS_FPU_DIRECTIVE is defined outside this file).
 */
#if HAVE_AS_FPU_DIRECTIVE
.fpu neon
#endif
/* Everything below, including the constant table, is emitted into .text. */
.text
/*
 * ARM core register aliases used by i420_rv16_neon.
 *
 * r0-r3 arrive as the function arguments; r0 and r1 are first used as
 * pointers to argument blocks and are then reused as the two output row
 * pointers. r4-r8, r10, r11 and lr are pushed on entry and popped on exit.
 */
/* Output pointer, upper row of the current row pair */
#define O1 r0
/* Output pointer, lower row of the current row pair (O1 + OPITCH) */
#define O2 r1
/* Width in pixels, rounded up to a multiple of 16 by the function */
#define WIDTH r2
/* Rows still to process; decremented by 2 per row pair */
#define HEIGHT r3
/* Luma pointer, upper row */
#define Y1 r4
/* Luma pointer, lower row (Y1 + YPITCH) */
#define Y2 r5
/* Chroma U pointer (one chroma row is shared by both luma rows) */
#define U r6
/* Chroma V pointer */
#define V r7
/* Luma pitch: distance in bytes between consecutive luma rows */
#define YPITCH r8
/*
 * Bytes to skip from the end of the lower output row to the start of the
 * next row pair; also used as a scratch pointer while loading constants.
 */
#define OPAD r10
/* YPITCH - WIDTH; half of it is used as the per-row chroma padding */
#define YPAD r11
/* Pixels left in the current row pair; decremented by 16 per iteration */
#define COUNT ip
/* Output pitch: distance in bytes between consecutive output rows */
#define OPITCH lr
/*
 * NEON register aliases used by i420_rv16_neon.
 *
 * NOTE: several aliases below name the same physical register (Qn is the
 * pair D(2n), D(2n+1)). The function relies on each value being fully
 * consumed before the overlapping alias is written, so the instruction
 * order in the function must be kept in step with this allocation.
 */
/* 8-bit multipliers, replicated into every lane with vmov.u8 */
#define coefY D0
#define coefRV D1
#define coefGU D2
#define coefGV D3
#define coefBU D4
/* 16-bit per-channel constant terms, loaded from the coefficients table */
#define Rc Q3
#define Gc Q4
#define Bc Q5
/* Eight U and eight V samples (D24/D25 = Q12, later reused for output) */
#define u D24
#define v D25
/* De-interleaved luma: y1 = even pixels, y2 = odd pixels (D18/D19 = Q9) */
#define y1 D18
#define y2 D19
/* Per-channel chroma contribution, shared by both rows of a row pair */
#define chro_r Q6
#define chro_g Q7
#define chro_b Q8
/* Scaled luma for the even (1) and odd (2) pixels */
#define lumi1 Q15
#define lumi2 Q10
/* 16-bit channel sums, even pixels; Q9 overlaps y1/y2, Q10 is lumi2 */
#define red16_1 Q9
#define green16_1 Q10
#define blue16_1 Q11
/* 16-bit channel sums, odd pixels; Q12 = D24/D25, Q13 = D26/D27, Q14 = D28/D29 */
#define red16_2 Q12
#define green16_2 Q13
#define blue16_2 Q14
/* 8-bit channels after narrowing, even pixels */
#define red1 D25
#define green1 D26
#define blue1 D27
/* 8-bit channels after narrowing, odd pixels; D30/D31 overlap lumi1 (Q15) */
#define red2 D29
#define green2 D30
#define blue2 D31
/*
 * Packed output bytes (l = low byte, h = high byte of each 16-bit pixel).
 * out1h is the same register as red1 and out2h the same as red2: the high
 * byte is built in place by inserting green bits below the red bits.
 */
#define out1l D24
#define out1h D25
#define out2l D28
#define out2h D29
/*
 * Signed 16-bit constant terms, one per colour channel. The function reads
 * them in this order with three post-incrementing all-lane loads, so the
 * order here must match Rc, Gc, Bc. They are added to 8-bit products whose
 * multipliers are scaled by 64.
 * NOTE(review): presumably these fold the luma/chroma range offsets into a
 * single term per channel - confirm against the intended colour matrix.
 */
coefficients:
/* replicated into Rc (red) */
.short -15872
/* replicated into Gc (green) */
.short 4992
/* replicated into Bc (blue) */
.short -18432
/* Pad the 6 bytes of data above so that what follows is 4-byte aligned. */
.align 2
function i420_rv16_neon
        /*
         * Convert planar YUV 4:2:0 to 16-bit RGB (5-6-5 bit packing).
         * Each inner-loop iteration handles 16 pixels on two adjacent rows,
         * which share one row of 8 U and 8 V samples.
         *
         * In:   r0 = pointer to two words:  { output base, output pitch }
         *       r1 = pointer to four words: { Y base, U base, V base, Y pitch }
         *       r2 = width in pixels (rounded up to a multiple of 16 below)
         *       r3 = height in rows (consumed two rows at a time)
         * Out:  none; pixels are stored through the output pointer.
         * Regs: r4-r8, r10, r11, lr and q4-q7 are saved and restored here;
         *       r0-r3, ip, q0-q3, q8-q15 and the flags are clobbered.
         *
         * Assumptions visible in the code:
         *  - pitches are in bytes (the output advances 2 bytes per pixel);
         *  - the chroma planes are stepped with half the luma row padding,
         *    i.e. a chroma pitch of half the luma pitch;
         *  - the :64 / :128 address qualifiers require every U and V row
         *    address to be 8-byte aligned and every Y / output row address
         *    to be 16-byte aligned;
         *  - because the width is rounded up, every row must have room for
         *    a whole number of 16-pixel groups.
         */
        push            {r4-r8,r10-r11,lr}
        vpush           {q4-q7}

        /* load arguments (r0 and r1 are overwritten by their own loads) */
        ldmia           r0, {O1, OPITCH}
        ldmia           r1, {Y1, U, V, YPITCH}

        /* round the width up to a multiple of 16 */
        ands            OPAD, WIDTH, #15        /* OPAD = width % 16 (scratch) */
        sub             WIDTH, WIDTH, OPAD
        it              ne
        addne           WIDTH, WIDTH, #16

        /* init constants (values scaled by 64) */
        vmov.u8         coefY, #74
        vmov.u8         coefRV, #115
        vmov.u8         coefGU, #14
        vmov.u8         coefGV, #34
        vmov.u8         coefBU, #135
        /* replicate the three 16-bit constants into Rc (q3), Gc (q4), Bc (q5) */
        adr             OPAD, coefficients
        vld1.s16        {d6[], d7[]}, [OPAD]!
        vld1.s16        {d8[], d9[]}, [OPAD]!
        vld1.s16        {d10[], d11[]}, [OPAD]!

        /*
         * init padding
         * The compare sets the flags tested on the first pass through
         * loop_row; the two subtractions below leave the flags untouched.
         */
        cmp             HEIGHT, #0
        sub             OPAD, OPITCH, WIDTH, lsl #1     /* 2 output bytes per pixel */
        sub             YPAD, YPITCH, WIDTH

loop_row:
        /*
         * Flags here come from "cmp HEIGHT, #0" or "subs HEIGHT, #2".
         * If rows remain, reload the column counter; movs also re-sets the
         * flags from WIDTH, so the exit test below fires on either a
         * non-positive height or a non-positive width.
         */
        it              gt
        movsgt          COUNT, WIDTH
        add             O2, O1, OPITCH
        add             Y2, Y1, YPITCH
        /* exit if all rows have been processed */
        itt             le
        vpople          {q4-q7}
        pople           {r4-r8,r10-r11,pc}

loop_col:
        /* Common U & V */
        vld1.u8         {u}, [U,:64]!
        vld1.u8         {v}, [V,:64]!

        /* Y Top Row: y1 receives the even pixels, y2 the odd pixels */
        vld2.u8         {y1,y2}, [Y1,:128]!

        vmull.u8        Q14, v, coefRV
        vmull.u8        Q11, u, coefGU
        vmull.u8        Q13, u, coefBU
        vmlal.u8        Q11, v, coefGV

        vmull.u8        lumi2, y2, coefY
        vmull.u8        lumi1, y1, coefY

        /* chroma terms are kept in q6-q8 and reused for the bottom row */
        vadd.s16        chro_r, Rc, Q14
        vadd.s16        chro_b, Bc, Q13
        vsub.s16        chro_g, Gc, Q11

        pld             [U]
        pld             [V]

        /*
         * chrominance + luminance
         * Order matters: the "_2" sums are formed first because
         * green16_1 (q10) is the same register as lumi2.
         */
        vqadd.s16       red16_2, lumi2, chro_r
        vqadd.s16       green16_2, lumi2, chro_g
        vqadd.s16       blue16_2, lumi2, chro_b
        vqadd.s16       red16_1, lumi1, chro_r
        vqadd.s16       green16_1, lumi1, chro_g
        vqadd.s16       blue16_1, lumi1, chro_b

        /*
         * clamp (divide by 64)
         * Each destination D register overlaps a 16-bit sum that has
         * already been narrowed or is no longer needed; keep this order.
         */
        vqrshrun.s16    green2, green16_2, #6
        vqrshrun.s16    blue2, blue16_2, #6
        vqrshrun.s16    red2, red16_2, #6
        vqrshrun.s16    green1, green16_1, #6
        vqrshrun.s16    red1, red16_1, #6
        vqrshrun.s16    blue1, blue16_1, #6

        pld             [Y1]

        /* pack into RGB565: high byte = RRRRRGGG, low byte = GGGBBBBB */
        vshl.u8         out2l, green2, #3       // low 2a
        vsri.u8         out2h, green2, #5       // high 2
        vshl.u8         out1l, green1, #3       // low 1a
        vsri.u8         out1h, green1, #5       // high 1
        vsri.u8         out2l, blue2, #3        // low 2b
        vsri.u8         out1l, blue1, #3        // low 1b

        /* Y Bottom Row (loaded early, interleaved with the top-row output) */
        vld2.u8         {y1,y2}, [Y2,:128]!

        /* Top Row output: vzip restores pixel order (even, odd, even, ...) */
        vzip.u8         out1h, out2h
        vmull.u8        lumi2, y2, coefY
        vzip.u8         out1l, out2l
        vmull.u8        lumi1, y1, coefY
        vst2.u8         {out1l, out1h}, [O1,:128]!
        vst2.u8         {out2l, out2h}, [O1,:128]!

        /* chrominance + luminance (the chroma terms are reused unchanged) */
        vqadd.s16       green16_2, lumi2, chro_g
        vqadd.s16       red16_2, lumi2, chro_r
        vqadd.s16       blue16_2, lumi2, chro_b
        vqadd.s16       red16_1, lumi1, chro_r
        vqadd.s16       green16_1, lumi1, chro_g
        vqadd.s16       blue16_1, lumi1, chro_b

        /* clamp (divide by 64) */
        vqrshrun.s16    green2, green16_2, #6
        vqrshrun.s16    blue2, blue16_2, #6
        vqrshrun.s16    red2, red16_2, #6
        vqrshrun.s16    green1, green16_1, #6
        vqrshrun.s16    red1, red16_1, #6
        vqrshrun.s16    blue1, blue16_1, #6

        /*
         * Prefetch the bottom-row luma stream. Y1 was already prefetched
         * in the top-row half of this iteration, so the hint targets Y2.
         */
        pld             [Y2]

        /* pack into RGB565 */
        vshl.u8         out2l, green2, #3       // low 2a
        vsri.u8         out2h, green2, #5       // high 2
        vshl.u8         out1l, green1, #3       // low 1a
        vsri.u8         out1h, green1, #5       // high 1
        vsri.u8         out2l, blue2, #3        // low 2b
        vsri.u8         out1l, blue1, #3        // low 1b

        /* Bottom Row output */
        vzip.u8         out1h, out2h
        vzip.u8         out1l, out2l
        vst2.u8         {out1l, out1h}, [O2,:128]!
        vst2.u8         {out2l, out2h}, [O2,:128]!

        /* next columns (x16) */
        subs            COUNT, COUNT, #16
        bgt             loop_col

        /*
         * next rows (x2)
         * subs sets the flags tested at loop_row; the adds below do not
         * modify them. O2 / Y2 sit at the end of the lower row, so adding
         * the padding moves to the start of the next row pair.
         */
        subs            HEIGHT, #2
        add             O1, O2, OPAD
        add             Y1, Y2, YPAD
        add             U, U, YPAD, lsr #1
        add             V, V, YPAD, lsr #1
        b               loop_row