vlc/modules/arm_neon/i420_rv16.S


								 @*****************************************************************************

								 @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion

								 @*****************************************************************************

								 @ Copyright (C) 2011 Sébastien Toque

								 @                    Rémi Denis-Courmont

								 @

								 @ This program is free software; you can redistribute it and/or modify it

								 @ under the terms of the GNU Lesser General Public License as published by

								 @ the Free Software Foundation; either version 2.1 of the License, or

								 @ (at your option) any later version.

								 @

								 @ This program is distributed in the hope that it will be useful,

								 @ but WITHOUT ANY WARRANTY; without even the implied warranty of

								 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

								 @ GNU Lesser General Public License for more details.

								 @

								 @ You should have received a copy of the GNU Lesser General Public License

								 @ along with this program; if not, write to the Free Software Foundation,

								 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.

								 @****************************************************************************/


								#include "asm.S"


									.syntax unified

								#if HAVE_AS_FPU_DIRECTIVE

									.fpu	neon

								#endif

									.text


								/* ARM */

								/*
								 * Core register roles in i420_rv16_neon.
								 * r0/r1 arrive holding the two argument pointers; once their contents
								 * have been loaded they are reused as the output row pointers O1/O2.
								 */

								/* O1: output write pointer, top row of the current row pair */
								#define O1	r0

								/* O2: output write pointer, bottom row (O1 + OPITCH) */
								#define O2	r1

								/* WIDTH: width argument, rounded up to a multiple of 16 during setup */
								#define WIDTH	r2

								/* HEIGHT: rows left to convert, decremented by 2 per row pair */
								#define HEIGHT	r3

								/* Y1: luma read pointer, top row */
								#define Y1	r4

								/* Y2: luma read pointer, bottom row (Y1 + YPITCH) */
								#define Y2	r5

								/* U, V: chroma read pointers, shared by both rows of a pair */
								#define U	r6

								#define V	r7

								/* YPITCH: luma pitch, added to Y1 to reach the next row */
								#define YPITCH	r8

								/*
								 * OPAD: OPITCH - 2*WIDTH, added to O2 after each row pair.
								 * Also used as a scratch register during setup.
								 */
								#define OPAD	r10

								/* YPAD: YPITCH - WIDTH, added to Y2 (and halved for U/V) after each row pair */
								#define YPAD	r11

								/* COUNT: columns left in the current row pair, decremented by 16 */
								#define COUNT	ip

								/* OPITCH: output pitch, added to O1 to reach the next row */
								#define OPITCH	lr


								/* NEON */

								/* 8-bit multipliers, one value replicated in every lane */
								#define coefY	D0

								#define coefRV	D1

								#define coefGU	D2

								#define coefGV	D3

								#define coefBU	D4

								/* 16-bit additive constants, filled from the coefficients table */
								#define Rc	Q3

								#define Gc	Q4

								#define Bc	Q5


								/*
								 * Input samples: 8 U and 8 V bytes, and 16 luma bytes de-interleaved by
								 * vld2 into even-numbered pixels (y1) and odd-numbered pixels (y2).
								 */
								#define u	D24

								#define v	D25

								#define y1	D18

								#define y2	D19


								/* 16-bit chroma contributions, kept live for both rows of a pair */
								#define chro_r	Q6

								#define chro_g	Q7

								#define chro_b	Q8

								/*
								 * 16-bit luma products and 16-bit colour sums.
								 * Suffix _1 = even-numbered pixels, _2 = odd-numbered pixels.
								 * lumi2 and green16_1 are both Q10: every use of lumi2 has to come
								 * before green16_1 is written.
								 */
								#define lumi1	Q15

								#define lumi2	Q10

								#define red16_1		Q9

								#define green16_1	Q10

								#define blue16_1	Q11

								#define red16_2		Q12

								#define green16_2	Q13

								#define blue16_2	Q14


								/*
								 * 8-bit colour components after narrowing.
								 * These D registers are halves of the Q registers above (for example
								 * red2 = D29 is the upper half of blue16_2 = Q14), so the order of the
								 * narrowing instructions matters.
								 */
								#define red1	D25

								#define green1	D26

								#define blue1	D27

								#define red2	D29

								#define green2	D30

								#define blue2	D31


								/*
								 * Packed output bytes (l = low byte, h = high byte of each 16-bit pixel).
								 * out1h is the same register as red1 and out2h the same as red2, so the
								 * red component is already in place when green is shifted in.
								 */
								#define out1l	D24

								#define out1h	D25

								#define out2l	D28

								#define out2h	D29


								/*
								 * Signed 16-bit constants read by i420_rv16_neon with three consecutive
								 * post-incrementing vld1.s16 loads, in this order:
								 *   1st -> Q3 (Rc), 2nd -> Q4 (Gc), 3rd -> Q5 (Bc)
								 * Do not reorder the entries.
								 *
								 * The values are arithmetically equal to:
								 *   -15872 = -(74*16 + 115*128) + 32
								 *     4992 = -(74*16) + (14 + 34)*128 + 32
								 *   -18432 = -(74*16 + 135*128) + 32
								 * where 74/115/14/34/135 are the 8-bit multipliers set up in the function.
								 * NOTE(review): presumably this folds a luma offset of 16 and a chroma
								 * offset of 128 into one constant per channel; the purpose of the +32
								 * term is not evident from this file -- confirm before changing.
								 */
								coefficients:

								    .short  -15872

								    .short    4992

								    .short  -18432


									/* the table above is 6 bytes long; realign to 4 bytes for the code */
									.align 2

								/*
								 * i420_rv16_neon
								 *
								 * Converts planar Y/U/V input into packed 16-bit pixels laid out as
								 * 5 bits red, 6 bits green, 5 bits blue (low byte stored first).
								 * Each inner iteration handles 16 pixels on two adjacent rows, which
								 * share one set of 8 U and 8 V samples.
								 *
								 * In:
								 *   r0 -> two words: output pointer, output pitch (bytes)
								 *   r1 -> four words: Y pointer, U pointer, V pointer, Y pitch (bytes)
								 *   r2  = width in pixels; rounded up here to a multiple of 16, so up to
								 *         15 extra pixels per row are read and written
								 *   r3  = height in rows; rows are consumed two at a time until the
								 *         remaining count is <= 0
								 *
								 * Requirements visible in the code:
								 *   - Y and output accesses use a :128 alignment qualifier, U and V use
								 *     :64, so those addresses must be 16-byte / 8-byte aligned for every
								 *     row and every 16-pixel step.
								 *   - U and V are advanced by half of the luma row padding, i.e. the
								 *     chroma pitch is taken to be half of the luma pitch.
								 *
								 * Saved and restored: r4-r8, r10, r11, lr (returned through pc), q4-q7.
								 * Clobbered:          r0-r3, ip, flags, d0-d4, q3, q8-q15.
								 * Returns nothing.
								 */
								function i420_rv16_neon
									push		{r4-r8,r10-r11,lr}
									vpush		{q4-q7}

									/* load arguments (r0 is both the base and O1; no writeback is used) */
									ldmia		r0,	{O1, OPITCH}
									ldmia		r1,	{Y1, U, V, YPITCH}

									/* round the width to be a multiple of 16 (OPAD is scratch here) */
									ands		OPAD, WIDTH, #15
									sub			WIDTH, WIDTH, OPAD
									it		ne
									addne		WIDTH, WIDTH, #16

									/* init constants (scale value by 64) */
									vmov.u8		coefY, #74
									vmov.u8		coefRV, #115
									vmov.u8		coefGU, #14
									vmov.u8		coefGV, #34
									vmov.u8		coefBU, #135
									/* replicate the three table entries into Rc (q3), Gc (q4), Bc (q5) */
									adr			OPAD, coefficients
									vld1.s16	{d6[], d7[]}, [OPAD]!
									vld1.s16	{d8[], d9[]}, [OPAD]!
									vld1.s16	{d10[], d11[]}, [OPAD]!

									/*
									 * init padding
									 * The cmp sets the flags tested on the first pass through loop_row;
									 * the two sub instructions below do not alter them.
									 */
									cmp			HEIGHT,	#0
									sub			OPAD,	OPITCH,	WIDTH, lsl #1
									sub			YPAD,	YPITCH,	WIDTH

								loop_row:
									/*
									 * Flags here come from "cmp HEIGHT, #0" (first pass) or from
									 * "subs HEIGHT, #2" (later passes). If rows remain, reload the column
									 * counter; movs also updates the flags, so a zero WIDTH takes the
									 * exit path below as well.
									 */
									it	gt
									movsgt	COUNT,	WIDTH
									add		O2,	O1,	OPITCH
									add		Y2,	Y1,	YPITCH
									/* exit if all rows have been processed */
									itt	le
									vpople	{q4-q7}
									pople	{r4-r8,r10-r11,pc}

								loop_col:

									/* Common U & V */

									vld1.u8	{u}, [U,:64]!
									vld1.u8	{v}, [V,:64]!

									/* Y Top Row: y1 = even-numbered pixels, y2 = odd-numbered pixels */
									vld2.u8	{y1,y2}, [Y1,:128]!

									/* chroma products (q11, q13, q14 are temporaries here) */
									vmull.u8	Q14, v, coefRV
									vmull.u8	Q11, u, coefGU
									vmull.u8	Q13, u, coefBU
									vmlal.u8	Q11, v, coefGV

									vmull.u8	lumi2, y2, coefY
									vmull.u8	lumi1, y1, coefY
									/* chro_* stay live until the bottom row has been converted */
									vadd.s16	chro_r, Rc, Q14
									vadd.s16	chro_b, Bc, Q13
									vsub.s16	chro_g, Gc, Q11

									pld	[U]
									pld	[V]

									/*
									 * chrominance + luminance
									 * The _2 sums come first: green16_1 shares q10 with lumi2.
									 */
									vqadd.s16	red16_2, lumi2, chro_r
									vqadd.s16	green16_2, lumi2, chro_g
									vqadd.s16	blue16_2, lumi2, chro_b
									vqadd.s16	red16_1, lumi1, chro_r
									vqadd.s16	green16_1, lumi1, chro_g
									vqadd.s16	blue16_1, lumi1, chro_b

									/*
									 * clamp (divide by 64)
									 * Keep this order: each destination D register is half of a 16-bit
									 * source that an earlier line has already consumed.
									 */
									vqrshrun.s16	green2, green16_2, #6
									vqrshrun.s16	blue2, blue16_2, #6
									vqrshrun.s16	red2, red16_2, #6
									vqrshrun.s16	green1, green16_1, #6
									vqrshrun.s16	red1, red16_1, #6
									vqrshrun.s16	blue1, blue16_1, #6

									pld	[Y1]

									/*
									 * pack into RGB565
									 * low byte  = (green << 3) with (blue >> 3) inserted in bits 4..0
									 * high byte = red with (green >> 5) inserted in bits 2..0
									 * (out1h/out2h are the red1/red2 registers themselves)
									 */
									vshl.u8	out2l, green2, #3 // low 2a
									vsri.u8	out2h, green2, #5 // high 2
									vshl.u8	out1l, green1, #3 // low 1a
									vsri.u8	out1h, green1, #5 // high 1
									vsri.u8	out2l, blue2, #3 // low 2b
									vsri.u8	out1l, blue1, #3 // low 1b

									/* Y Bottom Row */
									vld2.u8	{y1,y2}, [Y2,:128]!

									/*
									 * Top Row output
									 * vzip puts even and odd pixels back in picture order; the bottom-row
									 * luma multiplies are interleaved with the zips.
									 */
									vzip.u8	out1h, out2h
									vmull.u8	lumi2, y2, coefY
									vzip.u8	out1l, out2l
									vmull.u8	lumi1, y1, coefY
									vst2.u8	{out1l, out1h}, [O1,:128]!
									vst2.u8	{out2l, out2h}, [O1,:128]!

									/* chrominance + luminance (same chro_* as the top row) */
									vqadd.s16	green16_2, lumi2, chro_g
									vqadd.s16	red16_2, lumi2, chro_r
									vqadd.s16	blue16_2, lumi2, chro_b
									vqadd.s16	red16_1, lumi1, chro_r
									vqadd.s16	green16_1, lumi1, chro_g
									vqadd.s16	blue16_1, lumi1, chro_b

									/* clamp (divide by 64) */
									vqrshrun.s16	green2, green16_2, #6
									vqrshrun.s16	blue2, blue16_2, #6
									vqrshrun.s16	red2, red16_2, #6
									vqrshrun.s16	green1, green16_1, #6
									vqrshrun.s16	red1, red16_1, #6
									vqrshrun.s16	blue1, blue16_1, #6

									/*
									 * Prefetch the bottom-row luma stream. Y1 has not moved since it was
									 * prefetched in the top-row half, so Y2 is the pointer to hint here.
									 */
									pld	[Y2]

									/* pack into RGB565 */
									vshl.u8	out2l, green2, #3 // low 2a
									vsri.u8	out2h, green2, #5 // high 2
									vshl.u8	out1l, green1, #3 // low 1a
									vsri.u8	out1h, green1, #5 // high 1
									vsri.u8	out2l, blue2, #3 // low 2b
									vsri.u8	out1l, blue1, #3 // low 1b

									/* Bottom Row output */
									vzip.u8	out1h, out2h
									vzip.u8	out1l, out2l
									vst2.u8	{out1l, out1h}, [O2,:128]!
									vst2.u8	{out2l, out2h}, [O2,:128]!

									/* next columns (x16) */
									subs	COUNT,	COUNT,	#16
									bgt		loop_col

									/*
									 * next rows (x2)
									 * subs sets the flags tested at loop_row; the adds below leave them
									 * untouched. O2/Y2 sit at the end of the bottom row, so adding the
									 * padding lands on the start of the next row pair.
									 */
									subs	HEIGHT,	#2
									add		O1,	O2,	OPAD
									add		Y1,	Y2,	YPAD
									add		U,	U,	YPAD,	lsr #1
									add		V,	V,	YPAD,	lsr #1
									b		loop_row