diff --git a/src/video_decoder/video_decoder.c b/src/video_decoder/video_decoder.c
index ed97e0b1df..2e4ad70573 100644
--- a/src/video_decoder/video_decoder.c
+++ b/src/video_decoder/video_decoder.c
@@ -206,6 +206,7 @@ static void EndThread( vdec_thread_t *p_vdec )
 /*******************************************************************************
  * AddBlock : add a block
  *******************************************************************************/
+#ifndef HAVE_MMX
 static __inline__ void AddBlock( vdec_thread_t * p_vdec, dctelem_t * p_block,
                                  yuv_data_t * p_data, int i_incr )
 {
@@ -221,10 +222,107 @@ static __inline__ void AddBlock( vdec_thread_t * p_vdec, dctelem_t * p_block,
         p_data += i_incr;
     }
 }
+#else
+/* MMX version: for each of 8 rows, widen the 8 bytes at p_data to 16-bit
+ * words, add the 8 words of the matching p_block row (p_block is read as 64
+ * consecutive 16-bit words), and store the sums back packed to bytes with
+ * unsigned saturation. p_data advances by i_incr+8 bytes per row. p_vdec is
+ * not used by this variant. */
+static __inline__ void AddBlock( vdec_thread_t * p_vdec, dctelem_t * p_block,
+                                 yuv_data_t * p_data, int i_incr )
+{
+    asm __volatile__ (
+        "pxor %%mm7,%%mm7\n\t"          /* mm7 = 0, used to zero-extend bytes */
+
+        "movq (%0),%%mm1\n\t"           /* load 8 destination bytes */
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"     /* mm1 = upper 4 bytes as words */
+        "punpcklbw %%mm7,%%mm2\n\t"     /* mm2 = lower 4 bytes as words */
+        "paddw (%1),%%mm2\n\t"          /* add first 4 words of the row */
+        "paddw 8(%1),%%mm1\n\t"         /* add last 4 words of the row */
+        "packuswb %%mm1,%%mm2\n\t"      /* saturate back to 8 unsigned bytes */
+        "movq %%mm2,(%0)\n\t"
+        "add %2,%0\n\t"                 /* next row; size taken from operands */
+
+        "movq (%0),%%mm1\n\t"
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"
+        "punpcklbw %%mm7,%%mm2\n\t"
+        "paddw 16(%1),%%mm2\n\t"
+        "paddw 24(%1),%%mm1\n\t"
+        "packuswb %%mm1,%%mm2\n\t"
+        "movq %%mm2,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq (%0),%%mm1\n\t"
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"
+        "punpcklbw %%mm7,%%mm2\n\t"
+        "paddw 32(%1),%%mm2\n\t"
+        "paddw 40(%1),%%mm1\n\t"
+        "packuswb %%mm1,%%mm2\n\t"
+        "movq %%mm2,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq (%0),%%mm1\n\t"
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"
+        "punpcklbw %%mm7,%%mm2\n\t"
+        "paddw 48(%1),%%mm2\n\t"
+        "paddw 56(%1),%%mm1\n\t"
+        "packuswb %%mm1,%%mm2\n\t"
+        "movq %%mm2,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq (%0),%%mm1\n\t"
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"
+        "punpcklbw %%mm7,%%mm2\n\t"
+        "paddw 64(%1),%%mm2\n\t"
+        "paddw 72(%1),%%mm1\n\t"
+        "packuswb %%mm1,%%mm2\n\t"
+        "movq %%mm2,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq (%0),%%mm1\n\t"
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"
+        "punpcklbw %%mm7,%%mm2\n\t"
+        "paddw 80(%1),%%mm2\n\t"
+        "paddw 88(%1),%%mm1\n\t"
+        "packuswb %%mm1,%%mm2\n\t"
+        "movq %%mm2,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq (%0),%%mm1\n\t"
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"
+        "punpcklbw %%mm7,%%mm2\n\t"
+        "paddw 96(%1),%%mm2\n\t"
+        "paddw 104(%1),%%mm1\n\t"
+        "packuswb %%mm1,%%mm2\n\t"
+        "movq %%mm2,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq (%0),%%mm1\n\t"
+        "movq %%mm1,%%mm2\n\t"
+        "punpckhbw %%mm7,%%mm1\n\t"
+        "punpcklbw %%mm7,%%mm2\n\t"
+        "paddw 112(%1),%%mm2\n\t"
+        "paddw 120(%1),%%mm1\n\t"
+        "packuswb %%mm1,%%mm2\n\t"
+        "movq %%mm2,(%0)\n\t"
+
+        "emms"                          /* leave MMX state before returning */
+        /* "memory": the asm loads from p_block and stores through p_data */
+        :"+r" (p_data): "r" (p_block),"r" ((long)(i_incr+8)): "memory");
+}
+#endif
+
 
 /*******************************************************************************
  * CopyBlock : copy a block
  *******************************************************************************/
+#ifndef HAVE_MMX
 static __inline__ void CopyBlock( vdec_thread_t * p_vdec, dctelem_t * p_block,
                                   yuv_data_t * p_data, int i_incr )
 {
@@ -239,6 +337,58 @@ static __inline__ void CopyBlock( vdec_thread_t * p_vdec, dctelem_t * p_block,
         p_data += i_incr;
     }
 }
+#else
+/* MMX version: for each of 8 rows, pack the 8 signed 16-bit words of the
+ * p_block row into 8 bytes with unsigned saturation and store them at p_data,
+ * which advances by i_incr+8 bytes per row. p_vdec is not used here. */
+static __inline__ void CopyBlock( vdec_thread_t * p_vdec, dctelem_t * p_block,
+                                  yuv_data_t * p_data, int i_incr )
+{
+    asm __volatile__ (
+        "movq (%1),%%mm0\n\t"           /* first 4 words of the row */
+        "packuswb 8(%1),%%mm0\n\t"      /* pack with last 4 words -> 8 bytes */
+        "movq %%mm0,(%0)\n\t"
+        "add %2,%0\n\t"                 /* next row; size taken from operands */
+
+        "movq 16(%1),%%mm0\n\t"
+        "packuswb 24(%1),%%mm0\n\t"
+        "movq %%mm0,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq 32(%1),%%mm0\n\t"
+        "packuswb 40(%1),%%mm0\n\t"
+        "movq %%mm0,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq 48(%1),%%mm0\n\t"
+        "packuswb 56(%1),%%mm0\n\t"
+        "movq %%mm0,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq 64(%1),%%mm0\n\t"
+        "packuswb 72(%1),%%mm0\n\t"
+        "movq %%mm0,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq 80(%1),%%mm0\n\t"
+        "packuswb 88(%1),%%mm0\n\t"
+        "movq %%mm0,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq 96(%1),%%mm0\n\t"
+        "packuswb 104(%1),%%mm0\n\t"
+        "movq %%mm0,(%0)\n\t"
+        "add %2,%0\n\t"
+
+        "movq 112(%1),%%mm0\n\t"
+        "packuswb 120(%1),%%mm0\n\t"
+        "movq %%mm0,(%0)\n\t"
+        "emms"                          /* leave MMX state before returning */
+        /* "memory": the asm loads from p_block and stores through p_data */
+        :"+r" (p_data): "r" (p_block),"r" ((long)(i_incr+8)): "memory");
+}
+#endif
+
 
 /*******************************************************************************
  * vdec_DecodeMacroblock : decode a macroblock of a picture