summaryrefslogtreecommitdiffstats
path: root/media/libtheora/lib/x86_vc/mmxfrag.c
diff options
context:
space:
mode:
Diffstat (limited to 'media/libtheora/lib/x86_vc/mmxfrag.c')
-rw-r--r--media/libtheora/lib/x86_vc/mmxfrag.c416
1 files changed, 416 insertions, 0 deletions
diff --git a/media/libtheora/lib/x86_vc/mmxfrag.c b/media/libtheora/lib/x86_vc/mmxfrag.c
new file mode 100644
index 000000000..c16b026ff
--- /dev/null
+++ b/media/libtheora/lib/x86_vc/mmxfrag.c
@@ -0,0 +1,416 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $
+
+ ********************************************************************/
+
+/*MMX acceleration of fragment reconstruction for motion compensation.
+ Originally written by Rudolf Marek.
+ Additional optimization by Nils Pipenbrinck.
+ Note: Loops are unrolled for best performance.
+ The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+ do{ \
+ const unsigned char *src; \
+ unsigned char *dst; \
+ src=(_src); \
+ dst=(_dst); \
+ __asm mov SRC,src \
+ __asm mov DST,dst \
+ __asm mov YSTRIDE,_ystride \
+ /*src+0*ystride*/ \
+ __asm movq mm0,[SRC] \
+ /*src+1*ystride*/ \
+ __asm movq mm1,[SRC+YSTRIDE] \
+ /*ystride3=ystride*3*/ \
+ __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+ /*src+2*ystride*/ \
+ __asm movq mm2,[SRC+YSTRIDE*2] \
+ /*src+3*ystride*/ \
+ __asm movq mm3,[SRC+YSTRIDE3] \
+ /*dst+0*ystride*/ \
+ __asm movq [DST],mm0 \
+ /*dst+1*ystride*/ \
+ __asm movq [DST+YSTRIDE],mm1 \
+ /*Pointer to next 4.*/ \
+ __asm lea SRC,[SRC+YSTRIDE*4] \
+ /*dst+2*ystride*/ \
+ __asm movq [DST+YSTRIDE*2],mm2 \
+ /*dst+3*ystride*/ \
+ __asm movq [DST+YSTRIDE3],mm3 \
+ /*Pointer to next 4.*/ \
+ __asm lea DST,[DST+YSTRIDE*4] \
+ /*src+0*ystride*/ \
+ __asm movq mm0,[SRC] \
+ /*src+1*ystride*/ \
+ __asm movq mm1,[SRC+YSTRIDE] \
+ /*src+2*ystride*/ \
+ __asm movq mm2,[SRC+YSTRIDE*2] \
+ /*src+3*ystride*/ \
+ __asm movq mm3,[SRC+YSTRIDE3] \
+ /*dst+0*ystride*/ \
+ __asm movq [DST],mm0 \
+ /*dst+1*ystride*/ \
+ __asm movq [DST+YSTRIDE],mm1 \
+ /*dst+2*ystride*/ \
+ __asm movq [DST+YSTRIDE*2],mm2 \
+ /*dst+3*ystride*/ \
+ __asm movq [DST+YSTRIDE3],mm3 \
+ } \
+ while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 esi
+ OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+}
+
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+ OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+ }
+}
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){
+ __asm{
+#define DST edx
+#define DST4 esi
+#define YSTRIDE eax
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+ mov DST,_dst
+ mov YSTRIDE,_ystride
+ mov RESIDUE,_residue
+ lea DST4,[DST+YSTRIDE*4]
+ lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+ /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+ pcmpeqw mm0,mm0
+ /*#0 Load low residue.*/
+ movq mm1,[0*8+RESIDUE]
+ /*#0 Load high residue.*/
+ movq mm2,[1*8+RESIDUE]
+ /*Set mm0 to 0x8000800080008000.*/
+ psllw mm0,15
+ /*#1 Load low residue.*/
+ movq mm3,[2*8+RESIDUE]
+ /*#1 Load high residue.*/
+ movq mm4,[3*8+RESIDUE]
+ /*Set mm0 to 0x0080008000800080.*/
+ psrlw mm0,8
+ /*#2 Load low residue.*/
+ movq mm5,[4*8+RESIDUE]
+ /*#2 Load high residue.*/
+ movq mm6,[5*8+RESIDUE]
+ /*#0 Bias low residue.*/
+ paddsw mm1,mm0
+ /*#0 Bias high residue.*/
+ paddsw mm2,mm0
+ /*#0 Pack to byte.*/
+ packuswb mm1,mm2
+ /*#1 Bias low residue.*/
+ paddsw mm3,mm0
+ /*#1 Bias high residue.*/
+ paddsw mm4,mm0
+ /*#1 Pack to byte.*/
+ packuswb mm3,mm4
+ /*#2 Bias low residue.*/
+ paddsw mm5,mm0
+ /*#2 Bias high residue.*/
+ paddsw mm6,mm0
+ /*#2 Pack to byte.*/
+ packuswb mm5,mm6
+ /*#0 Write row.*/
+ movq [DST],mm1
+ /*#1 Write row.*/
+ movq [DST+YSTRIDE],mm3
+ /*#2 Write row.*/
+ movq [DST+YSTRIDE*2],mm5
+ /*#3 Load low residue.*/
+ movq mm1,[6*8+RESIDUE]
+ /*#3 Load high residue.*/
+ movq mm2,[7*8+RESIDUE]
+ /*#4 Load high residue.*/
+ movq mm3,[8*8+RESIDUE]
+ /*#4 Load high residue.*/
+ movq mm4,[9*8+RESIDUE]
+ /*#5 Load high residue.*/
+ movq mm5,[10*8+RESIDUE]
+ /*#5 Load high residue.*/
+ movq mm6,[11*8+RESIDUE]
+ /*#3 Bias low residue.*/
+ paddsw mm1,mm0
+ /*#3 Bias high residue.*/
+ paddsw mm2,mm0
+ /*#3 Pack to byte.*/
+ packuswb mm1,mm2
+ /*#4 Bias low residue.*/
+ paddsw mm3,mm0
+ /*#4 Bias high residue.*/
+ paddsw mm4,mm0
+ /*#4 Pack to byte.*/
+ packuswb mm3,mm4
+ /*#5 Bias low residue.*/
+ paddsw mm5,mm0
+ /*#5 Bias high residue.*/
+ paddsw mm6,mm0
+ /*#5 Pack to byte.*/
+ packuswb mm5,mm6
+ /*#3 Write row.*/
+ movq [DST+YSTRIDE3],mm1
+ /*#4 Write row.*/
+ movq [DST4],mm3
+ /*#5 Write row.*/
+ movq [DST4+YSTRIDE],mm5
+ /*#6 Load low residue.*/
+ movq mm1,[12*8+RESIDUE]
+ /*#6 Load high residue.*/
+ movq mm2,[13*8+RESIDUE]
+ /*#7 Load low residue.*/
+ movq mm3,[14*8+RESIDUE]
+ /*#7 Load high residue.*/
+ movq mm4,[15*8+RESIDUE]
+ /*#6 Bias low residue.*/
+ paddsw mm1,mm0
+ /*#6 Bias high residue.*/
+ paddsw mm2,mm0
+ /*#6 Pack to byte.*/
+ packuswb mm1,mm2
+ /*#7 Bias low residue.*/
+ paddsw mm3,mm0
+ /*#7 Bias high residue.*/
+ paddsw mm4,mm0
+ /*#7 Pack to byte.*/
+ packuswb mm3,mm4
+ /*#6 Write row.*/
+ movq [DST4+YSTRIDE*2],mm1
+ /*#7 Write row.*/
+ movq [DST4+YSTRIDE3],mm3
+#undef DST
+#undef DST4
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+ }
+}
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){
+ int i;
+ /*Zero mm0.*/
+ __asm pxor mm0,mm0;
+ for(i=4;i-->0;){
+ __asm{
+#define DST edx
+#define SRC ecx
+#define YSTRIDE edi
+#define RESIDUE eax
+ mov DST,_dst
+ mov SRC,_src
+ mov YSTRIDE,_ystride
+ mov RESIDUE,_residue
+ /*#0 Load source.*/
+ movq mm3,[SRC]
+ /*#1 Load source.*/
+ movq mm7,[SRC+YSTRIDE]
+ /*#0 Get copy of src.*/
+ movq mm4,mm3
+ /*#0 Expand high source.*/
+ punpckhbw mm4,mm0
+ /*#0 Expand low source.*/
+ punpcklbw mm3,mm0
+ /*#0 Add residue high.*/
+ paddsw mm4,[8+RESIDUE]
+ /*#1 Get copy of src.*/
+ movq mm2,mm7
+ /*#0 Add residue low.*/
+ paddsw mm3,[RESIDUE]
+ /*#1 Expand high source.*/
+ punpckhbw mm2,mm0
+ /*#0 Pack final row pixels.*/
+ packuswb mm3,mm4
+ /*#1 Expand low source.*/
+ punpcklbw mm7,mm0
+ /*#1 Add residue low.*/
+ paddsw mm7,[16+RESIDUE]
+ /*#1 Add residue high.*/
+ paddsw mm2,[24+RESIDUE]
+ /*Advance residue.*/
+ lea RESIDUE,[32+RESIDUE]
+ /*#1 Pack final row pixels.*/
+ packuswb mm7,mm2
+ /*Advance src.*/
+ lea SRC,[SRC+YSTRIDE*2]
+ /*#0 Write row.*/
+ movq [DST],mm3
+ /*#1 Write row.*/
+ movq [DST+YSTRIDE],mm7
+ /*Advance dst.*/
+ lea DST,[DST+YSTRIDE*2]
+ mov _residue,RESIDUE
+ mov _dst,DST
+ mov _src,SRC
+#undef DST
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+ }
+ }
+}
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
+ int i;
+ /*Zero mm7.*/
+ __asm pxor mm7,mm7;
+ for(i=4;i-->0;){
+ __asm{
+#define SRC1 ecx
+#define SRC2 edi
+#define YSTRIDE esi
+#define RESIDUE edx
+#define DST eax
+ mov YSTRIDE,_ystride
+ mov DST,_dst
+ mov RESIDUE,_residue
+ mov SRC1,_src1
+ mov SRC2,_src2
+ /*#0 Load src1.*/
+ movq mm0,[SRC1]
+ /*#0 Load src2.*/
+ movq mm2,[SRC2]
+ /*#0 Copy src1.*/
+ movq mm1,mm0
+ /*#0 Copy src2.*/
+ movq mm3,mm2
+ /*#1 Load src1.*/
+ movq mm4,[SRC1+YSTRIDE]
+ /*#0 Unpack lower src1.*/
+ punpcklbw mm0,mm7
+ /*#1 Load src2.*/
+ movq mm5,[SRC2+YSTRIDE]
+ /*#0 Unpack higher src1.*/
+ punpckhbw mm1,mm7
+ /*#0 Unpack lower src2.*/
+ punpcklbw mm2,mm7
+ /*#0 Unpack higher src2.*/
+ punpckhbw mm3,mm7
+ /*Advance src1 ptr.*/
+ lea SRC1,[SRC1+YSTRIDE*2]
+ /*Advance src2 ptr.*/
+ lea SRC2,[SRC2+YSTRIDE*2]
+ /*#0 Lower src1+src2.*/
+ paddsw mm0,mm2
+ /*#0 Higher src1+src2.*/
+ paddsw mm1,mm3
+ /*#1 Copy src1.*/
+ movq mm2,mm4
+ /*#0 Build lo average.*/
+ psraw mm0,1
+ /*#1 Copy src2.*/
+ movq mm3,mm5
+ /*#1 Unpack lower src1.*/
+ punpcklbw mm4,mm7
+ /*#0 Build hi average.*/
+ psraw mm1,1
+ /*#1 Unpack higher src1.*/
+ punpckhbw mm2,mm7
+ /*#0 low+=residue.*/
+ paddsw mm0,[RESIDUE]
+ /*#1 Unpack lower src2.*/
+ punpcklbw mm5,mm7
+ /*#0 high+=residue.*/
+ paddsw mm1,[8+RESIDUE]
+ /*#1 Unpack higher src2.*/
+ punpckhbw mm3,mm7
+ /*#1 Lower src1+src2.*/
+ paddsw mm5,mm4
+ /*#0 Pack and saturate.*/
+ packuswb mm0,mm1
+ /*#1 Higher src1+src2.*/
+ paddsw mm3,mm2
+ /*#0 Write row.*/
+ movq [DST],mm0
+ /*#1 Build lo average.*/
+ psraw mm5,1
+ /*#1 Build hi average.*/
+ psraw mm3,1
+ /*#1 low+=residue.*/
+ paddsw mm5,[16+RESIDUE]
+ /*#1 high+=residue.*/
+ paddsw mm3,[24+RESIDUE]
+ /*#1 Pack and saturate.*/
+ packuswb mm5,mm3
+ /*#1 Write row ptr.*/
+ movq [DST+YSTRIDE],mm5
+ /*Advance residue ptr.*/
+ add RESIDUE,32
+ /*Advance dest ptr.*/
+ lea DST,[DST+YSTRIDE*2]
+ mov _dst,DST
+ mov _residue,RESIDUE
+ mov _src1,SRC1
+ mov _src2,SRC2
+#undef SRC1
+#undef SRC2
+#undef YSTRIDE
+#undef RESIDUE
+#undef DST
+ }
+ }
+}
+
+void oc_restore_fpu_mmx(void){
+ __asm emms;
+}
+
+#endif