diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /media/libtheora/lib/x86 | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'media/libtheora/lib/x86')
-rw-r--r-- | media/libtheora/lib/x86/mmxfrag.c | 368 | ||||
-rw-r--r-- | media/libtheora/lib/x86/mmxidct.c | 562 | ||||
-rw-r--r-- | media/libtheora/lib/x86/mmxloop.h | 318 | ||||
-rw-r--r-- | media/libtheora/lib/x86/mmxstate.c | 226 | ||||
-rw-r--r-- | media/libtheora/lib/x86/sse2idct.c | 460 | ||||
-rw-r--r-- | media/libtheora/lib/x86/sse2trans.h | 242 | ||||
-rw-r--r-- | media/libtheora/lib/x86/x86cpu.c | 182 | ||||
-rw-r--r-- | media/libtheora/lib/x86/x86cpu.h | 36 | ||||
-rw-r--r-- | media/libtheora/lib/x86/x86int.h | 122 | ||||
-rw-r--r-- | media/libtheora/lib/x86/x86state.c | 95 |
10 files changed, 2611 insertions, 0 deletions
diff --git a/media/libtheora/lib/x86/mmxfrag.c b/media/libtheora/lib/x86/mmxfrag.c new file mode 100644 index 000000000..b7df1c1ec --- /dev/null +++ b/media/libtheora/lib/x86/mmxfrag.c @@ -0,0 +1,368 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $ + + ********************************************************************/ + +/*MMX acceleration of fragment reconstruction for motion compensation. + Originally written by Rudolf Marek. + Additional optimization by Nils Pipenbrinck. + Note: Loops are unrolled for best performance. + The iteration each instruction belongs to is marked in the comments as #i.*/ +#include <stddef.h> +#include "x86int.h" + +#if defined(OC_X86_ASM) + +/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes + between rows.*/ +# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ + do{ \ + const unsigned char *src; \ + unsigned char *dst; \ + ptrdiff_t ystride3; \ + src=(_src); \ + dst=(_dst); \ + __asm__ __volatile__( \ + /*src+0*ystride*/ \ + "movq (%[src]),%%mm0\n\t" \ + /*src+1*ystride*/ \ + "movq (%[src],%[ystride]),%%mm1\n\t" \ + /*ystride3=ystride*3*/ \ + "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ + /*src+2*ystride*/ \ + "movq (%[src],%[ystride],2),%%mm2\n\t" \ + /*src+3*ystride*/ \ + "movq (%[src],%[ystride3]),%%mm3\n\t" \ + /*dst+0*ystride*/ \ + "movq %%mm0,(%[dst])\n\t" \ + /*dst+1*ystride*/ \ + "movq %%mm1,(%[dst],%[ystride])\n\t" \ + /*Pointer to next 4.*/ \ + "lea (%[src],%[ystride],4),%[src]\n\t" \ + /*dst+2*ystride*/ \ + "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ + /*dst+3*ystride*/ \ + "movq %%mm3,(%[dst],%[ystride3])\n\t" \ + /*Pointer to next 4.*/ \ + "lea (%[dst],%[ystride],4),%[dst]\n\t" \ + /*src+0*ystride*/ \ + "movq (%[src]),%%mm0\n\t" \ + /*src+1*ystride*/ \ + "movq (%[src],%[ystride]),%%mm1\n\t" \ + /*src+2*ystride*/ \ + "movq (%[src],%[ystride],2),%%mm2\n\t" \ + /*src+3*ystride*/ \ + "movq (%[src],%[ystride3]),%%mm3\n\t" \ + /*dst+0*ystride*/ \ + "movq %%mm0,(%[dst])\n\t" \ + /*dst+1*ystride*/ \ + "movq %%mm1,(%[dst],%[ystride])\n\t" \ + /*dst+2*ystride*/ \ + "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ + /*dst+3*ystride*/ \ + "movq %%mm3,(%[dst],%[ystride3])\n\t" \ + :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \ + :[ystride]"r"((ptrdiff_t)(_ystride)) \ + :"memory" \ + ); \ + } \ + while(0) + +/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes + between rows.*/ +void oc_frag_copy_mmx(unsigned char *_dst, + const unsigned char *_src,int _ystride){ + OC_FRAG_COPY_MMX(_dst,_src,_ystride); +} + +/*Copies the fragments specified by the lists of fragment indices from one + frame to another. + _dst_frame: The reference frame to copy to. + _src_frame: The reference frame to copy from. + _ystride: The row stride of the reference frames. + _fragis: A pointer to a list of fragment indices. + _nfragis: The number of fragment indices to copy. + _frag_buf_offs: The offsets of fragments in the reference frames.*/ +void oc_frag_copy_list_mmx(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ + ptrdiff_t fragii; + for(fragii=0;fragii<_nfragis;fragii++){ + ptrdiff_t frag_buf_off; + frag_buf_off=_frag_buf_offs[_fragis[fragii]]; + OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off, + _src_frame+frag_buf_off,_ystride); + } +} + + +void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, + const ogg_int16_t *_residue){ + __asm__ __volatile__( + /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/ + "pcmpeqw %%mm0,%%mm0\n\t" + /*#0 Load low residue.*/ + "movq 0*8(%[residue]),%%mm1\n\t" + /*#0 Load high residue.*/ + "movq 1*8(%[residue]),%%mm2\n\t" + /*Set mm0 to 0x8000800080008000.*/ + "psllw $15,%%mm0\n\t" + /*#1 Load low residue.*/ + "movq 2*8(%[residue]),%%mm3\n\t" + /*#1 Load high residue.*/ + "movq 3*8(%[residue]),%%mm4\n\t" + /*Set mm0 to 0x0080008000800080.*/ + "psrlw $8,%%mm0\n\t" + /*#2 Load low residue.*/ + "movq 4*8(%[residue]),%%mm5\n\t" + /*#2 Load high residue.*/ + "movq 5*8(%[residue]),%%mm6\n\t" + /*#0 Bias low residue.*/ + "paddsw %%mm0,%%mm1\n\t" + /*#0 Bias high residue.*/ + "paddsw %%mm0,%%mm2\n\t" + /*#0 Pack to byte.*/ + "packuswb %%mm2,%%mm1\n\t" + /*#1 Bias low residue.*/ + "paddsw %%mm0,%%mm3\n\t" + /*#1 Bias high residue.*/ + "paddsw %%mm0,%%mm4\n\t" + /*#1 Pack to byte.*/ + "packuswb %%mm4,%%mm3\n\t" + /*#2 Bias low residue.*/ + "paddsw %%mm0,%%mm5\n\t" + /*#2 Bias high residue.*/ + "paddsw %%mm0,%%mm6\n\t" + /*#2 Pack to byte.*/ + "packuswb %%mm6,%%mm5\n\t" + /*#0 Write row.*/ + "movq %%mm1,(%[dst])\n\t" + /*#1 Write row.*/ + "movq %%mm3,(%[dst],%[ystride])\n\t" + /*#2 Write row.*/ + "movq %%mm5,(%[dst],%[ystride],2)\n\t" + /*#3 Load low residue.*/ + "movq 6*8(%[residue]),%%mm1\n\t" + /*#3 Load high residue.*/ + "movq 7*8(%[residue]),%%mm2\n\t" + /*#4 Load high residue.*/ + "movq 8*8(%[residue]),%%mm3\n\t" + /*#4 Load high residue.*/ + "movq 9*8(%[residue]),%%mm4\n\t" + /*#5 Load high residue.*/ + "movq 10*8(%[residue]),%%mm5\n\t" + /*#5 Load high residue.*/ + "movq 11*8(%[residue]),%%mm6\n\t" + /*#3 Bias low residue.*/ + "paddsw %%mm0,%%mm1\n\t" + /*#3 Bias high residue.*/ + "paddsw %%mm0,%%mm2\n\t" + /*#3 Pack to byte.*/ + "packuswb %%mm2,%%mm1\n\t" + /*#4 Bias low residue.*/ + "paddsw %%mm0,%%mm3\n\t" + /*#4 Bias high residue.*/ + "paddsw %%mm0,%%mm4\n\t" + /*#4 Pack to byte.*/ + "packuswb %%mm4,%%mm3\n\t" + /*#5 Bias low residue.*/ + "paddsw %%mm0,%%mm5\n\t" + /*#5 Bias high residue.*/ + "paddsw %%mm0,%%mm6\n\t" + /*#5 Pack to byte.*/ + "packuswb %%mm6,%%mm5\n\t" + /*#3 Write row.*/ + "movq %%mm1,(%[dst],%[ystride3])\n\t" + /*#4 Write row.*/ + "movq %%mm3,(%[dst4])\n\t" + /*#5 Write row.*/ + "movq %%mm5,(%[dst4],%[ystride])\n\t" + /*#6 Load low residue.*/ + "movq 12*8(%[residue]),%%mm1\n\t" + /*#6 Load high residue.*/ + "movq 13*8(%[residue]),%%mm2\n\t" + /*#7 Load low residue.*/ + "movq 14*8(%[residue]),%%mm3\n\t" + /*#7 Load high residue.*/ + "movq 15*8(%[residue]),%%mm4\n\t" + /*#6 Bias low residue.*/ + "paddsw %%mm0,%%mm1\n\t" + /*#6 Bias high residue.*/ + "paddsw %%mm0,%%mm2\n\t" + /*#6 Pack to byte.*/ + "packuswb %%mm2,%%mm1\n\t" + /*#7 Bias low residue.*/ + "paddsw %%mm0,%%mm3\n\t" + /*#7 Bias high residue.*/ + "paddsw %%mm0,%%mm4\n\t" + /*#7 Pack to byte.*/ + "packuswb %%mm4,%%mm3\n\t" + /*#6 Write row.*/ + "movq %%mm1,(%[dst4],%[ystride],2)\n\t" + /*#7 Write row.*/ + "movq %%mm3,(%[dst4],%[ystride3])\n\t" + : + :[residue]"r"(_residue), + [dst]"r"(_dst), + [dst4]"r"(_dst+(_ystride<<2)), + [ystride]"r"((ptrdiff_t)_ystride), + [ystride3]"r"((ptrdiff_t)_ystride*3) + :"memory" + ); +} + +void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src, + int _ystride,const ogg_int16_t *_residue){ + int i; + /*Zero mm0.*/ + __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); + for(i=4;i-->0;){ + __asm__ __volatile__( + /*#0 Load source.*/ + "movq (%[src]),%%mm3\n\t" + /*#1 Load source.*/ + "movq (%[src],%[ystride]),%%mm7\n\t" + /*#0 Get copy of src.*/ + "movq %%mm3,%%mm4\n\t" + /*#0 Expand high source.*/ + "punpckhbw %%mm0,%%mm4\n\t" + /*#0 Expand low source.*/ + "punpcklbw %%mm0,%%mm3\n\t" + /*#0 Add residue high.*/ + "paddsw 8(%[residue]),%%mm4\n\t" + /*#1 Get copy of src.*/ + "movq %%mm7,%%mm2\n\t" + /*#0 Add residue low.*/ + "paddsw (%[residue]), %%mm3\n\t" + /*#1 Expand high source.*/ + "punpckhbw %%mm0,%%mm2\n\t" + /*#0 Pack final row pixels.*/ + "packuswb %%mm4,%%mm3\n\t" + /*#1 Expand low source.*/ + "punpcklbw %%mm0,%%mm7\n\t" + /*#1 Add residue low.*/ + "paddsw 16(%[residue]),%%mm7\n\t" + /*#1 Add residue high.*/ + "paddsw 24(%[residue]),%%mm2\n\t" + /*Advance residue.*/ + "lea 32(%[residue]),%[residue]\n\t" + /*#1 Pack final row pixels.*/ + "packuswb %%mm2,%%mm7\n\t" + /*Advance src.*/ + "lea (%[src],%[ystride],2),%[src]\n\t" + /*#0 Write row.*/ + "movq %%mm3,(%[dst])\n\t" + /*#1 Write row.*/ + "movq %%mm7,(%[dst],%[ystride])\n\t" + /*Advance dst.*/ + "lea (%[dst],%[ystride],2),%[dst]\n\t" + :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src) + :[ystride]"r"((ptrdiff_t)_ystride) + :"memory" + ); + } +} + +void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){ + int i; + /*Zero mm7.*/ + __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::); + for(i=4;i-->0;){ + __asm__ __volatile__( + /*#0 Load src1.*/ + "movq (%[src1]),%%mm0\n\t" + /*#0 Load src2.*/ + "movq (%[src2]),%%mm2\n\t" + /*#0 Copy src1.*/ + "movq %%mm0,%%mm1\n\t" + /*#0 Copy src2.*/ + "movq %%mm2,%%mm3\n\t" + /*#1 Load src1.*/ + "movq (%[src1],%[ystride]),%%mm4\n\t" + /*#0 Unpack lower src1.*/ + "punpcklbw %%mm7,%%mm0\n\t" + /*#1 Load src2.*/ + "movq (%[src2],%[ystride]),%%mm5\n\t" + /*#0 Unpack higher src1.*/ + "punpckhbw %%mm7,%%mm1\n\t" + /*#0 Unpack lower src2.*/ + "punpcklbw %%mm7,%%mm2\n\t" + /*#0 Unpack higher src2.*/ + "punpckhbw %%mm7,%%mm3\n\t" + /*Advance src1 ptr.*/ + "lea (%[src1],%[ystride],2),%[src1]\n\t" + /*Advance src2 ptr.*/ + "lea (%[src2],%[ystride],2),%[src2]\n\t" + /*#0 Lower src1+src2.*/ + "paddsw %%mm2,%%mm0\n\t" + /*#0 Higher src1+src2.*/ + "paddsw %%mm3,%%mm1\n\t" + /*#1 Copy src1.*/ + "movq %%mm4,%%mm2\n\t" + /*#0 Build lo average.*/ + "psraw $1,%%mm0\n\t" + /*#1 Copy src2.*/ + "movq %%mm5,%%mm3\n\t" + /*#1 Unpack lower src1.*/ + "punpcklbw %%mm7,%%mm4\n\t" + /*#0 Build hi average.*/ + "psraw $1,%%mm1\n\t" + /*#1 Unpack higher src1.*/ + "punpckhbw %%mm7,%%mm2\n\t" + /*#0 low+=residue.*/ + "paddsw (%[residue]),%%mm0\n\t" + /*#1 Unpack lower src2.*/ + "punpcklbw %%mm7,%%mm5\n\t" + /*#0 high+=residue.*/ + "paddsw 8(%[residue]),%%mm1\n\t" + /*#1 Unpack higher src2.*/ + "punpckhbw %%mm7,%%mm3\n\t" + /*#1 Lower src1+src2.*/ + "paddsw %%mm4,%%mm5\n\t" + /*#0 Pack and saturate.*/ + "packuswb %%mm1,%%mm0\n\t" + /*#1 Higher src1+src2.*/ + "paddsw %%mm2,%%mm3\n\t" + /*#0 Write row.*/ + "movq %%mm0,(%[dst])\n\t" + /*#1 Build lo average.*/ + "psraw $1,%%mm5\n\t" + /*#1 Build hi average.*/ + "psraw $1,%%mm3\n\t" + /*#1 low+=residue.*/ + "paddsw 16(%[residue]),%%mm5\n\t" + /*#1 high+=residue.*/ + "paddsw 24(%[residue]),%%mm3\n\t" + /*#1 Pack and saturate.*/ + "packuswb %%mm3,%%mm5\n\t" + /*#1 Write row ptr.*/ + "movq %%mm5,(%[dst],%[ystride])\n\t" + /*Advance residue ptr.*/ + "add $32,%[residue]\n\t" + /*Advance dest ptr.*/ + "lea (%[dst],%[ystride],2),%[dst]\n\t" + :[dst]"+r"(_dst),[residue]"+r"(_residue), + [src1]"+%r"(_src1),[src2]"+r"(_src2) + :[ystride]"r"((ptrdiff_t)_ystride) + :"memory" + ); + } +} + +void oc_restore_fpu_mmx(void){ + __asm__ __volatile__("emms\n\t"); +} +#endif diff --git a/media/libtheora/lib/x86/mmxidct.c b/media/libtheora/lib/x86/mmxidct.c new file mode 100644 index 000000000..8d61bdfb1 --- /dev/null +++ b/media/libtheora/lib/x86/mmxidct.c @@ -0,0 +1,562 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $ + + ********************************************************************/ + +/*MMX acceleration of Theora's iDCT. + Originally written by Rudolf Marek, based on code from On2's VP3.*/ +#include "x86int.h" +#include "../dct.h" + +#if defined(OC_X86_ASM) + +/*These are offsets into the table of constants below.*/ +/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/ +#define OC_COSINE_OFFSET (0) +/*A row of 8's.*/ +#define OC_EIGHT_OFFSET (56) + + + +/*38 cycles*/ +#define OC_IDCT_BEGIN(_y,_x) \ + "#OC_IDCT_BEGIN\n\t" \ + "movq "OC_I(3,_x)",%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ + "movq %%mm2,%%mm4\n\t" \ + "movq "OC_J(5,_x)",%%mm7\n\t" \ + "pmulhw %%mm6,%%mm4\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ + "pmulhw %%mm7,%%mm6\n\t" \ + "movq %%mm1,%%mm5\n\t" \ + "pmulhw %%mm2,%%mm1\n\t" \ + "movq "OC_I(1,_x)",%%mm3\n\t" \ + "pmulhw %%mm7,%%mm5\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ + "paddw %%mm2,%%mm4\n\t" \ + "paddw %%mm7,%%mm6\n\t" \ + "paddw %%mm1,%%mm2\n\t" \ + "movq "OC_J(7,_x)",%%mm1\n\t" \ + "paddw %%mm5,%%mm7\n\t" \ + "movq %%mm0,%%mm5\n\t" \ + "pmulhw %%mm3,%%mm0\n\t" \ + "paddw %%mm7,%%mm4\n\t" \ + "pmulhw %%mm1,%%mm5\n\t" \ + "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \ + "psubw %%mm2,%%mm6\n\t" \ + "paddw %%mm3,%%mm0\n\t" \ + "pmulhw %%mm7,%%mm3\n\t" \ + "movq "OC_I(2,_x)",%%mm2\n\t" \ + "pmulhw %%mm1,%%mm7\n\t" \ + "paddw %%mm1,%%mm5\n\t" \ + "movq %%mm2,%%mm1\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \ + "psubw %%mm5,%%mm3\n\t" \ + "movq "OC_J(6,_x)",%%mm5\n\t" \ + "paddw %%mm7,%%mm0\n\t" \ + "movq %%mm5,%%mm7\n\t" \ + "psubw %%mm4,%%mm0\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ + "paddw %%mm1,%%mm2\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ + "paddw %%mm4,%%mm4\n\t" \ + "paddw %%mm0,%%mm4\n\t" \ + "psubw %%mm6,%%mm3\n\t" \ + "paddw %%mm7,%%mm5\n\t" \ + "paddw %%mm6,%%mm6\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ + "paddw %%mm3,%%mm6\n\t" \ + "movq %%mm4,"OC_I(1,_y)"\n\t" \ + "psubw %%mm5,%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ + "movq %%mm3,%%mm5\n\t" \ + "pmulhw %%mm4,%%mm3\n\t" \ + "paddw %%mm2,%%mm7\n\t" \ + "movq %%mm6,"OC_I(2,_y)"\n\t" \ + "movq %%mm0,%%mm2\n\t" \ + "movq "OC_I(0,_x)",%%mm6\n\t" \ + "pmulhw %%mm4,%%mm0\n\t" \ + "paddw %%mm3,%%mm5\n\t" \ + "movq "OC_J(4,_x)",%%mm3\n\t" \ + "psubw %%mm1,%%mm5\n\t" \ + "paddw %%mm0,%%mm2\n\t" \ + "psubw %%mm3,%%mm6\n\t" \ + "movq %%mm6,%%mm0\n\t" \ + "pmulhw %%mm4,%%mm6\n\t" \ + "paddw %%mm3,%%mm3\n\t" \ + "paddw %%mm1,%%mm1\n\t" \ + "paddw %%mm0,%%mm3\n\t" \ + "paddw %%mm5,%%mm1\n\t" \ + "pmulhw %%mm3,%%mm4\n\t" \ + "paddw %%mm0,%%mm6\n\t" \ + "psubw %%mm2,%%mm6\n\t" \ + "paddw %%mm2,%%mm2\n\t" \ + "movq "OC_I(1,_y)",%%mm0\n\t" \ + "paddw %%mm6,%%mm2\n\t" \ + "paddw %%mm3,%%mm4\n\t" \ + "psubw %%mm1,%%mm2\n\t" \ + "#end OC_IDCT_BEGIN\n\t" \ + +/*38+8=46 cycles.*/ +#define OC_ROW_IDCT(_y,_x) \ + "#OC_ROW_IDCT\n" \ + OC_IDCT_BEGIN(_y,_x) \ + /*r3=D'*/ \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ + /*r4=E'=E-G*/ \ + "psubw %%mm7,%%mm4\n\t" \ + /*r1=H'+H'*/ \ + "paddw %%mm1,%%mm1\n\t" \ + /*r7=G+G*/ \ + "paddw %%mm7,%%mm7\n\t" \ + /*r1=R1=A''+H'*/ \ + "paddw %%mm2,%%mm1\n\t" \ + /*r7=G'=E+G*/ \ + "paddw %%mm4,%%mm7\n\t" \ + /*r4=R4=E'-D'*/ \ + "psubw %%mm3,%%mm4\n\t" \ + "paddw %%mm3,%%mm3\n\t" \ + /*r6=R6=F'-B''*/ \ + "psubw %%mm5,%%mm6\n\t" \ + "paddw %%mm5,%%mm5\n\t" \ + /*r3=R3=E'+D'*/ \ + "paddw %%mm4,%%mm3\n\t" \ + /*r5=R5=F'+B''*/ \ + "paddw %%mm6,%%mm5\n\t" \ + /*r7=R7=G'-C'*/ \ + "psubw %%mm0,%%mm7\n\t" \ + "paddw %%mm0,%%mm0\n\t" \ + /*Save R1.*/ \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ + /*r0=R0=G.+C.*/ \ + "paddw %%mm7,%%mm0\n\t" \ + "#end OC_ROW_IDCT\n\t" \ + +/*The following macro does two 4x4 transposes in place. + At entry, we assume: + r0 = a3 a2 a1 a0 + I(1) = b3 b2 b1 b0 + r2 = c3 c2 c1 c0 + r3 = d3 d2 d1 d0 + + r4 = e3 e2 e1 e0 + r5 = f3 f2 f1 f0 + r6 = g3 g2 g1 g0 + r7 = h3 h2 h1 h0 + + At exit, we have: + I(0) = d0 c0 b0 a0 + I(1) = d1 c1 b1 a1 + I(2) = d2 c2 b2 a2 + I(3) = d3 c3 b3 a3 + + J(4) = h0 g0 f0 e0 + J(5) = h1 g1 f1 e1 + J(6) = h2 g2 f2 e2 + J(7) = h3 g3 f3 e3 + + I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. + J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. + + Since r1 is free at entry, we calculate the Js first.*/ +/*19 cycles.*/ +#define OC_TRANSPOSE(_y) \ + "#OC_TRANSPOSE\n\t" \ + "movq %%mm4,%%mm1\n\t" \ + "punpcklwd %%mm5,%%mm4\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ + "punpckhwd %%mm5,%%mm1\n\t" \ + "movq %%mm6,%%mm0\n\t" \ + "punpcklwd %%mm7,%%mm6\n\t" \ + "movq %%mm4,%%mm5\n\t" \ + "punpckldq %%mm6,%%mm4\n\t" \ + "punpckhdq %%mm6,%%mm5\n\t" \ + "movq %%mm1,%%mm6\n\t" \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ + "punpckhwd %%mm7,%%mm0\n\t" \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ + "punpckhdq %%mm0,%%mm6\n\t" \ + "movq "OC_I(0,_y)",%%mm4\n\t" \ + "punpckldq %%mm0,%%mm1\n\t" \ + "movq "OC_I(1,_y)",%%mm5\n\t" \ + "movq %%mm4,%%mm0\n\t" \ + "movq %%mm6,"OC_J(7,_y)"\n\t" \ + "punpcklwd %%mm5,%%mm0\n\t" \ + "movq %%mm1,"OC_J(6,_y)"\n\t" \ + "punpckhwd %%mm5,%%mm4\n\t" \ + "movq %%mm2,%%mm5\n\t" \ + "punpcklwd %%mm3,%%mm2\n\t" \ + "movq %%mm0,%%mm1\n\t" \ + "punpckldq %%mm2,%%mm0\n\t" \ + "punpckhdq %%mm2,%%mm1\n\t" \ + "movq %%mm4,%%mm2\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ + "punpckhwd %%mm3,%%mm5\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ + "punpckhdq %%mm5,%%mm4\n\t" \ + "punpckldq %%mm5,%%mm2\n\t" \ + "movq %%mm4,"OC_I(3,_y)"\n\t" \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ + "#end OC_TRANSPOSE\n\t" \ + +/*38+19=57 cycles.*/ +#define OC_COLUMN_IDCT(_y) \ + "#OC_COLUMN_IDCT\n" \ + OC_IDCT_BEGIN(_y,_y) \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ + /*r1=H'+H'*/ \ + "paddw %%mm1,%%mm1\n\t" \ + /*r1=R1=A''+H'*/ \ + "paddw %%mm2,%%mm1\n\t" \ + /*r2=NR2*/ \ + "psraw $4,%%mm2\n\t" \ + /*r4=E'=E-G*/ \ + "psubw %%mm7,%%mm4\n\t" \ + /*r1=NR1*/ \ + "psraw $4,%%mm1\n\t" \ + /*r3=D'*/ \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ + /*r7=G+G*/ \ + "paddw %%mm7,%%mm7\n\t" \ + /*Store NR2 at I(2).*/ \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ + /*r7=G'=E+G*/ \ + "paddw %%mm4,%%mm7\n\t" \ + /*Store NR1 at I(1).*/ \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ + /*r4=R4=E'-D'*/ \ + "psubw %%mm3,%%mm4\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ + /*r3=D'+D'*/ \ + "paddw %%mm3,%%mm3\n\t" \ + /*r3=R3=E'+D'*/ \ + "paddw %%mm4,%%mm3\n\t" \ + /*r4=NR4*/ \ + "psraw $4,%%mm4\n\t" \ + /*r6=R6=F'-B''*/ \ + "psubw %%mm5,%%mm6\n\t" \ + /*r3=NR3*/ \ + "psraw $4,%%mm3\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ + /*r5=B''+B''*/ \ + "paddw %%mm5,%%mm5\n\t" \ + /*r5=R5=F'+B''*/ \ + "paddw %%mm6,%%mm5\n\t" \ + /*r6=NR6*/ \ + "psraw $4,%%mm6\n\t" \ + /*Store NR4 at J(4).*/ \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ + /*r5=NR5*/ \ + "psraw $4,%%mm5\n\t" \ + /*Store NR3 at I(3).*/ \ + "movq %%mm3,"OC_I(3,_y)"\n\t" \ + /*r7=R7=G'-C'*/ \ + "psubw %%mm0,%%mm7\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ + /*r0=C'+C'*/ \ + "paddw %%mm0,%%mm0\n\t" \ + /*r0=R0=G'+C'*/ \ + "paddw %%mm7,%%mm0\n\t" \ + /*r7=NR7*/ \ + "psraw $4,%%mm7\n\t" \ + /*Store NR6 at J(6).*/ \ + "movq %%mm6,"OC_J(6,_y)"\n\t" \ + /*r0=NR0*/ \ + "psraw $4,%%mm0\n\t" \ + /*Store NR5 at J(5).*/ \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ + /*Store NR7 at J(7).*/ \ + "movq %%mm7,"OC_J(7,_y)"\n\t" \ + /*Store NR0 at I(0).*/ \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ + "#end OC_COLUMN_IDCT\n\t" \ + +static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + /*This routine accepts an 8x8 matrix, but in partially transposed form. + Every 4x4 block is transposed.*/ + __asm__ __volatile__( +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) + OC_ROW_IDCT(y,x) + OC_TRANSPOSE(y) +#undef OC_I +#undef OC_J +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y) + OC_ROW_IDCT(y,x) + OC_TRANSPOSE(y) +#undef OC_I +#undef OC_J +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(y) +#undef OC_I +#undef OC_J +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(y) +#undef OC_I +#undef OC_J + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) + ); + if(_x!=_y){ + int i; + __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); + for(i=0;i<4;i++){ + __asm__ __volatile__( + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t" + :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) + ); + } + } +} + +/*25 cycles.*/ +#define OC_IDCT_BEGIN_10(_y,_x) \ + "#OC_IDCT_BEGIN_10\n\t" \ + "movq "OC_I(3,_x)",%%mm2\n\t" \ + "nop\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ + "movq %%mm2,%%mm4\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ + "pmulhw %%mm6,%%mm4\n\t" \ + "movq "OC_I(1,_x)",%%mm3\n\t" \ + "pmulhw %%mm2,%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ + "paddw %%mm2,%%mm4\n\t" \ + "pxor %%mm6,%%mm6\n\t" \ + "paddw %%mm1,%%mm2\n\t" \ + "movq "OC_I(2,_x)",%%mm5\n\t" \ + "pmulhw %%mm3,%%mm0\n\t" \ + "movq %%mm5,%%mm1\n\t" \ + "paddw %%mm3,%%mm0\n\t" \ + "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ + "psubw %%mm2,%%mm6\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ + "psubw %%mm4,%%mm0\n\t" \ + "movq "OC_I(2,_x)",%%mm7\n\t" \ + "paddw %%mm4,%%mm4\n\t" \ + "paddw %%mm5,%%mm7\n\t" \ + "paddw %%mm0,%%mm4\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ + "psubw %%mm6,%%mm3\n\t" \ + "movq %%mm4,"OC_I(1,_y)"\n\t" \ + "paddw %%mm6,%%mm6\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ + "paddw %%mm3,%%mm6\n\t" \ + "movq %%mm3,%%mm5\n\t" \ + "pmulhw %%mm4,%%mm3\n\t" \ + "movq %%mm6,"OC_I(2,_y)"\n\t" \ + "movq %%mm0,%%mm2\n\t" \ + "movq "OC_I(0,_x)",%%mm6\n\t" \ + "pmulhw %%mm4,%%mm0\n\t" \ + "paddw %%mm3,%%mm5\n\t" \ + "paddw %%mm0,%%mm2\n\t" \ + "psubw %%mm1,%%mm5\n\t" \ + "pmulhw %%mm4,%%mm6\n\t" \ + "paddw "OC_I(0,_x)",%%mm6\n\t" \ + "paddw %%mm1,%%mm1\n\t" \ + "movq %%mm6,%%mm4\n\t" \ + "paddw %%mm5,%%mm1\n\t" \ + "psubw %%mm2,%%mm6\n\t" \ + "paddw %%mm2,%%mm2\n\t" \ + "movq "OC_I(1,_y)",%%mm0\n\t" \ + "paddw %%mm6,%%mm2\n\t" \ + "psubw %%mm1,%%mm2\n\t" \ + "nop\n\t" \ + "#end OC_IDCT_BEGIN_10\n\t" \ + +/*25+8=33 cycles.*/ +#define OC_ROW_IDCT_10(_y,_x) \ + "#OC_ROW_IDCT_10\n\t" \ + OC_IDCT_BEGIN_10(_y,_x) \ + /*r3=D'*/ \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ + /*r4=E'=E-G*/ \ + "psubw %%mm7,%%mm4\n\t" \ + /*r1=H'+H'*/ \ + "paddw %%mm1,%%mm1\n\t" \ + /*r7=G+G*/ \ + "paddw %%mm7,%%mm7\n\t" \ + /*r1=R1=A''+H'*/ \ + "paddw %%mm2,%%mm1\n\t" \ + /*r7=G'=E+G*/ \ + "paddw %%mm4,%%mm7\n\t" \ + /*r4=R4=E'-D'*/ \ + "psubw %%mm3,%%mm4\n\t" \ + "paddw %%mm3,%%mm3\n\t" \ + /*r6=R6=F'-B''*/ \ + "psubw %%mm5,%%mm6\n\t" \ + "paddw %%mm5,%%mm5\n\t" \ + /*r3=R3=E'+D'*/ \ + "paddw %%mm4,%%mm3\n\t" \ + /*r5=R5=F'+B''*/ \ + "paddw %%mm6,%%mm5\n\t" \ + /*r7=R7=G'-C'*/ \ + "psubw %%mm0,%%mm7\n\t" \ + "paddw %%mm0,%%mm0\n\t" \ + /*Save R1.*/ \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ + /*r0=R0=G'+C'*/ \ + "paddw %%mm7,%%mm0\n\t" \ + "#end OC_ROW_IDCT_10\n\t" \ + +/*25+19=44 cycles'*/ +#define OC_COLUMN_IDCT_10(_y) \ + "#OC_COLUMN_IDCT_10\n\t" \ + OC_IDCT_BEGIN_10(_y,_y) \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ + /*r1=H'+H'*/ \ + "paddw %%mm1,%%mm1\n\t" \ + /*r1=R1=A''+H'*/ \ + "paddw %%mm2,%%mm1\n\t" \ + /*r2=NR2*/ \ + "psraw $4,%%mm2\n\t" \ + /*r4=E'=E-G*/ \ + "psubw %%mm7,%%mm4\n\t" \ + /*r1=NR1*/ \ + "psraw $4,%%mm1\n\t" \ + /*r3=D'*/ \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ + /*r7=G+G*/ \ + "paddw %%mm7,%%mm7\n\t" \ + /*Store NR2 at I(2).*/ \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ + /*r7=G'=E+G*/ \ + "paddw %%mm4,%%mm7\n\t" \ + /*Store NR1 at I(1).*/ \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ + /*r4=R4=E'-D'*/ \ + "psubw %%mm3,%%mm4\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ + /*r3=D'+D'*/ \ + "paddw %%mm3,%%mm3\n\t" \ + /*r3=R3=E'+D'*/ \ + "paddw %%mm4,%%mm3\n\t" \ + /*r4=NR4*/ \ + "psraw $4,%%mm4\n\t" \ + /*r6=R6=F'-B''*/ \ + "psubw %%mm5,%%mm6\n\t" \ + /*r3=NR3*/ \ + "psraw $4,%%mm3\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ + /*r5=B''+B''*/ \ + "paddw %%mm5,%%mm5\n\t" \ + /*r5=R5=F'+B''*/ \ + "paddw %%mm6,%%mm5\n\t" \ + /*r6=NR6*/ \ + "psraw $4,%%mm6\n\t" \ + /*Store NR4 at J(4).*/ \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ + /*r5=NR5*/ \ + "psraw $4,%%mm5\n\t" \ + /*Store NR3 at I(3).*/ \ + "movq %%mm3,"OC_I(3,_y)"\n\t" \ + /*r7=R7=G'-C'*/ \ + "psubw %%mm0,%%mm7\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ + /*r0=C'+C'*/ \ + "paddw %%mm0,%%mm0\n\t" \ + /*r0=R0=G'+C'*/ \ + "paddw %%mm7,%%mm0\n\t" \ + /*r7=NR7*/ \ + "psraw $4,%%mm7\n\t" \ + /*Store NR6 at J(6).*/ \ + "movq %%mm6,"OC_J(6,_y)"\n\t" \ + /*r0=NR0*/ \ + "psraw $4,%%mm0\n\t" \ + /*Store NR5 at J(5).*/ \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ + /*Store NR7 at J(7).*/ \ + "movq %%mm7,"OC_J(7,_y)"\n\t" \ + /*Store NR0 at I(0).*/ \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ + "#end OC_COLUMN_IDCT_10\n\t" \ + +static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + __asm__ __volatile__( +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) + /*Done with dequant, descramble, and partial transpose. + Now do the iDCT itself.*/ + OC_ROW_IDCT_10(y,x) + OC_TRANSPOSE(y) +#undef OC_I +#undef OC_J +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(y) +#undef OC_I +#undef OC_J +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(y) +#undef OC_I +#undef OC_J + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) + ); + if(_x!=_y){ + __asm__ __volatile__( + "pxor %%mm0,%%mm0\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" + :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28) + ); + } +} + +/*Performs an inverse 8x8 Type-II DCT transform. + The input is assumed to be scaled by a factor of 4 relative to orthonormal + version of the transform.*/ +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ + /*_last_zzi is subtly different from an actual count of the number of + coefficients we decoded for this block. + It contains the value of zzi BEFORE the final token in the block was + decoded. + In most cases this is an EOB token (the continuation of an EOB run from a + previous block counts), and so this is the same as the coefficient count. + However, in the case that the last token was NOT an EOB token, but filled + the block up with exactly 64 coefficients, _last_zzi will be less than 64. + Provided the last token was not a pure zero run, the minimum value it can + be is 46, and so that doesn't affect any of the cases in this routine. + However, if the last token WAS a pure zero run of length 63, then _last_zzi + will be 1 while the number of coefficients decoded is 64. + Thus, we will trigger the following special case, where the real + coefficient count would not. + Note also that a zero run of length 64 will give _last_zzi a value of 0, + but we still process the DC coefficient, which might have a non-zero value + due to DC prediction. + Although convoluted, this is arguably the correct behavior: it allows us to + use a smaller transform when the block ends with a long zero run instead + of a normal EOB token. + It could be smarter... multiple separate zero runs at the end of a block + will fool it, but an encoder that generates these really deserves what it + gets. + Needless to say we inherited this approach from VP3.*/ + /*Then perform the iDCT.*/ + if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x); + else oc_idct8x8_slow_mmx(_y,_x); +} + +#endif diff --git a/media/libtheora/lib/x86/mmxloop.h b/media/libtheora/lib/x86/mmxloop.h new file mode 100644 index 000000000..1f6090b56 --- /dev/null +++ b/media/libtheora/lib/x86/mmxloop.h @@ -0,0 +1,318 @@ +#if !defined(_x86_mmxloop_H) +# define _x86_mmxloop_H (1) +# include <stddef.h> +# include "x86int.h" + +#if defined(OC_X86_ASM) + +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. + On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and + mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ +#define OC_LOOP_FILTER8_MMX \ + "#OC_LOOP_FILTER8_MMX\n\t" \ + /*mm7=0*/ \ + "pxor %%mm7,%%mm7\n\t" \ + /*mm6:mm0={a0,...,a7}*/ \ + "movq %%mm0,%%mm6\n\t" \ + "punpcklbw %%mm7,%%mm0\n\t" \ + "punpckhbw %%mm7,%%mm6\n\t" \ + /*mm3:mm5={d0,...,d7}*/ \ + "movq %%mm3,%%mm5\n\t" \ + "punpcklbw %%mm7,%%mm3\n\t" \ + "punpckhbw %%mm7,%%mm5\n\t" \ + /*mm6:mm0={a0-d0,...,a7-d7}*/ \ + "psubw %%mm3,%%mm0\n\t" \ + "psubw %%mm5,%%mm6\n\t" \ + /*mm3:mm1={b0,...,b7}*/ \ + "movq %%mm1,%%mm3\n\t" \ + "punpcklbw %%mm7,%%mm1\n\t" \ + "movq %%mm2,%%mm4\n\t" \ + "punpckhbw %%mm7,%%mm3\n\t" \ + /*mm5:mm4={c0,...,c7}*/ \ + "movq %%mm2,%%mm5\n\t" \ + "punpcklbw %%mm7,%%mm4\n\t" \ + "punpckhbw %%mm7,%%mm5\n\t" \ + /*mm7={3}x4 \ + mm5:mm4={c0-b0,...,c7-b7}*/ \ + "pcmpeqw %%mm7,%%mm7\n\t" \ + "psubw %%mm1,%%mm4\n\t" \ + "psrlw $14,%%mm7\n\t" \ + "psubw %%mm3,%%mm5\n\t" \ + /*Scale by 3.*/ \ + "pmullw %%mm7,%%mm4\n\t" \ + "pmullw %%mm7,%%mm5\n\t" \ + /*mm7={4}x4 \ + mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ + "psrlw $1,%%mm7\n\t" \ + "paddw %%mm0,%%mm4\n\t" \ + "psllw $2,%%mm7\n\t" \ + "movq (%[ll]),%%mm0\n\t" \ + "paddw %%mm6,%%mm5\n\t" \ + /*R_i has the range [-127,128], so we compute -R_i instead. \ + mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ + "psubw %%mm7,%%mm4\n\t" \ + "psubw %%mm7,%%mm5\n\t" \ + "psraw $3,%%mm4\n\t" \ + "psraw $3,%%mm5\n\t" \ + "pcmpeqb %%mm7,%%mm7\n\t" \ + "packsswb %%mm5,%%mm4\n\t" \ + "pxor %%mm6,%%mm6\n\t" \ + "pxor %%mm7,%%mm4\n\t" \ + "packuswb %%mm3,%%mm1\n\t" \ + /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \ + /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ + we have to split things by sign (the other option is to work in 16 bits, \ + but working in 8 bits gives much better parallelism). \ + We compute abs(R_i), but save a mask of which terms were negative in mm6. \ + Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \ + Finally, we split mm4 into positive and negative pieces using the mask in \ + mm6, and add and subtract them as appropriate.*/ \ + /*mm4=abs(-R_i)*/ \ + /*mm7=255-2*L*/ \ + "pcmpgtb %%mm4,%%mm6\n\t" \ + "psubb %%mm0,%%mm7\n\t" \ + "pxor %%mm6,%%mm4\n\t" \ + "psubb %%mm0,%%mm7\n\t" \ + "psubb %%mm6,%%mm4\n\t" \ + /*mm7=255-max(2*L-abs(R_i),0)*/ \ + "paddusb %%mm4,%%mm7\n\t" \ + /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ + "paddusb %%mm7,%%mm4\n\t" \ + "psubusb %%mm7,%%mm4\n\t" \ + /*Now split mm4 by the original sign of -R_i.*/ \ + "movq %%mm4,%%mm5\n\t" \ + "pand %%mm6,%%mm4\n\t" \ + "pandn %%mm5,%%mm6\n\t" \ + /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ + /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ + "paddusb %%mm4,%%mm1\n\t" \ + "psubusb %%mm4,%%mm2\n\t" \ + "psubusb %%mm6,%%mm1\n\t" \ + "paddusb %%mm6,%%mm2\n\t" \ + +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. + On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and + mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}. + All other MMX registers are clobbered.*/ +#define OC_LOOP_FILTER8_MMXEXT \ + "#OC_LOOP_FILTER8_MMXEXT\n\t" \ + /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \ + -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \ + /*This first part is based on the transformation \ + f = -(3*(c-b)+a-d+4>>3) \ + = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \ + = -(3*(c+~b)+(a+~d)-1016>>3) \ + = 127-(3*(c+~b)+(a+~d)>>3) \ + = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \ + Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \ + fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \ + Using this, the last expression above can be computed in 8 bits of working \ + precision via: \ + u = ~pavgb(~b,c); \ + v = pavgb(b,~c); \ + This mask is 0 or 0xFF, and controls whether t is biased up or down: \ + m = u-v; \ + t = m^pavgb(m^~a,m^d); \ + f = 128+pavgb(pavgb(t,u),v); \ + This required some careful analysis to ensure that carries are propagated \ + correctly in all cases, but has been checked exhaustively.*/ \ + /*input (a, b, c, d, ., ., ., .)*/ \ + /*ff=0xFF; \ + u=b; \ + v=c; \ + ll=255-2*L;*/ \ + "pcmpeqb %%mm7,%%mm7\n\t" \ + "movq %%mm1,%%mm4\n\t" \ + "movq %%mm2,%%mm5\n\t" \ + "movq (%[ll]),%%mm6\n\t" \ + /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \ + /*u^=ff; \ + v^=ff;*/ \ + "pxor %%mm7,%%mm4\n\t" \ + "pxor %%mm7,%%mm5\n\t" \ + /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \ + /*u=pavgb(u,c); \ + v=pavgb(v,b);*/ \ + "pavgb %%mm2,%%mm4\n\t" \ + "pavgb %%mm1,%%mm5\n\t" \ + /*u^=ff; \ + a^=ff;*/ \ + "pxor %%mm7,%%mm4\n\t" \ + "pxor %%mm7,%%mm0\n\t" \ + /*m=u-v;*/ \ + "psubb %%mm5,%%mm4\n\t" \ + /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \ + /*a^=m; \ + d^=m;*/ \ + "pxor %%mm4,%%mm0\n\t" \ + "pxor %%mm4,%%mm3\n\t" \ + /*t=pavgb(a,d);*/ \ + "pavgb %%mm3,%%mm0\n\t" \ + "psllw $7,%%mm7\n\t" \ + /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \ + /*t^=m; \ + u=m+v;*/ \ + "pxor %%mm4,%%mm0\n\t" \ + "paddb %%mm5,%%mm4\n\t" \ + /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \ + /*f=pavgb(f,u); \ + of=128;*/ \ + "pavgb %%mm4,%%mm0\n\t" \ + "packsswb %%mm7,%%mm7\n\t" \ + /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \ + /*f=pavgb(f,v);*/ \ + "pavgb %%mm5,%%mm0\n\t" \ + "movq %%mm7,%%mm3\n\t" \ + "movq %%mm6,%%mm4\n\t" \ + /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \ + /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the sepc.*/ \ + /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ + we have to split things by sign (the other option is to work in 16 bits, \ + but staying in 8 bits gives much better parallelism).*/ \ + /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \ + This is the same number of instructions as computing a mask and splitting \ + after the lflim computation, but has shorter dependency chains.*/ \ + /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\ + mm3=R_i>0?R_i:0* (denoted abs(R_i>0))*/ \ + "psubusb %%mm0,%%mm3\n\t" \ + "psubusb %%mm7,%%mm0\n\t" \ + /*mm6=255-max(2*L-abs(R_i<0),0) \ + mm4=255-max(2*L-abs(R_i>0),0)*/ \ + "paddusb %%mm3,%%mm4\n\t" \ + "paddusb %%mm0,%%mm6\n\t" \ + /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \ + mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \ + "paddusb %%mm4,%%mm3\n\t" \ + "paddusb %%mm6,%%mm0\n\t" \ + "psubusb %%mm4,%%mm3\n\t" \ + "psubusb %%mm6,%%mm0\n\t" \ + /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ + /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ + "paddusb %%mm3,%%mm1\n\t" \ + "psubusb %%mm3,%%mm2\n\t" \ + "psubusb %%mm0,%%mm1\n\t" \ + "paddusb %%mm0,%%mm2\n\t" \ + +#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \ + do{ \ + ptrdiff_t ystride3__; \ + __asm__ __volatile__( \ + /*mm0={a0,...,a7}*/ \ + "movq (%[pix]),%%mm0\n\t" \ + /*ystride3=_ystride*3*/ \ + "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ + /*mm3={d0,...,d7}*/ \ + "movq (%[pix],%[ystride3]),%%mm3\n\t" \ + /*mm1={b0,...,b7}*/ \ + "movq (%[pix],%[ystride]),%%mm1\n\t" \ + /*mm2={c0,...,c7}*/ \ + "movq (%[pix],%[ystride],2),%%mm2\n\t" \ + _filter \ + /*Write it back out.*/ \ + "movq %%mm1,(%[pix],%[ystride])\n\t" \ + "movq %%mm2,(%[pix],%[ystride],2)\n\t" \ + :[ystride3]"=&r"(ystride3__) \ + :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \ + [ll]"r"(_ll) \ + :"memory" \ + ); \ + } \ + while(0) + +#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \ + do{ \ + unsigned char *pix__; \ + ptrdiff_t ystride3__; \ + ptrdiff_t d__; \ + pix__=(_pix)-2; \ + __asm__ __volatile__( \ + /*x x x x d0 c0 b0 a0*/ \ + "movd (%[pix]),%%mm0\n\t" \ + /*x x x x d1 c1 b1 a1*/ \ + "movd (%[pix],%[ystride]),%%mm1\n\t" \ + /*ystride3=_ystride*3*/ \ + "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ + /*x x x x d2 c2 b2 a2*/ \ + "movd (%[pix],%[ystride],2),%%mm2\n\t" \ + /*x x x x d3 c3 b3 a3*/ \ + "lea (%[pix],%[ystride],4),%[d]\n\t" \ + "movd (%[pix],%[ystride3]),%%mm3\n\t" \ + /*x x x x d4 c4 b4 a4*/ \ + "movd (%[d]),%%mm4\n\t" \ + /*x x x x d5 c5 b5 a5*/ \ + "movd (%[d],%[ystride]),%%mm5\n\t" \ + /*x x x x d6 c6 b6 a6*/ \ + "movd (%[d],%[ystride],2),%%mm6\n\t" \ + /*x x x x d7 c7 b7 a7*/ \ + "movd (%[d],%[ystride3]),%%mm7\n\t" \ + /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \ + "punpcklbw %%mm1,%%mm0\n\t" \ + /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \ + "punpcklbw %%mm3,%%mm2\n\t" \ + /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \ + "movq %%mm0,%%mm3\n\t" \ + /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \ + "punpcklwd %%mm2,%%mm0\n\t" \ + /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \ + "punpckhwd %%mm2,%%mm3\n\t" \ + /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \ + "movq %%mm0,%%mm1\n\t" \ + /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \ + "punpcklbw %%mm5,%%mm4\n\t" \ + /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \ + "punpcklbw %%mm7,%%mm6\n\t" \ + /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \ + "movq %%mm4,%%mm5\n\t" \ + /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \ + "punpcklwd %%mm6,%%mm4\n\t" \ + /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \ + "punpckhwd %%mm6,%%mm5\n\t" \ + /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \ + "movq %%mm3,%%mm2\n\t" \ + /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \ + "punpckldq %%mm4,%%mm0\n\t" \ + /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \ + "punpckhdq %%mm4,%%mm1\n\t" \ + /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \ + "punpckldq %%mm5,%%mm2\n\t" \ + /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ + "punpckhdq %%mm5,%%mm3\n\t" \ + _filter \ + /*mm2={b0+R_0'',...,b7+R_7''}*/ \ + "movq %%mm1,%%mm0\n\t" \ + /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ + "punpcklbw %%mm2,%%mm1\n\t" \ + /*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \ + "punpckhbw %%mm2,%%mm0\n\t" \ + /*[d]=c1 b1 c0 b0*/ \ + "movd %%mm1,%[d]\n\t" \ + "movw %w[d],1(%[pix])\n\t" \ + "psrlq $32,%%mm1\n\t" \ + "shr $16,%[d]\n\t" \ + "movw %w[d],1(%[pix],%[ystride])\n\t" \ + /*[d]=c3 b3 c2 b2*/ \ + "movd %%mm1,%[d]\n\t" \ + "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ + "shr $16,%[d]\n\t" \ + "movw %w[d],1(%[pix],%[ystride3])\n\t" \ + "lea (%[pix],%[ystride],4),%[pix]\n\t" \ + /*[d]=c5 b5 c4 b4*/ \ + "movd %%mm0,%[d]\n\t" \ + "movw %w[d],1(%[pix])\n\t" \ + "psrlq $32,%%mm0\n\t" \ + "shr $16,%[d]\n\t" \ + "movw %w[d],1(%[pix],%[ystride])\n\t" \ + /*[d]=c7 b7 c6 b6*/ \ + "movd %%mm0,%[d]\n\t" \ + "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ + "shr $16,%[d]\n\t" \ + "movw %w[d],1(%[pix],%[ystride3])\n\t" \ + :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \ + :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \ + :"memory" \ + ); \ + } \ + while(0) + +# endif +#endif diff --git a/media/libtheora/lib/x86/mmxstate.c b/media/libtheora/lib/x86/mmxstate.c new file mode 100644 index 000000000..0b9586f94 --- /dev/null +++ b/media/libtheora/lib/x86/mmxstate.c @@ -0,0 +1,226 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $ + + ********************************************************************/ + +/*MMX acceleration of complete fragment reconstruction algorithm. + Originally written by Rudolf Marek.*/ +#include <string.h> +#include "x86int.h" +#include "mmxloop.h" + +#if defined(OC_X86_ASM) + +void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ + unsigned char *dst; + ptrdiff_t frag_buf_off; + int ystride; + int refi; + /*Apply the inverse transform.*/ + /*Special case only having a DC component.*/ + if(_last_zzi<2){ + /*Note that this value must be unsigned, to keep the __asm__ block from + sign-extending it when it puts it in a register.*/ + ogg_uint16_t p; + int i; + /*We round this dequant product (and not any of the others) because there's + no iDCT rounding.*/ + p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); + /*Fill _dct_coeffs with p.*/ + __asm__ __volatile__( + /*mm0=0000 0000 0000 AAAA*/ + "movd %[p],%%mm0\n\t" + /*mm0=0000 0000 AAAA AAAA*/ + "punpcklwd %%mm0,%%mm0\n\t" + /*mm0=AAAA AAAA AAAA AAAA*/ + "punpckldq %%mm0,%%mm0\n\t" + : + :[p]"r"((unsigned)p) + ); + for(i=0;i<4;i++){ + __asm__ __volatile__( + "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t" + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16) + ); + } + } + else{ + /*Dequantize the DC coefficient.*/ + _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); + oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); + } + /*Fill in the target buffer.*/ + frag_buf_off=_state->frag_buf_offs[_fragi]; + refi=_state->frags[_fragi].refi; + ystride=_state->ref_ystride[_pli]; + dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; + if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); + else{ + const unsigned char *ref; + int mvoffsets[2]; + ref=_state->ref_frame_data[refi]+frag_buf_off; + if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, + _state->frag_mvs[_fragi])>1){ + oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, + _dct_coeffs+64); + } + else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); + } +} + +/*We copy these entire function to inline the actual MMX routines so that we + use only a single indirect call.*/ + +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ + memset(_bv,_flimit,8); +} + +/*Apply the loop filter to a given set of fragment rows in the given plane. + The filter may be run on the bottom edge, affecting pixels in the next row of + fragments, so this row also needs to be available. + _bv: The bounding values array. + _refi: The index of the frame buffer to filter. + _pli: The color plane to filter. + _fragy0: The Y coordinate of the first fragment row to filter. + _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ +void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ + OC_ALIGN8(unsigned char ll[8]); + const oc_fragment_plane *fplane; + const oc_fragment *frags; + const ptrdiff_t *frag_buf_offs; + unsigned char *ref_frame_data; + ptrdiff_t fragi_top; + ptrdiff_t fragi_bot; + ptrdiff_t fragi0; + ptrdiff_t fragi0_end; + int ystride; + int nhfrags; + memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); + fplane=_state->fplanes+_pli; + nhfrags=fplane->nhfrags; + fragi_top=fplane->froffset; + fragi_bot=fragi_top+fplane->nfrags; + fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; + fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; + ystride=_state->ref_ystride[_pli]; + frags=_state->frags; + frag_buf_offs=_state->frag_buf_offs; + ref_frame_data=_state->ref_frame_data[_refi]; + /*The following loops are constructed somewhat non-intuitively on purpose. + The main idea is: if a block boundary has at least one coded fragment on + it, the filter is applied to it. + However, the order that the filters are applied in matters, and VP3 chose + the somewhat strange ordering used below.*/ + while(fragi0<fragi0_end){ + ptrdiff_t fragi; + ptrdiff_t fragi_end; + fragi=fragi0; + fragi_end=fragi+nhfrags; + while(fragi<fragi_end){ + if(frags[fragi].coded){ + unsigned char *ref; + ref=ref_frame_data+frag_buf_offs[fragi]; + if(fragi>fragi0){ + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll); + } + if(fragi0>fragi_top){ + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll); + } + if(fragi+1<fragi_end&&!frags[fragi+1].coded){ + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll); + } + if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll); + } + } + fragi++; + } + fragi0+=nhfrags; + } +} + +void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){ + memset(_bv,~(_flimit<<1),8); +} + +/*Apply the loop filter to a given set of fragment rows in the given plane. + The filter may be run on the bottom edge, affecting pixels in the next row of + fragments, so this row also needs to be available. + _bv: The bounding values array. + _refi: The index of the frame buffer to filter. + _pli: The color plane to filter. + _fragy0: The Y coordinate of the first fragment row to filter. + _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ +void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ + const oc_fragment_plane *fplane; + const oc_fragment *frags; + const ptrdiff_t *frag_buf_offs; + unsigned char *ref_frame_data; + ptrdiff_t fragi_top; + ptrdiff_t fragi_bot; + ptrdiff_t fragi0; + ptrdiff_t fragi0_end; + int ystride; + int nhfrags; + fplane=_state->fplanes+_pli; + nhfrags=fplane->nhfrags; + fragi_top=fplane->froffset; + fragi_bot=fragi_top+fplane->nfrags; + fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; + fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; + ystride=_state->ref_ystride[_pli]; + frags=_state->frags; + frag_buf_offs=_state->frag_buf_offs; + ref_frame_data=_state->ref_frame_data[_refi]; + /*The following loops are constructed somewhat non-intuitively on purpose. + The main idea is: if a block boundary has at least one coded fragment on + it, the filter is applied to it. + However, the order that the filters are applied in matters, and VP3 chose + the somewhat strange ordering used below.*/ + while(fragi0<fragi0_end){ + ptrdiff_t fragi; + ptrdiff_t fragi_end; + fragi=fragi0; + fragi_end=fragi+nhfrags; + while(fragi<fragi_end){ + if(frags[fragi].coded){ + unsigned char *ref; + ref=ref_frame_data+frag_buf_offs[fragi]; + if(fragi>fragi0){ + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); + } + if(fragi0>fragi_top){ + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); + } + if(fragi+1<fragi_end&&!frags[fragi+1].coded){ + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv); + } + if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv); + } + } + fragi++; + } + fragi0+=nhfrags; + } +} + +#endif diff --git a/media/libtheora/lib/x86/sse2idct.c b/media/libtheora/lib/x86/sse2idct.c new file mode 100644 index 000000000..5f8523fa5 --- /dev/null +++ b/media/libtheora/lib/x86/sse2idct.c @@ -0,0 +1,460 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ + + ********************************************************************/ + +/*SSE2 acceleration of Theora's iDCT.*/ +#include "x86int.h" +#include "sse2trans.h" +#include "../dct.h" + +#if defined(OC_X86_ASM) + +/*A table of constants used by the MMX routines.*/ +const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ + 8, 8, 8, 8, 8, 8, 8, 8, + OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, + OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, + OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, + OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, + OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, + OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, + OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 +}; + + +/*Performs the first three stages of the iDCT. + xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input + (accessed in that order). + The remaining rows must be in _x at their corresponding locations. + On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 + contain rows 4 through 7.*/ +#define OC_IDCT_8x8_ABC(_x) \ + "#OC_IDCT_8x8_ABC\n\t" \ + /*Stage 1:*/ \ + /*2-3 rotation by 6pi/16. \ + xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \ + "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \ + "movdqa %%xmm1,%%xmm0\n\t" \ + "pmulhw %%xmm2,%%xmm1\n\t" \ + "movdqa %%xmm4,%%xmm7\n\t" \ + "pmulhw %%xmm6,%%xmm0\n\t" \ + "pmulhw %%xmm2,%%xmm7\n\t" \ + "pmulhw %%xmm6,%%xmm4\n\t" \ + "paddw %%xmm6,%%xmm0\n\t" \ + "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \ + "paddw %%xmm1,%%xmm2\n\t" \ + "psubw %%xmm0,%%xmm7\n\t" \ + "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + "paddw %%xmm4,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + /*5-6 rotation by 3pi/16. \ + xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \ + "movdqa %%xmm4,%%xmm2\n\t" \ + "movdqa %%xmm6,%%xmm1\n\t" \ + "pmulhw %%xmm3,%%xmm4\n\t" \ + "pmulhw %%xmm5,%%xmm1\n\t" \ + "pmulhw %%xmm3,%%xmm6\n\t" \ + "pmulhw %%xmm5,%%xmm2\n\t" \ + "paddw %%xmm3,%%xmm4\n\t" \ + "paddw %%xmm5,%%xmm3\n\t" \ + "paddw %%xmm6,%%xmm3\n\t" \ + "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \ + "paddw %%xmm5,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \ + "paddw %%xmm3,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ + "psubw %%xmm4,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \ + /*4-7 rotation by 7pi/16. \ + xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \ + "movdqa %%xmm3,%%xmm0\n\t" \ + "movdqa %%xmm4,%%xmm7\n\t" \ + "pmulhw %%xmm5,%%xmm3\n\t" \ + "pmulhw %%xmm5,%%xmm7\n\t" \ + "pmulhw %%xmm6,%%xmm4\n\t" \ + "pmulhw %%xmm6,%%xmm0\n\t" \ + "paddw %%xmm6,%%xmm4\n\t" \ + "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \ + "paddw %%xmm5,%%xmm7\n\t" \ + "psubw %%xmm4,%%xmm3\n\t" \ + "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ + "paddw %%xmm7,%%xmm0\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \ + /*0-1 butterfly. \ + xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \ + "paddw %%xmm7,%%xmm6\n\t" \ + "movdqa %%xmm4,%%xmm5\n\t" \ + "pmulhw %%xmm6,%%xmm4\n\t" \ + "paddw %%xmm7,%%xmm7\n\t" \ + "psubw %%xmm6,%%xmm7\n\t" \ + "paddw %%xmm6,%%xmm4\n\t" \ + /*Stage 2:*/ \ + /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \ + 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \ + "movdqa %%xmm3,%%xmm6\n\t" \ + "paddw %%xmm1,%%xmm3\n\t" \ + "psubw %%xmm1,%%xmm6\n\t" \ + "movdqa %%xmm5,%%xmm1\n\t" \ + "pmulhw %%xmm7,%%xmm5\n\t" \ + "paddw %%xmm7,%%xmm5\n\t" \ + "movdqa %%xmm0,%%xmm7\n\t" \ + "paddw %%xmm2,%%xmm0\n\t" \ + "psubw %%xmm2,%%xmm7\n\t" \ + "movdqa %%xmm1,%%xmm2\n\t" \ + "pmulhw %%xmm6,%%xmm1\n\t" \ + "pmulhw %%xmm7,%%xmm2\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ + "paddw %%xmm7,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ + /*Stage 3: \ + 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ + 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ + 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ + "paddw %%xmm2,%%xmm1\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "paddw %%xmm4,%%xmm7\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm4,%%xmm4\n\t" \ + "paddw %%xmm5,%%xmm5\n\t" \ + "psubw %%xmm1,%%xmm2\n\t" \ + "psubw %%xmm7,%%xmm4\n\t" \ + "psubw %%xmm6,%%xmm5\n\t" \ + +/*Performs the last stage of the iDCT. + On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 + contain rows 4 through 7. + On output, xmm0 through xmm7 contain the corresponding rows.*/ +#define OC_IDCT_8x8_D \ + "#OC_IDCT_8x8_D\n\t" \ + /*Stage 4: \ + 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ + 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ + 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ + 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ + "psubw %%xmm0,%%xmm7\n\t" \ + "psubw %%xmm1,%%xmm6\n\t" \ + "psubw %%xmm2,%%xmm5\n\t" \ + "psubw %%xmm3,%%xmm4\n\t" \ + "paddw %%xmm0,%%xmm0\n\t" \ + "paddw %%xmm1,%%xmm1\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm3,%%xmm3\n\t" \ + "paddw %%xmm7,%%xmm0\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "paddw %%xmm5,%%xmm2\n\t" \ + "paddw %%xmm4,%%xmm3\n\t" \ + +/*Performs the last stage of the iDCT. + On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 + contain rows 4 through 7. + On output, xmm0 through xmm7 contain the corresponding rows.*/ +#define OC_IDCT_8x8_D_STORE \ + "#OC_IDCT_8x8_D_STORE\n\t" \ + /*Stage 4: \ + 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ + 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ + 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ + 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ + "psubw %%xmm3,%%xmm4\n\t" \ + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \ + "psubw %%xmm0,%%xmm7\n\t" \ + "psubw %%xmm1,%%xmm6\n\t" \ + "psubw %%xmm2,%%xmm5\n\t" \ + "paddw %%xmm4,%%xmm7\n\t" \ + "paddw %%xmm4,%%xmm6\n\t" \ + "paddw %%xmm4,%%xmm5\n\t" \ + "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \ + "paddw %%xmm0,%%xmm0\n\t" \ + "paddw %%xmm1,%%xmm1\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm3,%%xmm3\n\t" \ + "paddw %%xmm7,%%xmm0\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "psraw $4,%%xmm0\n\t" \ + "paddw %%xmm5,%%xmm2\n\t" \ + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \ + "psraw $4,%%xmm1\n\t" \ + "paddw %%xmm4,%%xmm3\n\t" \ + "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \ + "psraw $4,%%xmm2\n\t" \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \ + "psraw $4,%%xmm3\n\t" \ + "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \ + "psraw $4,%%xmm4\n\t" \ + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ + "psraw $4,%%xmm5\n\t" \ + "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \ + "psraw $4,%%xmm6\n\t" \ + "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \ + "psraw $4,%%xmm7\n\t" \ + "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \ + +static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + OC_ALIGN16(ogg_int16_t buf[16]); + /*This routine accepts an 8x8 matrix pre-transposed.*/ + __asm__ __volatile__( + /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ + "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" + "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" + "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" + "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" + OC_IDCT_8x8_ABC(x) + OC_IDCT_8x8_D + OC_TRANSPOSE_8x8 + /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ + "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" + "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" + OC_IDCT_8x8_ABC(y) + OC_IDCT_8x8_D_STORE + :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), + [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) + :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), + [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) + ); + if(_x!=_y){ + int i; + __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::); + /*Clear input data for next block (decoder only).*/ + for(i=0;i<2;i++){ + __asm__ __volatile__( + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" + :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) + ); + } + } +} + +/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only + need to work with four columns at a time. + Doing this in MMX is faster on processors with a 64-bit data path.*/ +#define OC_IDCT_8x8_10_MMX \ + "#OC_IDCT_8x8_10_MMX\n\t" \ + /*Stage 1:*/ \ + /*2-3 rotation by 6pi/16. \ + mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \ + "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \ + "pmulhw %%mm2,%%mm6\n\t" \ + "pmulhw %%mm2,%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \ + "paddw %%mm6,%%mm2\n\t" \ + "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \ + "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*5-6 rotation by 3pi/16. \ + mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \ + "pmulhw %%mm3,%%mm5\n\t" \ + "pmulhw %%mm3,%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \ + "paddw %%mm3,%%mm5\n\t" \ + "paddw %%mm3,%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ + /*4-7 rotation by 7pi/16. \ + mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \ + "pmulhw %%mm1,%%mm3\n\t" \ + "pmulhw %%mm1,%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ + "movq %%mm3,%%mm6\n\t" \ + "paddw %%mm1,%%mm7\n\t" \ + /*0-1 butterfly. \ + mm4=C4, mm0=X0, X4=0.*/ \ + /*Stage 2:*/ \ + /*4-5 butterfly: mm3=t[4], mm5=t[5] \ + 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \ + "psubw %%mm5,%%mm3\n\t" \ + "paddw %%mm5,%%mm6\n\t" \ + "movq %%mm4,%%mm1\n\t" \ + "pmulhw %%mm0,%%mm4\n\t" \ + "paddw %%mm0,%%mm4\n\t" \ + "movq %%mm7,%%mm0\n\t" \ + "movq %%mm4,%%mm5\n\t" \ + "paddw %%mm2,%%mm0\n\t" \ + "psubw %%mm2,%%mm7\n\t" \ + "movq %%mm1,%%mm2\n\t" \ + "pmulhw %%mm6,%%mm1\n\t" \ + "pmulhw %%mm7,%%mm2\n\t" \ + "paddw %%mm6,%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \ + "paddw %%mm7,%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \ + /*Stage 3: \ + 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \ + 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \ + 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \ + "paddw %%mm2,%%mm1\n\t" \ + "paddw %%mm5,%%mm6\n\t" \ + "paddw %%mm4,%%mm7\n\t" \ + "paddw %%mm2,%%mm2\n\t" \ + "paddw %%mm4,%%mm4\n\t" \ + "paddw %%mm5,%%mm5\n\t" \ + "psubw %%mm1,%%mm2\n\t" \ + "psubw %%mm7,%%mm4\n\t" \ + "psubw %%mm6,%%mm5\n\t" \ + /*Stage 4: \ + 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \ + 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \ + 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \ + 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \ + "psubw %%mm0,%%mm7\n\t" \ + "psubw %%mm1,%%mm6\n\t" \ + "psubw %%mm2,%%mm5\n\t" \ + "psubw %%mm3,%%mm4\n\t" \ + "paddw %%mm0,%%mm0\n\t" \ + "paddw %%mm1,%%mm1\n\t" \ + "paddw %%mm2,%%mm2\n\t" \ + "paddw %%mm3,%%mm3\n\t" \ + "paddw %%mm7,%%mm0\n\t" \ + "paddw %%mm6,%%mm1\n\t" \ + "paddw %%mm5,%%mm2\n\t" \ + "paddw %%mm4,%%mm3\n\t" \ + +#define OC_IDCT_8x8_10_ABC \ + "#OC_IDCT_8x8_10_ABC\n\t" \ + /*Stage 1:*/ \ + /*2-3 rotation by 6pi/16. \ + xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \ + "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \ + "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \ + "pmulhw %%xmm2,%%xmm6\n\t" \ + "pmulhw %%xmm2,%%xmm7\n\t" \ + "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \ + "paddw %%xmm6,%%xmm2\n\t" \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \ + "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*5-6 rotation by 3pi/16. \ + xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \ + "pmulhw %%xmm3,%%xmm5\n\t" \ + "pmulhw %%xmm3,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \ + "paddw %%xmm3,%%xmm5\n\t" \ + "paddw %%xmm3,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ + /*4-7 rotation by 7pi/16. \ + xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \ + "pmulhw %%xmm1,%%xmm3\n\t" \ + "pmulhw %%xmm1,%%xmm7\n\t" \ + "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ + "movdqa %%xmm3,%%xmm6\n\t" \ + "paddw %%xmm1,%%xmm7\n\t" \ + /*0-1 butterfly. \ + xmm4=C4, xmm0=X0, X4=0.*/ \ + /*Stage 2:*/ \ + /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \ + 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \ + "psubw %%xmm5,%%xmm3\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "movdqa %%xmm4,%%xmm1\n\t" \ + "pmulhw %%xmm0,%%xmm4\n\t" \ + "paddw %%xmm0,%%xmm4\n\t" \ + "movdqa %%xmm7,%%xmm0\n\t" \ + "movdqa %%xmm4,%%xmm5\n\t" \ + "paddw %%xmm2,%%xmm0\n\t" \ + "psubw %%xmm2,%%xmm7\n\t" \ + "movdqa %%xmm1,%%xmm2\n\t" \ + "pmulhw %%xmm6,%%xmm1\n\t" \ + "pmulhw %%xmm7,%%xmm2\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ + "paddw %%xmm7,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ + /*Stage 3: \ + 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ + 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ + 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ + "paddw %%xmm2,%%xmm1\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "paddw %%xmm4,%%xmm7\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm4,%%xmm4\n\t" \ + "paddw %%xmm5,%%xmm5\n\t" \ + "psubw %%xmm1,%%xmm2\n\t" \ + "psubw %%xmm7,%%xmm4\n\t" \ + "psubw %%xmm6,%%xmm5\n\t" \ + +static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + OC_ALIGN16(ogg_int16_t buf[16]); + /*This routine accepts an 8x8 matrix pre-transposed.*/ + __asm__ __volatile__( + "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" + "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" + "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" + "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" + OC_IDCT_8x8_10_MMX + OC_TRANSPOSE_8x4_MMX2SSE + OC_IDCT_8x8_10_ABC + OC_IDCT_8x8_D_STORE + :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)), + [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) + ); + if(_x!=_y){ + /*Clear input data for next block (decoder only).*/ + __asm__ __volatile__( + "pxor %%mm0,%%mm0\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" + :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) + ); + } +} + +/*Performs an inverse 8x8 Type-II DCT transform. + The input is assumed to be scaled by a factor of 4 relative to orthonormal + version of the transform.*/ +void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ + /*_last_zzi is subtly different from an actual count of the number of + coefficients we decoded for this block. + It contains the value of zzi BEFORE the final token in the block was + decoded. + In most cases this is an EOB token (the continuation of an EOB run from a + previous block counts), and so this is the same as the coefficient count. + However, in the case that the last token was NOT an EOB token, but filled + the block up with exactly 64 coefficients, _last_zzi will be less than 64. + Provided the last token was not a pure zero run, the minimum value it can + be is 46, and so that doesn't affect any of the cases in this routine. + However, if the last token WAS a pure zero run of length 63, then _last_zzi + will be 1 while the number of coefficients decoded is 64. + Thus, we will trigger the following special case, where the real + coefficient count would not. + Note also that a zero run of length 64 will give _last_zzi a value of 0, + but we still process the DC coefficient, which might have a non-zero value + due to DC prediction. + Although convoluted, this is arguably the correct behavior: it allows us to + use a smaller transform when the block ends with a long zero run instead + of a normal EOB token. + It could be smarter... multiple separate zero runs at the end of a block + will fool it, but an encoder that generates these really deserves what it + gets. + Needless to say we inherited this approach from VP3.*/ + /*Then perform the iDCT.*/ + if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); + else oc_idct8x8_slow_sse2(_y,_x); +} + +#endif diff --git a/media/libtheora/lib/x86/sse2trans.h b/media/libtheora/lib/x86/sse2trans.h new file mode 100644 index 000000000..e76da5140 --- /dev/null +++ b/media/libtheora/lib/x86/sse2trans.h @@ -0,0 +1,242 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ + + ********************************************************************/ + +#if !defined(_x86_sse2trans_H) +# define _x86_sse2trans_H (1) +# include "x86int.h" + +# if defined(OC_X86_64_ASM) +/*On x86-64 we can transpose in-place without spilling registers. + By clever choices of the order to apply the butterflies and the order of + their outputs, we can take the rows in order and output the columns in order + without any extra operations and using just one temporary register.*/ +# define OC_TRANSPOSE_8x8 \ + "#OC_TRANSPOSE_8x8\n\t" \ + "movdqa %%xmm4,%%xmm8\n\t" \ + /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ + "punpcklwd %%xmm5,%%xmm4\n\t" \ + /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ + "punpckhwd %%xmm5,%%xmm8\n\t" \ + /*xmm5 is free.*/ \ + "movdqa %%xmm0,%%xmm5\n\t" \ + /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ + "punpcklwd %%xmm1,%%xmm0\n\t" \ + /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ + "punpckhwd %%xmm1,%%xmm5\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm6,%%xmm1\n\t" \ + /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ + "punpcklwd %%xmm7,%%xmm6\n\t" \ + /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ + "punpckhwd %%xmm7,%%xmm1\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm2,%%xmm7\n\t" \ + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "punpckhwd %%xmm3,%%xmm2\n\t" \ + /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ + "punpcklwd %%xmm3,%%xmm7\n\t" \ + /*xmm3 is free.*/ \ + "movdqa %%xmm0,%%xmm3\n\t" \ + /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ + "punpckldq %%xmm7,%%xmm0\n\t" \ + /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ + "punpckhdq %%xmm7,%%xmm3\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm5,%%xmm7\n\t" \ + /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ + "punpckldq %%xmm2,%%xmm5\n\t" \ + /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ + "punpckhdq %%xmm2,%%xmm7\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm4,%%xmm2\n\t" \ + /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ + "punpckhdq %%xmm6,%%xmm4\n\t" \ + /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ + "punpckldq %%xmm6,%%xmm2\n\t" \ + /*xmm6 is free.*/ \ + "movdqa %%xmm8,%%xmm6\n\t" \ + /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ + "punpckldq %%xmm1,%%xmm6\n\t" \ + /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "punpckhdq %%xmm1,%%xmm8\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm0,%%xmm1\n\t" \ + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "punpcklqdq %%xmm2,%%xmm0\n\t" \ + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ + "punpckhqdq %%xmm2,%%xmm1\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm3,%%xmm2\n\t" \ + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ + "punpckhqdq %%xmm4,%%xmm3\n\t" \ + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ + "punpcklqdq %%xmm4,%%xmm2\n\t" \ + /*xmm4 is free.*/ \ + "movdqa %%xmm5,%%xmm4\n\t" \ + /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ + "punpckhqdq %%xmm6,%%xmm5\n\t" \ + /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ + "punpcklqdq %%xmm6,%%xmm4\n\t" \ + /*xmm6 is free.*/ \ + "movdqa %%xmm7,%%xmm6\n\t" \ + /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ + "punpckhqdq %%xmm8,%%xmm7\n\t" \ + /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ + "punpcklqdq %%xmm8,%%xmm6\n\t" \ + /*xmm8 is free.*/ \ + +# else +/*Otherwise, we need to spill some values to %[buf] temporarily. + Again, the butterflies are carefully arranged to get the columns to come out + in order, minimizing register spills and maximizing the delay between a load + and when the value loaded is actually used.*/ +# define OC_TRANSPOSE_8x8 \ + "#OC_TRANSPOSE_8x8\n\t" \ + /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \ + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*xmm0 is free.*/ \ + "movdqa %%xmm2,%%xmm0\n\t" \ + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "punpckhwd %%xmm3,%%xmm2\n\t" \ + /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ + "punpcklwd %%xmm3,%%xmm0\n\t" \ + /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \ + /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm6,%%xmm2\n\t" \ + /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ + "punpcklwd %%xmm7,%%xmm6\n\t" \ + /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ + "punpckhwd %%xmm7,%%xmm2\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm4,%%xmm7\n\t" \ + /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ + "punpcklwd %%xmm5,%%xmm4\n\t" \ + /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ + "punpckhwd %%xmm5,%%xmm7\n\t" \ + /*xmm5 is free.*/ \ + "movdqa %%xmm3,%%xmm5\n\t" \ + /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ + "punpcklwd %%xmm1,%%xmm3\n\t" \ + /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ + "punpckhwd %%xmm1,%%xmm5\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm7,%%xmm1\n\t" \ + /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ + "punpckldq %%xmm2,%%xmm7\n\t" \ + /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "punpckhdq %%xmm2,%%xmm1\n\t" \ + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \ + /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm3,%%xmm1\n\t" \ + /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ + "punpckhdq %%xmm0,%%xmm3\n\t" \ + /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ + "punpckldq %%xmm0,%%xmm1\n\t" \ + /*xmm0 is free.*/ \ + "movdqa %%xmm4,%%xmm0\n\t" \ + /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ + "punpckhdq %%xmm6,%%xmm4\n\t" \ + /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ + "punpckldq %%xmm6,%%xmm0\n\t" \ + /*xmm6 is free.*/ \ + "movdqa %%xmm5,%%xmm6\n\t" \ + /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ + "punpckldq %%xmm2,%%xmm5\n\t" \ + /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ + "punpckhdq %%xmm2,%%xmm6\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm1,%%xmm2\n\t" \ + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ + "punpckhqdq %%xmm0,%%xmm1\n\t" \ + /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "punpcklqdq %%xmm0,%%xmm2\n\t" \ + /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ + /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm3,%%xmm2\n\t" \ + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ + "punpckhqdq %%xmm4,%%xmm3\n\t" \ + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ + "punpcklqdq %%xmm4,%%xmm2\n\t" \ + /*xmm4 is free.*/ \ + "movdqa %%xmm5,%%xmm4\n\t" \ + /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ + "punpckhqdq %%xmm7,%%xmm5\n\t" \ + /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ + "punpcklqdq %%xmm7,%%xmm4\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm6,%%xmm7\n\t" \ + /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ + "punpcklqdq %%xmm0,%%xmm6\n\t" \ + /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ + "punpckhqdq %%xmm0,%%xmm7\n\t" \ + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \ + +# endif + +/*Transpose 4 values in each of 8 MMX registers into 8 values in the first + four SSE registers. + No need to be clever here; we have plenty of room.*/ +# define OC_TRANSPOSE_8x4_MMX2SSE \ + "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \ + "movq2dq %%mm0,%%xmm0\n\t" \ + "movq2dq %%mm1,%%xmm1\n\t" \ + /*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \ + "punpcklwd %%xmm1,%%xmm0\n\t" \ + "movq2dq %%mm2,%%xmm3\n\t" \ + "movq2dq %%mm3,%%xmm2\n\t" \ + /*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \ + "punpcklwd %%xmm2,%%xmm3\n\t" \ + "movq2dq %%mm4,%%xmm4\n\t" \ + "movq2dq %%mm5,%%xmm5\n\t" \ + /*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \ + "punpcklwd %%xmm5,%%xmm4\n\t" \ + "movq2dq %%mm6,%%xmm7\n\t" \ + "movq2dq %%mm7,%%xmm6\n\t" \ + /*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \ + "punpcklwd %%xmm6,%%xmm7\n\t" \ + "movdqa %%xmm0,%%xmm2\n\t" \ + /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ + "punpckldq %%xmm3,%%xmm0\n\t" \ + /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ + "punpckhdq %%xmm3,%%xmm2\n\t" \ + "movdqa %%xmm4,%%xmm5\n\t" \ + /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ + "punpckldq %%xmm7,%%xmm4\n\t" \ + /*xmm3 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ + "punpckhdq %%xmm7,%%xmm5\n\t" \ + "movdqa %%xmm0,%%xmm1\n\t" \ + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "punpcklqdq %%xmm4,%%xmm0\n\t" \ + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ + "punpckhqdq %%xmm4,%%xmm1\n\t" \ + "movdqa %%xmm2,%%xmm3\n\t" \ + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ + "punpcklqdq %%xmm5,%%xmm2\n\t" \ + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ + "punpckhqdq %%xmm5,%%xmm3\n\t" \ + +#endif diff --git a/media/libtheora/lib/x86/x86cpu.c b/media/libtheora/lib/x86/x86cpu.c new file mode 100644 index 000000000..c3a20b319 --- /dev/null +++ b/media/libtheora/lib/x86/x86cpu.c @@ -0,0 +1,182 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + CPU capability detection for x86 processors. + Originally written by Rudolf Marek. + + function: + last mod: $Id: x86cpu.c 17410 2010-09-21 21:53:48Z tterribe $ + + ********************************************************************/ + +#include "x86cpu.h" + +#if !defined(OC_X86_ASM) +ogg_uint32_t oc_cpu_flags_get(void){ + return 0; +} +#else +# if defined(__amd64__)||defined(__x86_64__) +/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when + compiling with -fPIC.*/ +# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ + __asm__ __volatile__( \ + "cpuid\n\t" \ + :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ + :"a"(_op) \ + :"cc" \ + ) +# else +/*On x86-32, not so much.*/ +# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ + __asm__ __volatile__( \ + "xchgl %%ebx,%[ebx]\n\t" \ + "cpuid\n\t" \ + "xchgl %%ebx,%[ebx]\n\t" \ + :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ + :"a"(_op) \ + :"cc" \ + ) +# endif + +static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ + ogg_uint32_t flags; + /*If there isn't even MMX, give up.*/ + if(!(_edx&0x00800000))return 0; + flags=OC_CPU_X86_MMX; + if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE; + if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2; + if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI; + if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3; + if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1; + if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2; + return flags; +} + +static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ + ogg_uint32_t flags; + /*If there isn't even MMX, give up.*/ + if(!(_edx&0x00800000))return 0; + flags=OC_CPU_X86_MMX; + if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT; + if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW; + if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT; + if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A; + if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5; + return flags; +} + +ogg_uint32_t oc_cpu_flags_get(void){ + ogg_uint32_t flags; + ogg_uint32_t eax; + ogg_uint32_t ebx; + ogg_uint32_t ecx; + ogg_uint32_t edx; +# if !defined(__amd64__)&&!defined(__x86_64__) + /*Not all x86-32 chips support cpuid, so we have to check.*/ + __asm__ __volatile__( + "pushfl\n\t" + "pushfl\n\t" + "popl %[a]\n\t" + "movl %[a],%[b]\n\t" + "xorl $0x200000,%[a]\n\t" + "pushl %[a]\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %[a]\n\t" + "popfl\n\t" + :[a]"=r"(eax),[b]"=r"(ebx) + : + :"cc" + ); + /*No cpuid.*/ + if(eax==ebx)return 0; +# endif + cpuid(0,eax,ebx,ecx,edx); + /* l e t n I e n i u n e G*/ + if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547|| + /* 6 8 x M T e n i u n e G*/ + ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){ + int family; + int model; + /*Intel, Transmeta (tested with Crusoe TM5800):*/ + cpuid(1,eax,ebx,ecx,edx); + flags=oc_parse_intel_flags(edx,ecx); + family=(eax>>8)&0xF; + model=(eax>>4)&0xF; + /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX + unit, so don't use it.*/ + if(family==6&&(model==9||model==13||model==14)){ + flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI); + } + } + /* D M A c i t n e h t u A*/ + else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541|| + /* C S N y b e d o e G*/ + ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){ + /*AMD, Geode:*/ + cpuid(0x80000000,eax,ebx,ecx,edx); + if(eax<0x80000001)flags=0; + else{ + cpuid(0x80000001,eax,ebx,ecx,edx); + flags=oc_parse_amd_flags(edx,ecx); + } + /*Also check for SSE.*/ + cpuid(1,eax,ebx,ecx,edx); + flags|=oc_parse_intel_flags(edx,ecx); + } + /*Technically some VIA chips can be configured in the BIOS to return any + string here the user wants. + There is a special detection method that can be used to identify such + processors, but in my opinion, if the user really wants to change it, they + deserve what they get.*/ + /* s l u a H r u a t n e C*/ + else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){ + /*VIA:*/ + /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming) + chips (thanks to the engineers from Centaur Technology who provided it). + These chips support Intel-like cpuid info. + The C3-2 (Nehemiah) cores appear to, as well.*/ + cpuid(1,eax,ebx,ecx,edx); + flags=oc_parse_intel_flags(edx,ecx); + if(eax>=0x80000001){ + /*The (non-Nehemiah) C3 processors support AMD-like cpuid info. + We need to check this even if the Intel test succeeds to pick up 3DNow! + support on these processors. + Unlike actual AMD processors, we cannot _rely_ on this info, since + some cores (e.g., the 693 stepping of the Nehemiah) claim to support + this function, yet return edx=0, despite the Intel test indicating + MMX support. + Therefore the features detected here are strictly added to those + detected by the Intel test.*/ + /*TODO: How about earlier chips?*/ + cpuid(0x80000001,eax,ebx,ecx,edx); + /*Note: As of the C7, this function returns Intel-style extended feature + flags, not AMD-style. + Currently, this only defines bits 11, 20, and 29 (0x20100800), which + do not conflict with any of the AMD flags we inspect. + For the remaining bits, Intel tells us, "Do not count on their value", + but VIA assures us that they will all be zero (at least on the C7 and + Isaiah chips). + In the (unlikely) event a future processor uses bits 18, 19, 30, or 31 + (0xC0C00000) for something else, we will have to add code to detect + the model to decide when it is appropriate to inspect them.*/ + flags|=oc_parse_amd_flags(edx,ecx); + } + } + else{ + /*Implement me.*/ + flags=0; + } + return flags; +} +#endif diff --git a/media/libtheora/lib/x86/x86cpu.h b/media/libtheora/lib/x86/x86cpu.h new file mode 100644 index 000000000..153a48d89 --- /dev/null +++ b/media/libtheora/lib/x86/x86cpu.h @@ -0,0 +1,36 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + function: + last mod: $Id: x86cpu.h 17410 2010-09-21 21:53:48Z tterribe $ + + ********************************************************************/ + +#if !defined(_x86_x86cpu_H) +# define _x86_x86cpu_H (1) +#include "../internal.h" + +#define OC_CPU_X86_MMX (1<<0) +#define OC_CPU_X86_3DNOW (1<<1) +#define OC_CPU_X86_3DNOWEXT (1<<2) +#define OC_CPU_X86_MMXEXT (1<<3) +#define OC_CPU_X86_SSE (1<<4) +#define OC_CPU_X86_SSE2 (1<<5) +#define OC_CPU_X86_PNI (1<<6) +#define OC_CPU_X86_SSSE3 (1<<7) +#define OC_CPU_X86_SSE4_1 (1<<8) +#define OC_CPU_X86_SSE4_2 (1<<9) +#define OC_CPU_X86_SSE4A (1<<10) +#define OC_CPU_X86_SSE5 (1<<11) + +ogg_uint32_t oc_cpu_flags_get(void); + +#endif diff --git a/media/libtheora/lib/x86/x86int.h b/media/libtheora/lib/x86/x86int.h new file mode 100644 index 000000000..35bfb0a02 --- /dev/null +++ b/media/libtheora/lib/x86/x86int.h @@ -0,0 +1,122 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86int.h 17578 2010-10-29 04:21:26Z tterribe $ + + ********************************************************************/ + +#if !defined(_x86_x86int_H) +# define _x86_x86int_H (1) +# include "../internal.h" + +# if defined(OC_X86_ASM) +# define oc_state_accel_init oc_state_accel_init_x86 +# if defined(OC_X86_64_ASM) +/*x86-64 guarantees SIMD support up through at least SSE2. + If the best routine we have available only needs SSE2 (which at the moment + covers all of them), then we can avoid runtime detection and the indirect + call.*/ +# define oc_frag_copy(_state,_dst,_src,_ystride) \ + oc_frag_copy_mmx(_dst,_src,_ystride) +# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs) \ + oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs) +# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \ + oc_frag_recon_intra_mmx(_dst,_ystride,_residue) +# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \ + oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue) +# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \ + oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue) +# define oc_idct8x8(_state,_y,_x,_last_zzi) \ + oc_idct8x8_sse2(_y,_x,_last_zzi) +# define oc_state_frag_recon oc_state_frag_recon_mmx +# define oc_loop_filter_init(_state,_bv,_flimit) \ + oc_loop_filter_init_mmxext(_bv,_flimit) +# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext +# define oc_restore_fpu(_state) \ + oc_restore_fpu_mmx() +# else +# define OC_STATE_USE_VTABLE (1) +# endif +# endif + +# include "../state.h" +# include "x86cpu.h" + +/*Converts the expression in the argument to a string.*/ +#define OC_M2STR(_s) #_s + +/*Memory operands do not always include an offset. + To avoid warnings, we force an offset with %H (which adds 8).*/ +# if __GNUC_PREREQ(4,0) +# define OC_MEM_OFFS(_offs,_name) \ + OC_M2STR(_offs-8+%H[_name]) +# endif +/*If your gcc version does't support %H, then you get to suffer the warnings. + Note that Apple's gas breaks on things like _offs+(%esp): it throws away the + whole offset, instead of substituting in 0 for the missing operand to +.*/ +# if !defined(OC_MEM_OFFS) +# define OC_MEM_OFFS(_offs,_name) \ + OC_M2STR(_offs+%[_name]) +# endif + +/*Declare an array operand with an exact size. + This tells gcc we're going to clobber this memory region, without having to + clobber all of "memory" and lets us access local buffers directly using the + stack pointer, without allocating a separate register to point to them.*/ +#define OC_ARRAY_OPERAND(_type,_ptr,_size) \ + (*({ \ + struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \ + array_addr__; \ + })) + +/*Declare an array operand with an exact size. + This tells gcc we're going to clobber this memory region, without having to + clobber all of "memory" and lets us access local buffers directly using the + stack pointer, without allocating a separate register to point to them.*/ +#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \ + (*({ \ + const struct{_type array_value__[(_size)];} *array_addr__= \ + (const void *)(_ptr); \ + array_addr__; \ + })) + +extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64]; + +void oc_state_accel_init_x86(oc_theora_state *_state); + +void oc_frag_copy_mmx(unsigned char *_dst, + const unsigned char *_src,int _ystride); +void oc_frag_copy_list_mmx(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); +void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, + const ogg_int16_t *_residue); +void oc_frag_recon_inter_mmx(unsigned char *_dst, + const unsigned char *_src,int _ystride,const ogg_int16_t *_residue); +void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); +void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); +void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit); +void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit); +void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); +void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); +void oc_restore_fpu_mmx(void); + +#endif diff --git a/media/libtheora/lib/x86/x86state.c b/media/libtheora/lib/x86/x86state.c new file mode 100644 index 000000000..a3d37267f --- /dev/null +++ b/media/libtheora/lib/x86/x86state.c @@ -0,0 +1,95 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86state.c 17421 2010-09-22 16:46:18Z giles $ + + ********************************************************************/ + +#include "x86int.h" + +#if defined(OC_X86_ASM) + +/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into + each quadrant of the destination.*/ +static const unsigned char OC_FZIG_ZAG_MMX[128]={ + 0, 8, 1, 2, 9,16,24,17, + 10, 3,32,11,18,25, 4,12, + 5,26,19,40,33,34,41,48, + 27, 6,13,20,28,21,14, 7, + 56,49,42,35,43,50,57,36, + 15,22,29,30,23,44,37,58, + 51,59,38,45,52,31,60,53, + 46,39,47,54,61,62,55,63, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64 +}; + +/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into + the destination.*/ +static const unsigned char OC_FZIG_ZAG_SSE2[128]={ + 0, 8, 1, 2, 9,16,24,17, + 10, 3, 4,11,18,25,32,40, + 33,26,19,12, 5, 6,13,20, + 27,34,41,48,56,49,42,35, + 28,21,14, 7,15,22,29,36, + 43,50,57,58,51,44,37,30, + 23,31,38,45,52,59,60,53, + 46,39,47,54,61,62,55,63, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64 +}; + +void oc_state_accel_init_x86(oc_theora_state *_state){ + oc_state_accel_init_c(_state); + _state->cpu_flags=oc_cpu_flags_get(); +# if defined(OC_STATE_USE_VTABLE) + if(_state->cpu_flags&OC_CPU_X86_MMX){ + _state->opt_vtable.frag_copy=oc_frag_copy_mmx; + _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx; + _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; + _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; + _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx; + _state->opt_vtable.idct8x8=oc_idct8x8_mmx; + _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx; + _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx; + _state->opt_vtable.state_loop_filter_frag_rows= + oc_state_loop_filter_frag_rows_mmx; + _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx; + _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX; + } + if(_state->cpu_flags&OC_CPU_X86_MMXEXT){ + _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext; + _state->opt_vtable.state_loop_filter_frag_rows= + oc_state_loop_filter_frag_rows_mmxext; + } + if(_state->cpu_flags&OC_CPU_X86_SSE2){ + _state->opt_vtable.idct8x8=oc_idct8x8_sse2; +# endif + _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2; +# if defined(OC_STATE_USE_VTABLE) + } +# endif +} +#endif |