commit     5f8de423f190bbb79a62f804151bc24824fa32d8
Author:    Matt A. Tobin <mattatobin@localhost.localdomain>  2018-02-02 04:16:08 -0500
Committer: Matt A. Tobin <mattatobin@localhost.localdomain>  2018-02-02 04:16:08 -0500
tree       10027f336435511475e392454359edea8e25895d /gfx/ycbcr
parent     49ee0794b5d912db1f95dce6eb52d781dc210db5

    Add m-esr52 at 52.6.0

Diffstat (limited to 'gfx/ycbcr')
-rw-r--r--   gfx/ycbcr/LICENSE                    27
-rw-r--r--   gfx/ycbcr/QuellGccWarnings.patch     40
-rw-r--r--   gfx/ycbcr/README                     29
-rw-r--r--   gfx/ycbcr/TypeFromSize.patch         58
-rw-r--r--   gfx/ycbcr/YCbCrUtils.cpp            157
-rw-r--r--   gfx/ycbcr/YCbCrUtils.h               30
-rw-r--r--   gfx/ycbcr/chromium_types.h           50
-rw-r--r--   gfx/ycbcr/convert.patch            3143
-rw-r--r--   gfx/ycbcr/moz.build                  65
-rw-r--r--   gfx/ycbcr/scale_yuv_argb.cpp       1126
-rw-r--r--   gfx/ycbcr/scale_yuv_argb.h           39
-rw-r--r--   gfx/ycbcr/update.sh                  12
-rw-r--r--   gfx/ycbcr/win64.patch               210
-rw-r--r--   gfx/ycbcr/ycbcr_to_rgb565.cpp       672
-rw-r--r--   gfx/ycbcr/ycbcr_to_rgb565.h          72
-rw-r--r--   gfx/ycbcr/yuv_convert.cpp           510
-rw-r--r--   gfx/ycbcr/yuv_convert.h             110
-rw-r--r--   gfx/ycbcr/yuv_convert_arm.cpp       232
-rw-r--r--   gfx/ycbcr/yuv_convert_mmx.cpp        45
-rw-r--r--   gfx/ycbcr/yuv_convert_sse2.cpp       47
-rw-r--r--   gfx/ycbcr/yuv_row.h                 142
-rw-r--r--   gfx/ycbcr/yuv_row_arm.s             304
-rw-r--r--   gfx/ycbcr/yuv_row_c.cpp             133
-rw-r--r--   gfx/ycbcr/yuv_row_other.cpp          34
-rw-r--r--   gfx/ycbcr/yuv_row_posix.cpp         917
-rw-r--r--   gfx/ycbcr/yuv_row_table.cpp         233
-rw-r--r--   gfx/ycbcr/yuv_row_win.cpp           498
-rw-r--r--   gfx/ycbcr/yuv_row_win64.cpp         205

28 files changed, 9140 insertions(+), 0 deletions(-)
diff --git a/gfx/ycbcr/LICENSE b/gfx/ycbcr/LICENSE
new file mode 100644
index 000000000..8dc35041d
--- /dev/null
+++ b/gfx/ycbcr/LICENSE
@@ -0,0 +1,27 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/gfx/ycbcr/QuellGccWarnings.patch b/gfx/ycbcr/QuellGccWarnings.patch
new file mode 100644
index 000000000..d580ac981
--- /dev/null
+++ b/gfx/ycbcr/QuellGccWarnings.patch
@@ -0,0 +1,40 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -337,16 +337,17 @@ void ScaleYCbCrToRGB32(const uint* yplan
+ source_dx_uv >> kFractionBits);
+ }
+ }
+ else {
+ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ }
+ #else
++ (void)source_dx_uv;
+ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ #endif
+ }
+ }
+ // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
+ if (has_mmx)
+ EMMS();
+diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
+--- a/gfx/ycbcr/yuv_row.h
++++ b/gfx/ycbcr/yuv_row.h
+@@ -129,14 +129,14 @@ extern SIMD_ALIGNED(int16 kCoefficientsR
+ #if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
+ #if defined(_MSC_VER)
+ #define EMMS() __asm emms
+ #pragma warning(disable: 4799)
+ #else
+ #define EMMS() asm("emms")
+ #endif
+ #else
+-#define EMMS()
++#define EMMS() ((void)0)
+ #endif
+
+ } // extern "C"
+
+ #endif // MEDIA_BASE_YUV_ROW_H_
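
The ((void)0) redefinition above is the substance of this patch: EMMS() is
invoked as the whole body of an "if (has_mmx)" statement in yuv_convert.cpp,
and an empty macro expansion would leave a bare semicolon there, which GCC
flags (e.g. via -Wempty-body). A minimal standalone sketch of the problem;
the macro names here are illustrative, not from the tree:

    // With an empty expansion, "if (x) EMMS_EMPTY();" leaves only ";" as
    // the if-body and GCC's empty-body warning fires; ((void)0) is a real
    // no-op statement, so the safe variant compiles silently.
    #define EMMS_EMPTY()
    #define EMMS_SAFE() ((void)0)

    void FlushMmxState(bool has_mmx) {
      if (has_mmx)
        EMMS_SAFE();  // expands to ((void)0); -- no warning
    }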
diff --git a/gfx/ycbcr/README b/gfx/ycbcr/README
new file mode 100644
index 000000000..a951bc83a
--- /dev/null
+++ b/gfx/ycbcr/README
@@ -0,0 +1,29 @@
+This color conversion code is from the Chromium open source project available here:
+
+http://code.google.com/chromium/
+
+The code comes from svn revision 63840 on 2010-10-26.
+
+If you just want to check out this individual directory, use:
+
+svn co -r 63840 http://src.chromium.org/svn/trunk/src/media/base
+
+The code was copied from a Chromium svn checkout using the 'update.sh' script, which then applies patches for our build system and adds dynamic CPU detection.
+
+convert.patch contains the following changes:
+
+ * Change Chromium code to build using the Mozilla build system.
+ * Add runtime CPU detection for MMX.
+ * Move the default C implementation to work on all platforms.
+ * Change Chromium code to allow a picture region; the YUV conversion
+   will convert within this picture region only.
+ * Add YCbCr 4:4:4 support.
+ * Bug 619178 - Update CPU detection in yuv_convert to new SSE.h interface.
+ * Bug 616778 - Split yuv_convert FilterRows vectorized code into separate files so it can
+ be properly guarded with cpuid() calls.
+
+win64.patch: SSE2 optimization for Microsoft Visual C++ x64 version
+
+TypeFromSize.patch: Bug 656185 - Add a method to detect YUVType from plane sizes.
+
+QuellGccWarnings.patch: Bug 711895 - Avoid some GCC compilation warnings.
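
The "picture region" change listed above is the main API difference from
upstream: conversion can be restricted to a sub-rectangle of the coded frame.
A hedged usage sketch (variable names are illustrative; this uses the
signature as introduced by convert.patch below -- the final in-tree version
also takes a YUVColorSpace argument, as the YCbCrUtils.cpp caller later in
this commit shows):

    #include "yuv_convert.h"

    // H.264 decoders commonly emit 1920x1088 coded frames whose visible
    // picture is 1920x1080 at offset (0, 0); the 8 padding rows at the
    // bottom of the planes are simply never converted.
    void ConvertVisibleRegion(const uint8* y, const uint8* u, const uint8* v,
                              uint8* rgb, int y_stride, int uv_stride,
                              int rgb_stride) {
      mozilla::gfx::ConvertYCbCrToRGB32(y, u, v, rgb,
                                        0, 0,        // pic_x, pic_y
                                        1920, 1080,  // picture width, height
                                        y_stride, uv_stride, rgb_stride,
                                        mozilla::gfx::YV12);
    }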
diff --git a/gfx/ycbcr/TypeFromSize.patch b/gfx/ycbcr/TypeFromSize.patch
new file mode 100644
index 000000000..d08a19690
--- /dev/null
+++ b/gfx/ycbcr/TypeFromSize.patch
@@ -0,0 +1,58 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -26,16 +26,32 @@ namespace mozilla {
+
+ namespace gfx {
+
+ // 16.16 fixed point arithmetic
+ const int kFractionBits = 16;
+ const int kFractionMax = 1 << kFractionBits;
+ const int kFractionMask = ((1 << kFractionBits) - 1);
+
++YUVType TypeFromSize(int ywidth,
++ int yheight,
++ int cbcrwidth,
++ int cbcrheight)
++{
++ if (ywidth == cbcrwidth && yheight == cbcrheight) {
++ return YV24;
++ }
++ else if (ywidth / 2 == cbcrwidth && yheight == cbcrheight) {
++ return YV16;
++ }
++ else {
++ return YV12;
++ }
++}
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ void ConvertYCbCrToRGB32(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
+--- a/gfx/ycbcr/yuv_convert.h
++++ b/gfx/ycbcr/yuv_convert.h
+@@ -36,16 +36,18 @@ enum Rotate {
+ // Filter affects how scaling looks.
+ enum ScaleFilter {
+ FILTER_NONE = 0, // No filter (point sampled).
+ FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
+ FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
+ FILTER_BILINEAR = 3 // Bilinear filter.
+ };
+
++YUVType TypeFromSize(int ywidth, int yheight, int cbcrwidth, int cbcrheight);
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ // Pass in YV16/YV12 depending on source format
+ void ConvertYCbCrToRGB32(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int pic_x,
+ int pic_y,
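
TypeFromSize infers the chroma subsampling purely from plane dimensions, so
callers no longer pass a YUVType by hand. What it returns for a 1920x1080
luma plane, using illustrative sizes that match the logic above:

    #include "yuv_convert.h"
    using mozilla::gfx::TypeFromSize;
    using mozilla::gfx::YUVType;

    // Half-width, half-height chroma -> YV12 (4:2:0).
    YUVType a = TypeFromSize(1920, 1080, 960, 540);
    // Half-width, full-height chroma -> YV16 (4:2:2).
    YUVType b = TypeFromSize(1920, 1080, 960, 1080);
    // Full-size chroma -> YV24 (4:4:4).
    YUVType c = TypeFromSize(1920, 1080, 1920, 1080);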
diff --git a/gfx/ycbcr/YCbCrUtils.cpp b/gfx/ycbcr/YCbCrUtils.cpp
new file mode 100644
index 000000000..882197857
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.cpp
@@ -0,0 +1,157 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "gfx2DGlue.h"
+
+#include "YCbCrUtils.h"
+#include "yuv_convert.h"
+#include "ycbcr_to_rgb565.h"
+
+namespace mozilla {
+namespace gfx {
+
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+ SurfaceFormat& aSuggestedFormat,
+ IntSize& aSuggestedSize)
+{
+ YUVType yuvtype =
+ TypeFromSize(aData.mYSize.width,
+ aData.mYSize.height,
+ aData.mCbCrSize.width,
+ aData.mCbCrSize.height);
+
+ // 'prescale' is true if the scaling is to be done as part of the
+ // YCbCr to RGB conversion rather than on the RGB data when rendered.
+ bool prescale = aSuggestedSize.width > 0 && aSuggestedSize.height > 0 &&
+ aSuggestedSize != aData.mPicSize;
+
+ if (aSuggestedFormat == SurfaceFormat::R5G6B5_UINT16) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+ if (prescale &&
+ !IsScaleYCbCrToRGB565Fast(aData.mPicX,
+ aData.mPicY,
+ aData.mPicSize.width,
+ aData.mPicSize.height,
+ aSuggestedSize.width,
+ aSuggestedSize.height,
+ yuvtype,
+ FILTER_BILINEAR) &&
+ IsConvertYCbCrToRGB565Fast(aData.mPicX,
+ aData.mPicY,
+ aData.mPicSize.width,
+ aData.mPicSize.height,
+ yuvtype)) {
+ prescale = false;
+ }
+#else
+ // yuv2rgb16 function not available
+ aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+#endif
+ }
+ else if (aSuggestedFormat != SurfaceFormat::B8G8R8X8) {
+ // No other formats are currently supported.
+ aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+ }
+ if (aSuggestedFormat == SurfaceFormat::B8G8R8X8) {
+ /* ScaleYCbCrToRGB32 does not support a picture offset, nor 4:4:4 data.
+ See bugs 639415 and 640073. */
+ if (aData.mPicX != 0 || aData.mPicY != 0 || yuvtype == YV24)
+ prescale = false;
+ }
+ if (!prescale) {
+ aSuggestedSize = aData.mPicSize;
+ }
+}
+
+void
+ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+ const SurfaceFormat& aDestFormat,
+ const IntSize& aDestSize,
+ unsigned char* aDestBuffer,
+ int32_t aStride)
+{
+ // ConvertYCbCrToRGB et al. assume the chroma planes are rounded up if the
+ // luma plane is odd sized.
+ MOZ_ASSERT((aData.mCbCrSize.width == aData.mYSize.width ||
+ aData.mCbCrSize.width == (aData.mYSize.width + 1) >> 1) &&
+ (aData.mCbCrSize.height == aData.mYSize.height ||
+ aData.mCbCrSize.height == (aData.mYSize.height + 1) >> 1));
+ YUVType yuvtype =
+ TypeFromSize(aData.mYSize.width,
+ aData.mYSize.height,
+ aData.mCbCrSize.width,
+ aData.mCbCrSize.height);
+
+ // Convert from YCbCr to RGB now, scaling the image if needed.
+ if (aDestSize != aData.mPicSize) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+ if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+ ScaleYCbCrToRGB565(aData.mYChannel,
+ aData.mCbChannel,
+ aData.mCrChannel,
+ aDestBuffer,
+ aData.mPicX,
+ aData.mPicY,
+ aData.mPicSize.width,
+ aData.mPicSize.height,
+ aDestSize.width,
+ aDestSize.height,
+ aData.mYStride,
+ aData.mCbCrStride,
+ aStride,
+ yuvtype,
+ FILTER_BILINEAR);
+ } else
+#endif
+ ScaleYCbCrToRGB32(aData.mYChannel, //
+ aData.mCbChannel,
+ aData.mCrChannel,
+ aDestBuffer,
+ aData.mPicSize.width,
+ aData.mPicSize.height,
+ aDestSize.width,
+ aDestSize.height,
+ aData.mYStride,
+ aData.mCbCrStride,
+ aStride,
+ yuvtype,
+ aData.mYUVColorSpace,
+ FILTER_BILINEAR);
+ } else { // no prescale
+#if defined(HAVE_YCBCR_TO_RGB565)
+ if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+ ConvertYCbCrToRGB565(aData.mYChannel,
+ aData.mCbChannel,
+ aData.mCrChannel,
+ aDestBuffer,
+ aData.mPicX,
+ aData.mPicY,
+ aData.mPicSize.width,
+ aData.mPicSize.height,
+ aData.mYStride,
+ aData.mCbCrStride,
+ aStride,
+ yuvtype);
+ } else // aDestFormat != SurfaceFormat::R5G6B5_UINT16
+#endif
+ ConvertYCbCrToRGB32(aData.mYChannel, //
+ aData.mCbChannel,
+ aData.mCrChannel,
+ aDestBuffer,
+ aData.mPicX,
+ aData.mPicY,
+ aData.mPicSize.width,
+ aData.mPicSize.height,
+ aData.mYStride,
+ aData.mCbCrStride,
+ aStride,
+ yuvtype,
+ aData.mYUVColorSpace);
+ }
+}
+
+} // namespace gfx
+} // namespace mozilla
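
The intended call pattern for the two functions above, sketched under the
assumption of a B8G8R8X8 destination and a caller-managed buffer (real
callers allocate through Moz2D surfaces; names here are illustrative):

    #include "YCbCrUtils.h"

    void DecodeFrameToRGB(const mozilla::layers::PlanarYCbCrData& data) {
      using namespace mozilla::gfx;

      // Step 1: negotiate format and size; both arguments may be adjusted.
      SurfaceFormat format = SurfaceFormat::B8G8R8X8;
      IntSize size(data.mPicSize.width, data.mPicSize.height);
      GetYCbCrToRGBDestFormatAndSize(data, format, size);

      // Step 2: convert (and scale, if size != mPicSize) in one pass.
      int32_t stride = size.width * 4;  // 4 bytes per pixel for RGB32
      unsigned char* dest = new unsigned char[stride * size.height];
      ConvertYCbCrToRGB(data, format, size, dest, stride);
      // ... hand `dest` off to the caller, then release it ...
      delete[] dest;
    }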
diff --git a/gfx/ycbcr/YCbCrUtils.h b/gfx/ycbcr/YCbCrUtils.h
new file mode 100644
index 000000000..1cd2e1c4f
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.h
@@ -0,0 +1,30 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_GFX_UTILS_H_
+#define MOZILLA_GFX_UTILS_H_
+
+#include "mozilla/gfx/Types.h"
+#include "ImageContainer.h"
+
+namespace mozilla {
+namespace gfx {
+
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+ SurfaceFormat& aSuggestedFormat,
+ IntSize& aSuggestedSize);
+
+void
+ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+ const SurfaceFormat& aDestFormat,
+ const IntSize& aDestSize,
+ unsigned char* aDestBuffer,
+ int32_t aStride);
+
+} // namespace gfx
+} // namespace mozilla
+
+#endif /* MOZILLA_GFX_UTILS_H_ */
diff --git a/gfx/ycbcr/chromium_types.h b/gfx/ycbcr/chromium_types.h
new file mode 100644
index 000000000..dceac4766
--- /dev/null
+++ b/gfx/ycbcr/chromium_types.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef GFX_CHROMIUMTYPES_H
+#define GFX_CHROMIUMTYPES_H
+
+#include <stdint.h>
+
+#include "libyuv/basic_types.h"
+
+// From Chromium build_config.h:
+// Processor architecture detection. For more info on what's defined, see:
+// http://msdn.microsoft.com/en-us/library/b0084kay.aspx
+// http://www.agner.org/optimize/calling_conventions.pdf
+// or with gcc, run: "echo | gcc -E -dM -"
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_64 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_32 1
+#define ARCH_CPU_X86 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARMEL 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ppc__) || defined(__powerpc) || defined(__PPC__)
+#define ARCH_CPU_PPC_FAMILY 1
+#define ARCH_CPU_PPC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__sparc)
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__sparcv9)
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(__aarch64__)
+#define ARCH_CPU_AARCH64_FAMILY 1
+#define ARCH_CPU_AARCH64 1
+#define ARCH_CPU_64_BITS 1
+#else
+#warning Please add support for your architecture in chromium_types.h
+#endif
+
+#endif // GFX_CHROMIUMTYPES_H
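
These ARCH_CPU_* macros are consumed by the rest of this directory to select
SIMD paths; yuv_row.h below, for example, only defines a real EMMS() on
32-bit x86. A compressed illustration of the consumption pattern (the strings
are illustrative; the guard shapes mirror yuv_row.h):

    #include "chromium_types.h"

    #if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
    static const char* const kArchNote = "x86-32: MMX rows need a trailing emms";
    #elif defined(ARCH_CPU_X86_64)
    static const char* const kArchNote = "x86-64: SSE2 baseline, emms-free";
    #else
    static const char* const kArchNote = "other: portable C row functions";
    #endif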
diff --git a/gfx/ycbcr/convert.patch b/gfx/ycbcr/convert.patch
new file mode 100644
index 000000000..e39f923b3
--- /dev/null
+++ b/gfx/ycbcr/convert.patch
@@ -0,0 +1,3143 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -6,145 +6,102 @@
+ // http://www.fourcc.org/yuv.php
+ // The actual conversion is best described here
+ // http://en.wikipedia.org/wiki/YUV
+ // An article on optimizing YUV conversion using tables instead of multiplies
+ // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
+ //
+ // YV12 is a full plane of Y and a half height, half width chroma planes
+ // YV16 is a full plane of Y and a full height, half width chroma planes
++// YV24 is a full plane of Y and a full height, full width chroma planes
+ //
+ // ARGB pixel format is output, which on little endian is stored as BGRA.
+ // The alpha is set to 255, allowing the application to use RGBA or RGB32.
+
+-#include "media/base/yuv_convert.h"
++#include "yuv_convert.h"
+
+ // Header for low level row functions.
+-#include "media/base/yuv_row.h"
+-
+-#if USE_MMX
+-#if defined(_MSC_VER)
+-#include <intrin.h>
+-#else
+-#include <mmintrin.h>
+-#endif
+-#endif
+-
+-#if USE_SSE2
+-#include <emmintrin.h>
+-#endif
+-
+-namespace media {
+-
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
++
++namespace mozilla {
++
++namespace gfx {
++
+ // 16.16 fixed point arithmetic
+ const int kFractionBits = 16;
+ const int kFractionMax = 1 << kFractionBits;
+ const int kFractionMask = ((1 << kFractionBits) - 1);
+
+ // Convert a frame of YUV to 32 bit ARGB.
+-void ConvertYUVToRGB32(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int height,
+- int y_pitch,
+- int uv_pitch,
+- int rgb_pitch,
+- YUVType yuv_type) {
+- unsigned int y_shift = yuv_type;
+- for (int y = 0; y < height; ++y) {
+- uint8* rgb_row = rgb_buf + y * rgb_pitch;
+- const uint8* y_ptr = y_buf + y * y_pitch;
+- const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
+- const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
+-
+- FastConvertYUVToRGB32Row(y_ptr,
+- u_ptr,
+- v_ptr,
+- rgb_row,
+- width);
+- }
++void ConvertYCbCrToRGB32(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int pic_x,
++ int pic_y,
++ int pic_width,
++ int pic_height,
++ int y_pitch,
++ int uv_pitch,
++ int rgb_pitch,
++ YUVType yuv_type) {
++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
++ unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
++ // Test for SSE because the optimized code uses movntq, which is not part of MMX.
++ bool has_sse = supports_mmx() && supports_sse();
++ // There is no optimized YV24 SSE routine so we check for this and
++ // fall back to the C code.
++ has_sse &= yuv_type != YV24;
++ bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
++ int x_width = odd_pic_x ? pic_width - 1 : pic_width;
++
++ for (int y = pic_y; y < pic_height + pic_y; ++y) {
++ uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
++ const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
++ const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
++ const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
++
++ if (odd_pic_x) {
++ // Handle the single odd pixel manually and use the
++ // fast routines for the remaining.
++ FastConvertYUVToRGB32Row_C(y_ptr++,
++ u_ptr++,
++ v_ptr++,
++ rgb_row,
++ 1,
++ x_shift);
++ rgb_row += 4;
++ }
++
++ if (has_sse) {
++ FastConvertYUVToRGB32Row(y_ptr,
++ u_ptr,
++ v_ptr,
++ rgb_row,
++ x_width);
++ }
++ else {
++ FastConvertYUVToRGB32Row_C(y_ptr,
++ u_ptr,
++ v_ptr,
++ rgb_row,
++ x_width,
++ x_shift);
++ }
++ }
+
+ // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
+- EMMS();
+-}
+-
+-#if USE_SSE2
+-// FilterRows combines two rows of the image using linear interpolation.
+-// SSE2 version does 16 pixels at a time
+-
+-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+- int source_width, int source_y_fraction) {
+- __m128i zero = _mm_setzero_si128();
+- __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+- __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
+-
+- const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+- const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+- __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+- __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+-
+- do {
+- __m128i y0 = _mm_loadu_si128(y0_ptr128);
+- __m128i y1 = _mm_loadu_si128(y1_ptr128);
+- __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+- __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+- y0 = _mm_unpacklo_epi8(y0, zero);
+- y1 = _mm_unpacklo_epi8(y1, zero);
+- y0 = _mm_mullo_epi16(y0, y0_fraction);
+- y1 = _mm_mullo_epi16(y1, y1_fraction);
+- y2 = _mm_mullo_epi16(y2, y0_fraction);
+- y3 = _mm_mullo_epi16(y3, y1_fraction);
+- y0 = _mm_add_epi16(y0, y1);
+- y2 = _mm_add_epi16(y2, y3);
+- y0 = _mm_srli_epi16(y0, 8);
+- y2 = _mm_srli_epi16(y2, 8);
+- y0 = _mm_packus_epi16(y0, y2);
+- *dest128++ = y0;
+- ++y0_ptr128;
+- ++y1_ptr128;
+- } while (dest128 < end128);
+-}
+-#elif USE_MMX
+-// MMX version does 8 pixels at a time
+-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+- int source_width, int source_y_fraction) {
+- __m64 zero = _mm_setzero_si64();
+- __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+- __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
+-
+- const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+- const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+- __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+- __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+-
+- do {
+- __m64 y0 = *y0_ptr64++;
+- __m64 y1 = *y1_ptr64++;
+- __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+- __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+- y0 = _mm_unpacklo_pi8(y0, zero);
+- y1 = _mm_unpacklo_pi8(y1, zero);
+- y0 = _mm_mullo_pi16(y0, y0_fraction);
+- y1 = _mm_mullo_pi16(y1, y1_fraction);
+- y2 = _mm_mullo_pi16(y2, y0_fraction);
+- y3 = _mm_mullo_pi16(y3, y1_fraction);
+- y0 = _mm_add_pi16(y0, y1);
+- y2 = _mm_add_pi16(y2, y3);
+- y0 = _mm_srli_pi16(y0, 8);
+- y2 = _mm_srli_pi16(y2, 8);
+- y0 = _mm_packs_pu16(y0, y2);
+- *dest64++ = y0;
+- } while (dest64 < end64);
+-}
+-#else // no MMX or SSE2
++ if (has_sse)
++ EMMS();
++}
++
+ // C version does 8 at a time to mimic MMX code
+-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+- int source_width, int source_y_fraction) {
++static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++ int source_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ uint8* end = ybuf + source_width;
+ do {
+ ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
+ ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
+ ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
+ ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
+@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
+ ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
+ ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
+ ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
+ y0_ptr += 8;
+ y1_ptr += 8;
+ ybuf += 8;
+ } while (ybuf < end);
+ }
+-#endif
++
++#ifdef MOZILLA_MAY_SUPPORT_MMX
++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++ int source_width, int source_y_fraction);
++#endif
++
++#ifdef MOZILLA_MAY_SUPPORT_SSE2
++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++ int source_width, int source_y_fraction);
++#endif
++
++static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
++ const uint8* y1_ptr, int source_width,
++ int source_y_fraction) {
++#ifdef MOZILLA_MAY_SUPPORT_SSE2
++ if (mozilla::supports_sse2()) {
++ FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
++ return;
++ }
++#endif
++
++#ifdef MOZILLA_MAY_SUPPORT_MMX
++ if (mozilla::supports_mmx()) {
++ FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
++ return;
++ }
++#endif
++
++ FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
++}
+
+
+ // Scale a frame of YUV to 32 bit ARGB.
+-void ScaleYUVToRGB32(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int source_width,
+- int source_height,
+- int width,
+- int height,
+- int y_pitch,
+- int uv_pitch,
+- int rgb_pitch,
+- YUVType yuv_type,
+- Rotate view_rotate,
+- ScaleFilter filter) {
++void ScaleYCbCrToRGB32(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int source_width,
++ int source_height,
++ int width,
++ int height,
++ int y_pitch,
++ int uv_pitch,
++ int rgb_pitch,
++ YUVType yuv_type,
++ Rotate view_rotate,
++ ScaleFilter filter) {
++ bool has_mmx = supports_mmx();
++
+ // 4096 allows 3 buffers to fit in 12k.
+ // Helps performance on CPU with 16K L1 cache.
+ // Large enough for 3830x2160 and 30" displays which are 2560x1600.
+ const int kFilterBufferSize = 4096;
+ // Disable filtering if the screen is too big (to avoid buffer overflows).
+ // This should never happen to regular users: they don't have monitors
+ // wider than 4096 pixels.
+ // TODO(fbarchard): Allow rotated videos to filter.
+ if (source_width > kFilterBufferSize || view_rotate)
+ filter = FILTER_NONE;
+
+- unsigned int y_shift = yuv_type;
++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+ // Diagram showing origin and direction of source sampling.
+ // ->0 4<-
+ // 7 3
+ //
+ // 6 5
+ // ->1 2<-
+ // Rotations that start at right side of image.
+ if ((view_rotate == ROTATE_180) ||
+@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
+ int source_uv_fraction =
+ ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
+
+ const uint8* y_ptr = y0_ptr;
+ const uint8* u_ptr = u0_ptr;
+ const uint8* v_ptr = v0_ptr;
+ // Apply vertical filtering if necessary.
+ // TODO(fbarchard): Remove memcpy when not necessary.
+- if (filter & media::FILTER_BILINEAR_V) {
++ if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
+ if (yscale_fixed != kFractionMax &&
+ source_y_fraction && ((source_y + 1) < source_height)) {
+ FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ } else {
+ memcpy(ybuf, y0_ptr, source_width);
+ }
+ y_ptr = ybuf;
+ ybuf[source_width] = ybuf[source_width-1];
+@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
+ u_ptr = ubuf;
+ v_ptr = vbuf;
+ ubuf[uv_source_width] = ubuf[uv_source_width - 1];
+ vbuf[uv_source_width] = vbuf[uv_source_width - 1];
+ }
+ if (source_dx == kFractionMax) { // Not scaled
+ FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width);
+- } else {
+- if (filter & FILTER_BILINEAR_H) {
++ } else if (filter & FILTER_BILINEAR_H) {
+ LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ } else {
+ // Specialized scalers and rotation.
+-#if USE_MMX && defined(_MSC_VER)
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
++ if(mozilla::supports_sse()) {
+ if (width == (source_width * 2)) {
+- DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+- dest_pixel, width);
++ DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
++ dest_pixel, width);
+ } else if ((source_dx & kFractionMask) == 0) {
+ // Scaling by integer scale factor. ie half.
+- ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+- dest_pixel, width,
+- source_dx >> kFractionBits);
++ ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
++ dest_pixel, width,
++ source_dx >> kFractionBits);
+ } else if (source_dx_uv == source_dx) { // Not rotated.
+ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ } else {
+- RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+- dest_pixel, width,
+- source_dx >> kFractionBits,
+- source_dx_uv >> kFractionBits);
++ RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
++ dest_pixel, width,
++ source_dx >> kFractionBits,
++ source_dx_uv >> kFractionBits);
+ }
++ }
++ else {
++ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
++ dest_pixel, width, source_dx);
++ }
+ #else
+- ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+- dest_pixel, width, source_dx);
+-#endif
+- }
++ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
++ dest_pixel, width, source_dx);
++#endif
+ }
+ }
+ // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
+- EMMS();
+-}
+-
+-} // namespace media
++ if (has_mmx)
++ EMMS();
++}
++
++} // namespace gfx
++} // namespace mozilla
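
The scaling code above steps through the source image in the 16.16
fixed-point format set up by kFractionBits/kFractionMask: x >> 16 is the
integer source index and x & kFractionMask the subpixel fraction. A
standalone worked check (not code from the tree; the source_dx formula is
the conventional one for this representation):

    #include <cassert>

    int main() {
      const int kFractionBits = 16;
      const int kFractionMask = (1 << kFractionBits) - 1;

      // Downscale 640 source pixels to 256 destination pixels.
      int source_width = 640, dest_width = 256;
      int source_dx = (source_width << kFractionBits) / dest_width;
      assert(source_dx == 0x28000);           // 2.5 in 16.16

      int x = 3 * source_dx;                  // source position of dest pixel 3
      assert((x >> kFractionBits) == 7);      // integer part: source pixel 7
      assert((x & kFractionMask) == 0x8000);  // fraction: exactly 0.5
      return 0;
    }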
+diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
+--- a/gfx/ycbcr/yuv_convert.h
++++ b/gfx/ycbcr/yuv_convert.h
+@@ -1,72 +1,79 @@
+ // Copyright (c) 2010 The Chromium Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+
+ #ifndef MEDIA_BASE_YUV_CONVERT_H_
+ #define MEDIA_BASE_YUV_CONVERT_H_
+
+-#include "base/basictypes.h"
+-
+-namespace media {
+-
++#include "chromium_types.h"
++#include "gfxCore.h"
++
++namespace mozilla {
++
++namespace gfx {
++
+ // Type of YUV surface.
+ // The value of these enums matter as they are used to shift vertical indices.
+ enum YUVType {
+- YV16 = 0, // YV16 is half width and full height chroma channels.
+- YV12 = 1, // YV12 is half width and half height chroma channels.
++ YV12 = 0, // YV12 is half width and half height chroma channels.
++ YV16 = 1, // YV16 is half width and full height chroma channels.
++ YV24 = 2 // YV24 is full width and full height chroma channels.
+ };
+
+ // Mirror means flip the image horizontally, as in looking in a mirror.
+ // Rotate happens after mirroring.
+ enum Rotate {
+ ROTATE_0, // Rotation off.
+ ROTATE_90, // Rotate clockwise.
+ ROTATE_180, // Rotate upside down.
+ ROTATE_270, // Rotate counter clockwise.
+ MIRROR_ROTATE_0, // Mirror horizontally.
+ MIRROR_ROTATE_90, // Mirror then Rotate clockwise.
+ MIRROR_ROTATE_180, // Mirror vertically.
+- MIRROR_ROTATE_270, // Transpose.
++ MIRROR_ROTATE_270 // Transpose.
+ };
+
+ // Filter affects how scaling looks.
+ enum ScaleFilter {
+ FILTER_NONE = 0, // No filter (point sampled).
+ FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
+ FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
+- FILTER_BILINEAR = 3, // Bilinear filter.
++ FILTER_BILINEAR = 3 // Bilinear filter.
+ };
+
+ // Convert a frame of YUV to 32 bit ARGB.
+ // Pass in YV16/YV12 depending on source format
+-void ConvertYUVToRGB32(const uint8* yplane,
+- const uint8* uplane,
+- const uint8* vplane,
+- uint8* rgbframe,
+- int width,
+- int height,
+- int ystride,
+- int uvstride,
+- int rgbstride,
+- YUVType yuv_type);
++void ConvertYCbCrToRGB32(const uint8* yplane,
++ const uint8* uplane,
++ const uint8* vplane,
++ uint8* rgbframe,
++ int pic_x,
++ int pic_y,
++ int pic_width,
++ int pic_height,
++ int ystride,
++ int uvstride,
++ int rgbstride,
++ YUVType yuv_type);
+
+ // Scale a frame of YUV to 32 bit ARGB.
+ // Supports rotation and mirroring.
+-void ScaleYUVToRGB32(const uint8* yplane,
+- const uint8* uplane,
+- const uint8* vplane,
+- uint8* rgbframe,
+- int source_width,
+- int source_height,
+- int width,
+- int height,
+- int ystride,
+- int uvstride,
+- int rgbstride,
+- YUVType yuv_type,
+- Rotate view_rotate,
+- ScaleFilter filter);
+-
+-} // namespace media
+-
++void ScaleYCbCrToRGB32(const uint8* yplane,
++ const uint8* uplane,
++ const uint8* vplane,
++ uint8* rgbframe,
++ int source_width,
++ int source_height,
++ int width,
++ int height,
++ int ystride,
++ int uvstride,
++ int rgbstride,
++ YUVType yuv_type,
++ Rotate view_rotate,
++ ScaleFilter filter);
++
++} // namespace gfx
++} // namespace mozilla
++
+ #endif // MEDIA_BASE_YUV_CONVERT_H_
+diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_convert_mmx.cpp
+@@ -0,0 +1,45 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++#include <mmintrin.h>
++#include "yuv_row.h"
++
++namespace mozilla {
++namespace gfx {
++
++// FilterRows combines two rows of the image using linear interpolation.
++// MMX version does 8 pixels at a time.
++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++ int source_width, int source_y_fraction) {
++ __m64 zero = _mm_setzero_si64();
++ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
++ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
++
++ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
++ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
++ __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
++ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
++
++ do {
++ __m64 y0 = *y0_ptr64++;
++ __m64 y1 = *y1_ptr64++;
++ __m64 y2 = _mm_unpackhi_pi8(y0, zero);
++ __m64 y3 = _mm_unpackhi_pi8(y1, zero);
++ y0 = _mm_unpacklo_pi8(y0, zero);
++ y1 = _mm_unpacklo_pi8(y1, zero);
++ y0 = _mm_mullo_pi16(y0, y0_fraction);
++ y1 = _mm_mullo_pi16(y1, y1_fraction);
++ y2 = _mm_mullo_pi16(y2, y0_fraction);
++ y3 = _mm_mullo_pi16(y3, y1_fraction);
++ y0 = _mm_add_pi16(y0, y1);
++ y2 = _mm_add_pi16(y2, y3);
++ y0 = _mm_srli_pi16(y0, 8);
++ y2 = _mm_srli_pi16(y2, 8);
++ y0 = _mm_packs_pu16(y0, y2);
++ *dest64++ = y0;
++ } while (dest64 < end64);
++}
++
++}
++}
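
The MMX kernel above, the SSE2 version that follows, and FilterRows_C all
compute the same per-byte blend: out = (y0 * (256 - f) + y1 * f) >> 8, with
f = source_y_fraction in [0, 256). A one-lane scalar check (illustrative,
not tree code):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t y0 = 100, y1 = 200;
      int f = 64;  // 64/256 = 0.25 of the way from row y0 to row y1
      uint8_t out = (uint8_t)((y0 * (256 - f) + y1 * f) >> 8);
      assert(out == 125);  // (19200 + 12800) >> 8 = 125
      return 0;
    }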
+diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_convert_sse2.cpp
+@@ -0,0 +1,47 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++#include <emmintrin.h>
++#include "yuv_row.h"
++
++namespace mozilla {
++namespace gfx {
++
++// FilterRows combines two rows of the image using linear interpolation.
++// SSE2 version does 16 pixels at a time.
++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
++ int source_width, int source_y_fraction) {
++ __m128i zero = _mm_setzero_si128();
++ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
++ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
++
++ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
++ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
++ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
++ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
++
++ do {
++ __m128i y0 = _mm_loadu_si128(y0_ptr128);
++ __m128i y1 = _mm_loadu_si128(y1_ptr128);
++ __m128i y2 = _mm_unpackhi_epi8(y0, zero);
++ __m128i y3 = _mm_unpackhi_epi8(y1, zero);
++ y0 = _mm_unpacklo_epi8(y0, zero);
++ y1 = _mm_unpacklo_epi8(y1, zero);
++ y0 = _mm_mullo_epi16(y0, y0_fraction);
++ y1 = _mm_mullo_epi16(y1, y1_fraction);
++ y2 = _mm_mullo_epi16(y2, y0_fraction);
++ y3 = _mm_mullo_epi16(y3, y1_fraction);
++ y0 = _mm_add_epi16(y0, y1);
++ y2 = _mm_add_epi16(y2, y3);
++ y0 = _mm_srli_epi16(y0, 8);
++ y2 = _mm_srli_epi16(y2, 8);
++ y0 = _mm_packus_epi16(y0, y2);
++ *dest128++ = y0;
++ ++y0_ptr128;
++ ++y1_ptr128;
++ } while (dest128 < end128);
++}
++
++}
++}
+diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
+--- a/gfx/ycbcr/yuv_row.h
++++ b/gfx/ycbcr/yuv_row.h
+@@ -5,109 +5,133 @@
+ // yuv_row internal functions to handle YUV conversion and scaling to RGB.
+ // These functions are used from both yuv_convert.cc and yuv_scale.cc.
+
+ // TODO(fbarchard): Write function that can handle rotation and scaling.
+
+ #ifndef MEDIA_BASE_YUV_ROW_H_
+ #define MEDIA_BASE_YUV_ROW_H_
+
+-#include "base/basictypes.h"
++#include "chromium_types.h"
+
+ extern "C" {
+ // Can only do 1x.
+ // This is the second fastest of the scalers.
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+-// Can do 1x, half size or any scale down by an integer amount.
+-// Step can be negative (mirroring, rotate 180).
+-// This is the third fastest of the scalers.
+-void ConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int step);
+-
+-// Rotate is like Convert, but applies different step to Y versus U and V.
+-// This allows rotation by 90 or 270, by stepping by stride.
+-// This is the forth fastest of the scalers.
+-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
++void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+- int ystep,
+- int uvstep);
++ unsigned int x_shift);
++
++void FastConvertYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width);
++
++// Can do 1x, half size or any scale down by an integer amount.
++// Step can be negative (mirroring, rotate 180).
++// This is the third fastest of the scalers.
++// Only defined on Windows x86-32.
++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int step);
++
++// Rotate is like Convert, but applies different step to Y versus U and V.
++// This allows rotation by 90 or 270, by stepping by stride.
++// This is the fourth fastest of the scalers.
++// Only defined on Windows x86-32.
++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int ystep,
++ int uvstep);
+
+ // Doubler does 4 pixels at a time. Each pixel is replicated.
+ // This is the fastest of the scalers.
+-void DoubleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width);
++// Only defined on Windows x86-32.
++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width);
+
+ // Handles arbitrary scaling up or down.
+ // Mirroring is supported, but not 90 or 270 degree rotation.
+ // Chroma is under sampled every 2 pixels for performance.
+ void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx);
++
++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx);
++
+ // Handles arbitrary scaling up or down with bilinear filtering.
+ // Mirroring is supported, but not 90 or 270 degree rotation.
+ // Chroma is under sampled every 2 pixels for performance.
+ // This is the slowest of the scalers.
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx);
++
++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx);
++
++
+ #if defined(_MSC_VER)
+ #define SIMD_ALIGNED(var) __declspec(align(16)) var
+ #else
+ #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+ #endif
+ extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
+
+-// Method to force C version.
+-//#define USE_MMX 0
+-//#define USE_SSE2 0
+-
+-#if !defined(USE_MMX)
+-// Windows, Mac and Linux/BSD use MMX
+-#if defined(__MMX__) || defined(_MSC_VER)
+-#define USE_MMX 1
+-#else
+-#define USE_MMX 0
+-#endif
+-#endif
+-
+-#if !defined(USE_SSE2)
+-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
+-#define USE_SSE2 1
+-#else
+-#define USE_SSE2 0
+-#endif
+-#endif
+-
+ // x64 uses MMX2 (SSE) so emms is not required.
+ // Warning C4799: function has no EMMS instruction.
+ // EMMS() is slow and should be called by the calling function once per image.
+-#if USE_MMX && !defined(ARCH_CPU_X86_64)
++#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
+ #if defined(_MSC_VER)
+ #define EMMS() __asm emms
+ #pragma warning(disable: 4799)
+ #else
+ #define EMMS() asm("emms")
+ #endif
+ #else
+ #define EMMS()
+diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
+--- a/gfx/ycbcr/yuv_row_c.cpp
++++ b/gfx/ycbcr/yuv_row_c.cpp
+@@ -1,812 +1,18 @@
+ // Copyright (c) 2010 The Chromium Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+
+-#include "media/base/yuv_row.h"
+-
+-#ifdef _DEBUG
+-#include "base/logging.h"
+-#else
++#include "yuv_row.h"
++
+ #define DCHECK(a)
+-#endif
+
+ extern "C" {
+
+-#if USE_SSE2 && defined(ARCH_CPU_X86_64)
+-
+-// AMD64 ABI uses register paremters.
+-void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
+- const uint8* u_buf, // rsi
+- const uint8* v_buf, // rdx
+- uint8* rgb_buf, // rcx
+- int width) { // r8
+- asm(
+- "jmp convertend\n"
+-"convertloop:"
+- "movzb (%1),%%r10\n"
+- "add $0x1,%1\n"
+- "movzb (%2),%%r11\n"
+- "add $0x1,%2\n"
+- "movq 2048(%5,%%r10,8),%%xmm0\n"
+- "movzb (%0),%%r10\n"
+- "movq 4096(%5,%%r11,8),%%xmm1\n"
+- "movzb 0x1(%0),%%r11\n"
+- "paddsw %%xmm1,%%xmm0\n"
+- "movq (%5,%%r10,8),%%xmm2\n"
+- "add $0x2,%0\n"
+- "movq (%5,%%r11,8),%%xmm3\n"
+- "paddsw %%xmm0,%%xmm2\n"
+- "paddsw %%xmm0,%%xmm3\n"
+- "shufps $0x44,%%xmm3,%%xmm2\n"
+- "psraw $0x6,%%xmm2\n"
+- "packuswb %%xmm2,%%xmm2\n"
+- "movq %%xmm2,0x0(%3)\n"
+- "add $0x8,%3\n"
+-"convertend:"
+- "sub $0x2,%4\n"
+- "jns convertloop\n"
+-
+-"convertnext:"
+- "add $0x1,%4\n"
+- "js convertdone\n"
+-
+- "movzb (%1),%%r10\n"
+- "movq 2048(%5,%%r10,8),%%xmm0\n"
+- "movzb (%2),%%r10\n"
+- "movq 4096(%5,%%r10,8),%%xmm1\n"
+- "paddsw %%xmm1,%%xmm0\n"
+- "movzb (%0),%%r10\n"
+- "movq (%5,%%r10,8),%%xmm1\n"
+- "paddsw %%xmm0,%%xmm1\n"
+- "psraw $0x6,%%xmm1\n"
+- "packuswb %%xmm1,%%xmm1\n"
+- "movd %%xmm1,0x0(%3)\n"
+-"convertdone:"
+- :
+- : "r"(y_buf), // %0
+- "r"(u_buf), // %1
+- "r"(v_buf), // %2
+- "r"(rgb_buf), // %3
+- "r"(width), // %4
+- "r" (kCoefficientsRgbY) // %5
+- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+-);
+-}
+-
+-void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
+- const uint8* u_buf, // rsi
+- const uint8* v_buf, // rdx
+- uint8* rgb_buf, // rcx
+- int width, // r8
+- int source_dx) { // r9
+- asm(
+- "xor %%r11,%%r11\n"
+- "sub $0x2,%4\n"
+- "js scalenext\n"
+-
+-"scaleloop:"
+- "mov %%r11,%%r10\n"
+- "sar $0x11,%%r10\n"
+- "movzb (%1,%%r10,1),%%rax\n"
+- "movq 2048(%5,%%rax,8),%%xmm0\n"
+- "movzb (%2,%%r10,1),%%rax\n"
+- "movq 4096(%5,%%rax,8),%%xmm1\n"
+- "lea (%%r11,%6),%%r10\n"
+- "sar $0x10,%%r11\n"
+- "movzb (%0,%%r11,1),%%rax\n"
+- "paddsw %%xmm1,%%xmm0\n"
+- "movq (%5,%%rax,8),%%xmm1\n"
+- "lea (%%r10,%6),%%r11\n"
+- "sar $0x10,%%r10\n"
+- "movzb (%0,%%r10,1),%%rax\n"
+- "movq (%5,%%rax,8),%%xmm2\n"
+- "paddsw %%xmm0,%%xmm1\n"
+- "paddsw %%xmm0,%%xmm2\n"
+- "shufps $0x44,%%xmm2,%%xmm1\n"
+- "psraw $0x6,%%xmm1\n"
+- "packuswb %%xmm1,%%xmm1\n"
+- "movq %%xmm1,0x0(%3)\n"
+- "add $0x8,%3\n"
+- "sub $0x2,%4\n"
+- "jns scaleloop\n"
+-
+-"scalenext:"
+- "add $0x1,%4\n"
+- "js scaledone\n"
+-
+- "mov %%r11,%%r10\n"
+- "sar $0x11,%%r10\n"
+- "movzb (%1,%%r10,1),%%rax\n"
+- "movq 2048(%5,%%rax,8),%%xmm0\n"
+- "movzb (%2,%%r10,1),%%rax\n"
+- "movq 4096(%5,%%rax,8),%%xmm1\n"
+- "paddsw %%xmm1,%%xmm0\n"
+- "sar $0x10,%%r11\n"
+- "movzb (%0,%%r11,1),%%rax\n"
+- "movq (%5,%%rax,8),%%xmm1\n"
+- "paddsw %%xmm0,%%xmm1\n"
+- "psraw $0x6,%%xmm1\n"
+- "packuswb %%xmm1,%%xmm1\n"
+- "movd %%xmm1,0x0(%3)\n"
+-
+-"scaledone:"
+- :
+- : "r"(y_buf), // %0
+- "r"(u_buf), // %1
+- "r"(v_buf), // %2
+- "r"(rgb_buf), // %3
+- "r"(width), // %4
+- "r" (kCoefficientsRgbY), // %5
+- "r"(static_cast<long>(source_dx)) // %6
+- : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
+-);
+-}
+-
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
+- asm(
+- "xor %%r11,%%r11\n" // x = 0
+- "sub $0x2,%4\n"
+- "js .lscalenext\n"
+- "cmp $0x20000,%6\n" // if source_dx >= 2.0
+- "jl .lscalehalf\n"
+- "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
+-".lscalehalf:"
+-
+-".lscaleloop:"
+- "mov %%r11,%%r10\n"
+- "sar $0x11,%%r10\n"
+-
+- "movzb (%1, %%r10, 1), %%r13 \n"
+- "movzb 1(%1, %%r10, 1), %%r14 \n"
+- "mov %%r11, %%rax \n"
+- "and $0x1fffe, %%rax \n"
+- "imul %%rax, %%r14 \n"
+- "xor $0x1fffe, %%rax \n"
+- "imul %%rax, %%r13 \n"
+- "add %%r14, %%r13 \n"
+- "shr $17, %%r13 \n"
+- "movq 2048(%5,%%r13,8), %%xmm0\n"
+-
+- "movzb (%2, %%r10, 1), %%r13 \n"
+- "movzb 1(%2, %%r10, 1), %%r14 \n"
+- "mov %%r11, %%rax \n"
+- "and $0x1fffe, %%rax \n"
+- "imul %%rax, %%r14 \n"
+- "xor $0x1fffe, %%rax \n"
+- "imul %%rax, %%r13 \n"
+- "add %%r14, %%r13 \n"
+- "shr $17, %%r13 \n"
+- "movq 4096(%5,%%r13,8), %%xmm1\n"
+-
+- "mov %%r11, %%rax \n"
+- "lea (%%r11,%6),%%r10\n"
+- "sar $0x10,%%r11\n"
+- "paddsw %%xmm1,%%xmm0\n"
+-
+- "movzb (%0, %%r11, 1), %%r13 \n"
+- "movzb 1(%0, %%r11, 1), %%r14 \n"
+- "and $0xffff, %%rax \n"
+- "imul %%rax, %%r14 \n"
+- "xor $0xffff, %%rax \n"
+- "imul %%rax, %%r13 \n"
+- "add %%r14, %%r13 \n"
+- "shr $16, %%r13 \n"
+- "movq (%5,%%r13,8),%%xmm1\n"
+-
+- "mov %%r10, %%rax \n"
+- "lea (%%r10,%6),%%r11\n"
+- "sar $0x10,%%r10\n"
+-
+- "movzb (%0,%%r10,1), %%r13 \n"
+- "movzb 1(%0,%%r10,1), %%r14 \n"
+- "and $0xffff, %%rax \n"
+- "imul %%rax, %%r14 \n"
+- "xor $0xffff, %%rax \n"
+- "imul %%rax, %%r13 \n"
+- "add %%r14, %%r13 \n"
+- "shr $16, %%r13 \n"
+- "movq (%5,%%r13,8),%%xmm2\n"
+-
+- "paddsw %%xmm0,%%xmm1\n"
+- "paddsw %%xmm0,%%xmm2\n"
+- "shufps $0x44,%%xmm2,%%xmm1\n"
+- "psraw $0x6,%%xmm1\n"
+- "packuswb %%xmm1,%%xmm1\n"
+- "movq %%xmm1,0x0(%3)\n"
+- "add $0x8,%3\n"
+- "sub $0x2,%4\n"
+- "jns .lscaleloop\n"
+-
+-".lscalenext:"
+- "add $0x1,%4\n"
+- "js .lscaledone\n"
+-
+- "mov %%r11,%%r10\n"
+- "sar $0x11,%%r10\n"
+-
+- "movzb (%1,%%r10,1), %%r13 \n"
+- "movq 2048(%5,%%r13,8),%%xmm0\n"
+-
+- "movzb (%2,%%r10,1), %%r13 \n"
+- "movq 4096(%5,%%r13,8),%%xmm1\n"
+-
+- "paddsw %%xmm1,%%xmm0\n"
+- "sar $0x10,%%r11\n"
+-
+- "movzb (%0,%%r11,1), %%r13 \n"
+- "movq (%5,%%r13,8),%%xmm1\n"
+-
+- "paddsw %%xmm0,%%xmm1\n"
+- "psraw $0x6,%%xmm1\n"
+- "packuswb %%xmm1,%%xmm1\n"
+- "movd %%xmm1,0x0(%3)\n"
+-
+-".lscaledone:"
+- :
+- : "r"(y_buf), // %0
+- "r"(u_buf), // %1
+- "r"(v_buf), // %2
+- "r"(rgb_buf), // %3
+- "r"(width), // %4
+- "r" (kCoefficientsRgbY), // %5
+- "r"(static_cast<long>(source_dx)) // %6
+- : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
+-);
+-}
+-
+-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
+-
+-// PIC version is slower because less registers are available, so
+-// non-PIC is used on platforms where it is possible.
+-
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width);
+- asm(
+- ".text\n"
+- ".global FastConvertYUVToRGB32Row\n"
+-"FastConvertYUVToRGB32Row:\n"
+- "pusha\n"
+- "mov 0x24(%esp),%edx\n"
+- "mov 0x28(%esp),%edi\n"
+- "mov 0x2c(%esp),%esi\n"
+- "mov 0x30(%esp),%ebp\n"
+- "mov 0x34(%esp),%ecx\n"
+- "jmp convertend\n"
+-
+-"convertloop:"
+- "movzbl (%edi),%eax\n"
+- "add $0x1,%edi\n"
+- "movzbl (%esi),%ebx\n"
+- "add $0x1,%esi\n"
+- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+- "movzbl (%edx),%eax\n"
+- "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+- "movzbl 0x1(%edx),%ebx\n"
+- "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+- "add $0x2,%edx\n"
+- "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
+- "paddsw %mm0,%mm1\n"
+- "paddsw %mm0,%mm2\n"
+- "psraw $0x6,%mm1\n"
+- "psraw $0x6,%mm2\n"
+- "packuswb %mm2,%mm1\n"
+- "movntq %mm1,0x0(%ebp)\n"
+- "add $0x8,%ebp\n"
+-"convertend:"
+- "sub $0x2,%ecx\n"
+- "jns convertloop\n"
+-
+- "and $0x1,%ecx\n"
+- "je convertdone\n"
+-
+- "movzbl (%edi),%eax\n"
+- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+- "movzbl (%esi),%eax\n"
+- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+- "movzbl (%edx),%eax\n"
+- "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+- "paddsw %mm0,%mm1\n"
+- "psraw $0x6,%mm1\n"
+- "packuswb %mm1,%mm1\n"
+- "movd %mm1,0x0(%ebp)\n"
+-"convertdone:"
+- "popa\n"
+- "ret\n"
+-);
+-
+-
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx);
+- asm(
+- ".text\n"
+- ".global ScaleYUVToRGB32Row\n"
+-"ScaleYUVToRGB32Row:\n"
+- "pusha\n"
+- "mov 0x24(%esp),%edx\n"
+- "mov 0x28(%esp),%edi\n"
+- "mov 0x2c(%esp),%esi\n"
+- "mov 0x30(%esp),%ebp\n"
+- "mov 0x34(%esp),%ecx\n"
+- "xor %ebx,%ebx\n"
+- "jmp scaleend\n"
+-
+-"scaleloop:"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%edi,%eax,1),%eax\n"
+- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%esi,%eax,1),%eax\n"
+- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%eax\n"
+- "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%eax\n"
+- "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
+- "paddsw %mm0,%mm1\n"
+- "paddsw %mm0,%mm2\n"
+- "psraw $0x6,%mm1\n"
+- "psraw $0x6,%mm2\n"
+- "packuswb %mm2,%mm1\n"
+- "movntq %mm1,0x0(%ebp)\n"
+- "add $0x8,%ebp\n"
+-"scaleend:"
+- "sub $0x2,%ecx\n"
+- "jns scaleloop\n"
+-
+- "and $0x1,%ecx\n"
+- "je scaledone\n"
+-
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%edi,%eax,1),%eax\n"
+- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%esi,%eax,1),%eax\n"
+- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%eax\n"
+- "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+- "paddsw %mm0,%mm1\n"
+- "psraw $0x6,%mm1\n"
+- "packuswb %mm1,%mm1\n"
+- "movd %mm1,0x0(%ebp)\n"
+-
+-"scaledone:"
+- "popa\n"
+- "ret\n"
+-);
+-
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx);
+- asm(
+- ".text\n"
+- ".global LinearScaleYUVToRGB32Row\n"
+-"LinearScaleYUVToRGB32Row:\n"
+- "pusha\n"
+- "mov 0x24(%esp),%edx\n"
+- "mov 0x28(%esp),%edi\n"
+- "mov 0x30(%esp),%ebp\n"
+-
+- // source_width = width * source_dx + ebx
+- "mov 0x34(%esp), %ecx\n"
+- "imull 0x38(%esp), %ecx\n"
+- "mov %ecx, 0x34(%esp)\n"
+-
+- "mov 0x38(%esp), %ecx\n"
+- "xor %ebx,%ebx\n" // x = 0
+- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+- "jl .lscaleend\n"
+- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+- "jmp .lscaleend\n"
+-
+-".lscaleloop:"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+-
+- "movzbl (%edi,%eax,1),%ecx\n"
+- "movzbl 1(%edi,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "andl $0x1fffe, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0x1fffe, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $17, %ecx \n"
+- "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
+-
+- "mov 0x2c(%esp),%esi\n"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+-
+- "movzbl (%esi,%eax,1),%ecx\n"
+- "movzbl 1(%esi,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "andl $0x1fffe, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0x1fffe, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $17, %ecx \n"
+- "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
+-
+- "mov %ebx,%eax\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%ecx\n"
+- "movzbl 1(%edx,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "andl $0xffff, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0xffff, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $16, %ecx \n"
+- "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
+-
+- "cmp 0x34(%esp), %ebx\n"
+- "jge .lscalelastpixel\n"
+-
+- "mov %ebx,%eax\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%ecx\n"
+- "movzbl 1(%edx,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "andl $0xffff, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0xffff, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $16, %ecx \n"
+- "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
+-
+- "paddsw %mm0,%mm1\n"
+- "paddsw %mm0,%mm2\n"
+- "psraw $0x6,%mm1\n"
+- "psraw $0x6,%mm2\n"
+- "packuswb %mm2,%mm1\n"
+- "movntq %mm1,0x0(%ebp)\n"
+- "add $0x8,%ebp\n"
+-
+-".lscaleend:"
+- "cmp 0x34(%esp), %ebx\n"
+- "jl .lscaleloop\n"
+- "popa\n"
+- "ret\n"
+-
+-".lscalelastpixel:"
+- "paddsw %mm0, %mm1\n"
+- "psraw $6, %mm1\n"
+- "packuswb %mm1, %mm1\n"
+- "movd %mm1, (%ebp)\n"
+- "popa\n"
+- "ret\n"
+-);
+-
+-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
+-
+-extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int16 *kCoefficientsRgbY);
+- asm(
+- ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICConvertYUVToRGB32Row:\n"
+-#else
+-"PICConvertYUVToRGB32Row:\n"
+-#endif
+- "pusha\n"
+- "mov 0x24(%esp),%edx\n"
+- "mov 0x28(%esp),%edi\n"
+- "mov 0x2c(%esp),%esi\n"
+- "mov 0x30(%esp),%ebp\n"
+- "mov 0x38(%esp),%ecx\n"
+-
+- "jmp .Lconvertend\n"
+-
+-".Lconvertloop:"
+- "movzbl (%edi),%eax\n"
+- "add $0x1,%edi\n"
+- "movzbl (%esi),%ebx\n"
+- "add $0x1,%esi\n"
+- "movq 2048(%ecx,%eax,8),%mm0\n"
+- "movzbl (%edx),%eax\n"
+- "paddsw 4096(%ecx,%ebx,8),%mm0\n"
+- "movzbl 0x1(%edx),%ebx\n"
+- "movq 0(%ecx,%eax,8),%mm1\n"
+- "add $0x2,%edx\n"
+- "movq 0(%ecx,%ebx,8),%mm2\n"
+- "paddsw %mm0,%mm1\n"
+- "paddsw %mm0,%mm2\n"
+- "psraw $0x6,%mm1\n"
+- "psraw $0x6,%mm2\n"
+- "packuswb %mm2,%mm1\n"
+- "movntq %mm1,0x0(%ebp)\n"
+- "add $0x8,%ebp\n"
+-".Lconvertend:"
+- "subl $0x2,0x34(%esp)\n"
+- "jns .Lconvertloop\n"
+-
+- "andl $0x1,0x34(%esp)\n"
+- "je .Lconvertdone\n"
+-
+- "movzbl (%edi),%eax\n"
+- "movq 2048(%ecx,%eax,8),%mm0\n"
+- "movzbl (%esi),%eax\n"
+- "paddsw 4096(%ecx,%eax,8),%mm0\n"
+- "movzbl (%edx),%eax\n"
+- "movq 0(%ecx,%eax,8),%mm1\n"
+- "paddsw %mm0,%mm1\n"
+- "psraw $0x6,%mm1\n"
+- "packuswb %mm1,%mm1\n"
+- "movd %mm1,0x0(%ebp)\n"
+-".Lconvertdone:\n"
+- "popa\n"
+- "ret\n"
+-);
+-
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width) {
+- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx,
+- int16 *kCoefficientsRgbY);
+-
+- asm(
+- ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICScaleYUVToRGB32Row:\n"
+-#else
+-"PICScaleYUVToRGB32Row:\n"
+-#endif
+- "pusha\n"
+- "mov 0x24(%esp),%edx\n"
+- "mov 0x28(%esp),%edi\n"
+- "mov 0x2c(%esp),%esi\n"
+- "mov 0x30(%esp),%ebp\n"
+- "mov 0x3c(%esp),%ecx\n"
+- "xor %ebx,%ebx\n"
+- "jmp Lscaleend\n"
+-
+-"Lscaleloop:"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%edi,%eax,1),%eax\n"
+- "movq 2048(%ecx,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%esi,%eax,1),%eax\n"
+- "paddsw 4096(%ecx,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%eax\n"
+- "movq 0(%ecx,%eax,8),%mm1\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%eax\n"
+- "movq 0(%ecx,%eax,8),%mm2\n"
+- "paddsw %mm0,%mm1\n"
+- "paddsw %mm0,%mm2\n"
+- "psraw $0x6,%mm1\n"
+- "psraw $0x6,%mm2\n"
+- "packuswb %mm2,%mm1\n"
+- "movntq %mm1,0x0(%ebp)\n"
+- "add $0x8,%ebp\n"
+-"Lscaleend:"
+- "subl $0x2,0x34(%esp)\n"
+- "jns Lscaleloop\n"
+-
+- "andl $0x1,0x34(%esp)\n"
+- "je Lscaledone\n"
+-
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%edi,%eax,1),%eax\n"
+- "movq 2048(%ecx,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+- "movzbl (%esi,%eax,1),%eax\n"
+- "paddsw 4096(%ecx,%eax,8),%mm0\n"
+- "mov %ebx,%eax\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%eax\n"
+- "movq 0(%ecx,%eax,8),%mm1\n"
+- "paddsw %mm0,%mm1\n"
+- "psraw $0x6,%mm1\n"
+- "packuswb %mm1,%mm1\n"
+- "movd %mm1,0x0(%ebp)\n"
+-
+-"Lscaledone:"
+- "popa\n"
+- "ret\n"
+-);
+-
+-
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
+- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx,
+- int16 *kCoefficientsRgbY);
+- asm(
+- ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICLinearScaleYUVToRGB32Row:\n"
+-#else
+-"PICLinearScaleYUVToRGB32Row:\n"
+-#endif
+- "pusha\n"
+- "mov 0x24(%esp),%edx\n"
+- "mov 0x30(%esp),%ebp\n"
+- "mov 0x34(%esp),%ecx\n"
+- "mov 0x3c(%esp),%edi\n"
+- "xor %ebx,%ebx\n"
+-
+- // source_width = width * source_dx + ebx
+- "mov 0x34(%esp), %ecx\n"
+- "imull 0x38(%esp), %ecx\n"
+- "mov %ecx, 0x34(%esp)\n"
+-
+- "mov 0x38(%esp), %ecx\n"
+- "xor %ebx,%ebx\n" // x = 0
+- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+- "jl .lscaleend\n"
+- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+- "jmp .lscaleend\n"
+-
+-".lscaleloop:"
+- "mov 0x28(%esp),%esi\n"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+-
+- "movzbl (%esi,%eax,1),%ecx\n"
+- "movzbl 1(%esi,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "andl $0x1fffe, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0x1fffe, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $17, %ecx \n"
+- "movq 2048(%edi,%ecx,8),%mm0\n"
+-
+- "mov 0x2c(%esp),%esi\n"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
+-
+- "movzbl (%esi,%eax,1),%ecx\n"
+- "movzbl 1(%esi,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "andl $0x1fffe, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0x1fffe, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $17, %ecx \n"
+- "paddsw 4096(%edi,%ecx,8),%mm0\n"
+-
+- "mov %ebx,%eax\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%ecx\n"
+- "movzbl 1(%edx,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "andl $0xffff, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0xffff, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $16, %ecx \n"
+- "movq (%edi,%ecx,8),%mm1\n"
+-
+- "cmp 0x34(%esp), %ebx\n"
+- "jge .lscalelastpixel\n"
+-
+- "mov %ebx,%eax\n"
+- "sar $0x10,%eax\n"
+- "movzbl (%edx,%eax,1),%ecx\n"
+- "movzbl 1(%edx,%eax,1),%esi\n"
+- "mov %ebx,%eax\n"
+- "add 0x38(%esp),%ebx\n"
+- "andl $0xffff, %eax \n"
+- "imul %eax, %esi \n"
+- "xorl $0xffff, %eax \n"
+- "imul %eax, %ecx \n"
+- "addl %esi, %ecx \n"
+- "shrl $16, %ecx \n"
+- "movq (%edi,%ecx,8),%mm2\n"
+-
+- "paddsw %mm0,%mm1\n"
+- "paddsw %mm0,%mm2\n"
+- "psraw $0x6,%mm1\n"
+- "psraw $0x6,%mm2\n"
+- "packuswb %mm2,%mm1\n"
+- "movntq %mm1,0x0(%ebp)\n"
+- "add $0x8,%ebp\n"
+-
+-".lscaleend:"
+- "cmp %ebx, 0x34(%esp)\n"
+- "jg .lscaleloop\n"
+- "popa\n"
+- "ret\n"
+-
+-".lscalelastpixel:"
+- "paddsw %mm0, %mm1\n"
+- "psraw $6, %mm1\n"
+- "packuswb %mm1, %mm1\n"
+- "movd %mm1, (%ebp)\n"
+- "popa\n"
+- "ret\n"
+-);
+-
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
+- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-#else // USE_MMX
+-
+ // C reference code that mimics the YUV assembly.
+ #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+ #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+ (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+
+ static inline void YuvPixel(uint8 y,
+ uint8 u,
+ uint8 v,
+@@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
+ a >>= 6;
+
+ *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
+ (packuswb(g) << 8) |
+ (packuswb(r) << 16) |
+ (packuswb(a) << 24);
+ }
+
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width) {
++void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ unsigned int x_shift) {
+ for (int x = 0; x < width; x += 2) {
+- uint8 u = u_buf[x >> 1];
+- uint8 v = v_buf[x >> 1];
++ uint8 u = u_buf[x >> x_shift];
++ uint8 v = v_buf[x >> x_shift];
+ uint8 y0 = y_buf[x];
+ YuvPixel(y0, u, v, rgb_buf);
+ if ((x + 1) < width) {
+ uint8 y1 = y_buf[x + 1];
++ if (x_shift == 0) {
++ u = u_buf[x + 1];
++ v = v_buf[x + 1];
++ }
+ YuvPixel(y1, u, v, rgb_buf + 4);
+ }
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ }
+
+ // 16.16 fixed point is used. A shift by 16 isolates the integer.
+ // A shift by 17 is used to further subsample the chrominance channels.
+ // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+ // for 1/65536 pixel accurate interpolation.
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
+ int x = 0;
+ for (int i = 0; i < width; i += 2) {
+ int y = y_buf[x >> 16];
+ int u = u_buf[(x >> 17)];
+ int v = v_buf[(x >> 17)];
+ YuvPixel(y, u, v, rgb_buf);
+ x += source_dx;
+ if ((i + 1) < width) {
+ y = y_buf[x >> 16];
+ YuvPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+ }
+
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
+ int x = 0;
+ if (source_dx >= 0x20000) {
+ x = 32768;
+ }
+ for (int i = 0; i < width; i += 2) {
+ int y0 = y_buf[x >> 16];
+ int y1 = y_buf[(x >> 16) + 1];
+ int u0 = u_buf[(x >> 17)];
+@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
+ y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+ YuvPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+ }
+
+-#endif // USE_MMX
+ } // extern "C"
+
+diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
+--- a/gfx/ycbcr/yuv_row_posix.cpp
++++ b/gfx/ycbcr/yuv_row_posix.cpp
+@@ -1,33 +1,32 @@
+ // Copyright (c) 2010 The Chromium Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+
+-#include "media/base/yuv_row.h"
+-
+-#ifdef _DEBUG
+-#include "base/logging.h"
+-#else
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
++
+ #define DCHECK(a)
+-#endif
+
+ extern "C" {
+
+-#if USE_SSE2 && defined(ARCH_CPU_X86_64)
++#if defined(ARCH_CPU_X86_64)
++
++// We don't need CPUID guards here, since x86-64 implies SSE2.
+
+ // AMD64 ABI uses register parameters.
+ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm(
+- "jmp convertend\n"
+-"convertloop:"
++ "jmp 1f\n"
++"0:"
+ "movzb (%1),%%r10\n"
+ "add $0x1,%1\n"
+ "movzb (%2),%%r11\n"
+ "add $0x1,%2\n"
+ "movq 2048(%5,%%r10,8),%%xmm0\n"
+ "movzb (%0),%%r10\n"
+ "movq 4096(%5,%%r11,8),%%xmm1\n"
+ "movzb 0x1(%0),%%r11\n"
+@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
+ "movq (%5,%%r11,8),%%xmm3\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "paddsw %%xmm0,%%xmm3\n"
+ "shufps $0x44,%%xmm3,%%xmm2\n"
+ "psraw $0x6,%%xmm2\n"
+ "packuswb %%xmm2,%%xmm2\n"
+ "movq %%xmm2,0x0(%3)\n"
+ "add $0x8,%3\n"
+-"convertend:"
++"1:"
+ "sub $0x2,%4\n"
+- "jns convertloop\n"
+-
+-"convertnext:"
++ "jns 0b\n"
++
++"2:"
+ "add $0x1,%4\n"
+- "js convertdone\n"
++ "js 3f\n"
+
+ "movzb (%1),%%r10\n"
+ "movq 2048(%5,%%r10,8),%%xmm0\n"
+ "movzb (%2),%%r10\n"
+ "movq 4096(%5,%%r10,8),%%xmm1\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "movzb (%0),%%r10\n"
+ "movq (%5,%%r10,8),%%xmm1\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%3)\n"
+-"convertdone:"
++"3:"
+ :
+ : "r"(y_buf), // %0
+ "r"(u_buf), // %1
+ "r"(v_buf), // %2
+ "r"(rgb_buf), // %3
+ "r"(width), // %4
+ "r" (kCoefficientsRgbY) // %5
+ : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width, // r8
+ int source_dx) { // r9
+ asm(
+ "xor %%r11,%%r11\n"
+ "sub $0x2,%4\n"
+- "js scalenext\n"
+-
+-"scaleloop:"
++ "js 1f\n"
++
++"0:"
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+ "movzb (%1,%%r10,1),%%rax\n"
+ "movq 2048(%5,%%rax,8),%%xmm0\n"
+ "movzb (%2,%%r10,1),%%rax\n"
+ "movq 4096(%5,%%rax,8),%%xmm1\n"
+ "lea (%%r11,%6),%%r10\n"
+ "sar $0x10,%%r11\n"
+@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+ "paddsw %%xmm0,%%xmm1\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "shufps $0x44,%%xmm2,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movq %%xmm1,0x0(%3)\n"
+ "add $0x8,%3\n"
+ "sub $0x2,%4\n"
+- "jns scaleloop\n"
+-
+-"scalenext:"
++ "jns 0b\n"
++
++"1:"
+ "add $0x1,%4\n"
+- "js scaledone\n"
++ "js 2f\n"
+
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+ "movzb (%1,%%r10,1),%%rax\n"
+ "movq 2048(%5,%%rax,8),%%xmm0\n"
+ "movzb (%2,%%r10,1),%%rax\n"
+ "movq 4096(%5,%%rax,8),%%xmm1\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "sar $0x10,%%r11\n"
+ "movzb (%0,%%r11,1),%%rax\n"
+ "movq (%5,%%rax,8),%%xmm1\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%3)\n"
+
+-"scaledone:"
++"2:"
+ :
+ : "r"(y_buf), // %0
+ "r"(u_buf), // %1
+ "r"(v_buf), // %2
+ "r"(rgb_buf), // %3
+ "r"(width), // %4
+ "r" (kCoefficientsRgbY), // %5
+ "r"(static_cast<long>(source_dx)) // %6
+@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ asm(
+ "xor %%r11,%%r11\n" // x = 0
+ "sub $0x2,%4\n"
+- "js .lscalenext\n"
++ "js 2f\n"
+ "cmp $0x20000,%6\n" // if source_dx >= 2.0
+- "jl .lscalehalf\n"
++ "jl 0f\n"
+ "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
+-".lscalehalf:"
+-
+-".lscaleloop:"
++"0:"
++
++"1:"
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+
+ "movzb (%1, %%r10, 1), %%r13 \n"
+ "movzb 1(%1, %%r10, 1), %%r14 \n"
+ "mov %%r11, %%rax \n"
+ "and $0x1fffe, %%rax \n"
+ "imul %%rax, %%r14 \n"
+@@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
+ "paddsw %%xmm0,%%xmm1\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "shufps $0x44,%%xmm2,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movq %%xmm1,0x0(%3)\n"
+ "add $0x8,%3\n"
+ "sub $0x2,%4\n"
+- "jns .lscaleloop\n"
+-
+-".lscalenext:"
++ "jns 1b\n"
++
++"2:"
+ "add $0x1,%4\n"
+- "js .lscaledone\n"
++ "js 3f\n"
+
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+
+ "movzb (%1,%%r10,1), %%r13 \n"
+ "movq 2048(%5,%%r13,8),%%xmm0\n"
+
+ "movzb (%2,%%r10,1), %%r13 \n"
+@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
+ "movzb (%0,%%r11,1), %%r13 \n"
+ "movq (%5,%%r13,8),%%xmm1\n"
+
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%3)\n"
+
+-".lscaledone:"
++"3:"
+ :
+ : "r"(y_buf), // %0
+ "r"(u_buf), // %1
+ "r"(v_buf), // %2
+ "r"(rgb_buf), // %3
+ "r"(width), // %4
+ "r" (kCoefficientsRgbY), // %5
+ "r"(static_cast<long>(source_dx)) // %6
+ : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
+ );
+ }
+
+-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
+
+ // PIC version is slower because fewer registers are available, so
+ // non-PIC is used on platforms where it is possible.
+-
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width);
++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width);
+ asm(
+ ".text\n"
+- ".global FastConvertYUVToRGB32Row\n"
+-"FastConvertYUVToRGB32Row:\n"
++ ".global FastConvertYUVToRGB32Row_SSE\n"
++ ".type FastConvertYUVToRGB32Row_SSE, @function\n"
++"FastConvertYUVToRGB32Row_SSE:\n"
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+- "jmp convertend\n"
+-
+-"convertloop:"
++ "jmp 1f\n"
++
++"0:"
+ "movzbl (%edi),%eax\n"
+ "add $0x1,%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "add $0x1,%esi\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
+ "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+-"convertend:"
++"1:"
+ "sub $0x2,%ecx\n"
+- "jns convertloop\n"
++ "jns 0b\n"
+
+ "and $0x1,%ecx\n"
+- "je convertdone\n"
++ "je 2f\n"
+
+ "movzbl (%edi),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "movzbl (%esi),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+-"convertdone:"
++"2:"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
+-
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx);
++void FastConvertYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width)
++{
++ if (mozilla::supports_sse()) {
++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
++ return;
++ }
++
++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
++
++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx);
+ asm(
+ ".text\n"
+- ".global ScaleYUVToRGB32Row\n"
+-"ScaleYUVToRGB32Row:\n"
++ ".global ScaleYUVToRGB32Row_SSE\n"
++ ".type ScaleYUVToRGB32Row_SSE, @function\n"
++"ScaleYUVToRGB32Row_SSE:\n"
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+ "xor %ebx,%ebx\n"
+- "jmp scaleend\n"
+-
+-"scaleloop:"
++ "jmp 1f\n"
++
++"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+ "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+-"scaleend:"
++"1:"
+ "sub $0x2,%ecx\n"
+- "jns scaleloop\n"
++ "jns 0b\n"
+
+ "and $0x1,%ecx\n"
+- "je scaledone\n"
++ "je 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+@@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+
+-"scaledone:"
++"2:"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx);
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx)
++{
++ if (mozilla::supports_sse()) {
++ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
++ width, source_dx);
++ return;
++ }
++
++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
++ width, source_dx);
++}
++
++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx);
+ asm(
+ ".text\n"
+- ".global LinearScaleYUVToRGB32Row\n"
+-"LinearScaleYUVToRGB32Row:\n"
++ ".global LinearScaleYUVToRGB32Row_SSE\n"
++ ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
++"LinearScaleYUVToRGB32Row_SSE:\n"
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x30(%esp),%ebp\n"
+
+ // source_width = width * source_dx + ebx
+ "mov 0x34(%esp), %ecx\n"
+ "imull 0x38(%esp), %ecx\n"
+ "mov %ecx, 0x34(%esp)\n"
+
+ "mov 0x38(%esp), %ecx\n"
+ "xor %ebx,%ebx\n" // x = 0
+ "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+- "jl .lscaleend\n"
++ "jl 1f\n"
+ "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+- "jmp .lscaleend\n"
+-
+-".lscaleloop:"
+- "mov %ebx,%eax\n"
+- "sar $0x11,%eax\n"
++ "jmp 1f\n"
++
++"0:"
++ "mov %ebx,%eax\n"
++ "sar $0x11,%eax\n"
+
+ "movzbl (%edi,%eax,1),%ecx\n"
+ "movzbl 1(%edi,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"
+ "imul %eax, %ecx \n"
+@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
+
+ "cmp 0x34(%esp), %ebx\n"
+- "jge .lscalelastpixel\n"
++ "jge 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "andl $0xffff, %eax \n"
+@@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+
+-".lscaleend:"
++"1:"
+ "cmp 0x34(%esp), %ebx\n"
+- "jl .lscaleloop\n"
++ "jl 0b\n"
+ "popa\n"
+ "ret\n"
+
+-".lscalelastpixel:"
++"2:"
+ "paddsw %mm0, %mm1\n"
+ "psraw $6, %mm1\n"
+ "packuswb %mm1, %mm1\n"
+ "movd %mm1, (%ebp)\n"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
+-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
+-
+-extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int16 *kCoefficientsRgbY);
++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx)
++{
++ if (mozilla::supports_sse()) {
++ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
++ width, source_dx);
++ return;
++ }
++
++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
++ width, source_dx);
++}
++
++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
++
++void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int16 *kCoefficientsRgbY);
++
+ asm(
+ ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICConvertYUVToRGB32Row:\n"
++#if defined(XP_MACOSX)
++"_PICConvertYUVToRGB32Row_SSE:\n"
+ #else
+-"PICConvertYUVToRGB32Row:\n"
++"PICConvertYUVToRGB32Row_SSE:\n"
+ #endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x38(%esp),%ecx\n"
+
+- "jmp .Lconvertend\n"
+-
+-".Lconvertloop:"
++ "jmp 1f\n"
++
++"0:"
+ "movzbl (%edi),%eax\n"
+ "add $0x1,%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "add $0x1,%esi\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw 4096(%ecx,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
+ "movq 0(%ecx,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+-".Lconvertend:"
++"1:"
+ "subl $0x2,0x34(%esp)\n"
+- "jns .Lconvertloop\n"
++ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"
+- "je .Lconvertdone\n"
++ "je 2f\n"
+
+ "movzbl (%edi),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "movzbl (%esi),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+-".Lconvertdone:\n"
++"2:"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+- int width) {
+- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
++ int width)
++{
++ if (mozilla::supports_sse()) {
++ PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++ &kCoefficientsRgbY[0][0]);
++ return;
++ }
++
++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
++void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx,
+ int16 *kCoefficientsRgbY);
+
+ asm(
+ ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICScaleYUVToRGB32Row:\n"
++#if defined(XP_MACOSX)
++"_PICScaleYUVToRGB32Row_SSE:\n"
+ #else
+-"PICScaleYUVToRGB32Row:\n"
++"PICScaleYUVToRGB32Row_SSE:\n"
+ #endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x3c(%esp),%ecx\n"
+ "xor %ebx,%ebx\n"
+- "jmp Lscaleend\n"
+-
+-"Lscaleloop:"
++ "jmp 1f\n"
++
++"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
+ "movq 0(%ecx,%eax,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+-"Lscaleend:"
++"1:"
+ "subl $0x2,0x34(%esp)\n"
+- "jns Lscaleloop\n"
++ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"
+- "je Lscaledone\n"
++ "je 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+
+-"Lscaledone:"
++"2:"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
+-
+ void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+- int source_dx) {
+- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx,
+- int16 *kCoefficientsRgbY);
++ int source_dx)
++{
++ if (mozilla::supports_sse()) {
++ PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
++ &kCoefficientsRgbY[0][0]);
++ return;
++ }
++
++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx,
++ int16 *kCoefficientsRgbY);
++
+ asm(
+ ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICLinearScaleYUVToRGB32Row:\n"
++#if defined(XP_MACOSX)
++"_PICLinearScaleYUVToRGB32Row_SSE:\n"
+ #else
+-"PICLinearScaleYUVToRGB32Row:\n"
++"PICLinearScaleYUVToRGB32Row_SSE:\n"
+ #endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+ "mov 0x3c(%esp),%edi\n"
+ "xor %ebx,%ebx\n"
+
+ // source_width = width * source_dx + ebx
+ "mov 0x34(%esp), %ecx\n"
+ "imull 0x38(%esp), %ecx\n"
+ "mov %ecx, 0x34(%esp)\n"
+
+ "mov 0x38(%esp), %ecx\n"
+ "xor %ebx,%ebx\n" // x = 0
+ "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+- "jl .lscaleend\n"
++ "jl 1f\n"
+ "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+- "jmp .lscaleend\n"
+-
+-".lscaleloop:"
++ "jmp 1f\n"
++
++"0:"
+ "mov 0x28(%esp),%esi\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%esi,%eax,1),%ecx\n"
+ "movzbl 1(%esi,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq (%edi,%ecx,8),%mm1\n"
+
+ "cmp 0x34(%esp), %ebx\n"
+- "jge .lscalelastpixel\n"
++ "jge 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "andl $0xffff, %eax \n"
+@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+
+-".lscaleend:"
++"1:"
+ "cmp %ebx, 0x34(%esp)\n"
+- "jg .lscaleloop\n"
++ "jg 0b\n"
+ "popa\n"
+ "ret\n"
+
+-".lscalelastpixel:"
++"2:"
+ "paddsw %mm0, %mm1\n"
+ "psraw $6, %mm1\n"
+ "packuswb %mm1, %mm1\n"
+ "movd %mm1, (%ebp)\n"
+ "popa\n"
+ "ret\n"
++#if !defined(XP_MACOSX)
++ ".previous\n"
++#endif
+ );
+
++
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
+- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+- &kCoefficientsRgbY[0][0]);
+-}
+-
+-#else // USE_MMX
+-
+-// C reference code that mimic the YUV assembly.
+-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+- (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+-
+-static inline void YuvPixel(uint8 y,
+- uint8 u,
+- uint8 v,
+- uint8* rgb_buf) {
+-
+- int b = kCoefficientsRgbY[256+u][0];
+- int g = kCoefficientsRgbY[256+u][1];
+- int r = kCoefficientsRgbY[256+u][2];
+- int a = kCoefficientsRgbY[256+u][3];
+-
+- b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+- g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+- r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+- a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+-
+- b = paddsw(b, kCoefficientsRgbY[y][0]);
+- g = paddsw(g, kCoefficientsRgbY[y][1]);
+- r = paddsw(r, kCoefficientsRgbY[y][2]);
+- a = paddsw(a, kCoefficientsRgbY[y][3]);
+-
+- b >>= 6;
+- g >>= 6;
+- r >>= 6;
+- a >>= 6;
+-
+- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
+- (packuswb(g) << 8) |
+- (packuswb(r) << 16) |
+- (packuswb(a) << 24);
+-}
+-
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx)
++{
++ if (mozilla::supports_sse()) {
++ PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++ source_dx, &kCoefficientsRgbY[0][0]);
++ return;
++ }
++
++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++#else
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+- for (int x = 0; x < width; x += 2) {
+- uint8 u = u_buf[x >> 1];
+- uint8 v = v_buf[x >> 1];
+- uint8 y0 = y_buf[x];
+- YuvPixel(y0, u, v, rgb_buf);
+- if ((x + 1) < width) {
+- uint8 y1 = y_buf[x + 1];
+- YuvPixel(y1, u, v, rgb_buf + 4);
+- }
+- rgb_buf += 8; // Advance 2 pixels.
+- }
+-}
+-
+-// 16.16 fixed point is used. A shift by 16 isolates the integer.
+-// A shift by 17 is used to further subsample the chrominence channels.
+-// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+-// for 1/65536 pixel accurate interpolation.
++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
+ void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- for (int i = 0; i < width; i += 2) {
+- int y = y_buf[x >> 16];
+- int u = u_buf[(x >> 17)];
+- int v = v_buf[(x >> 17)];
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y = y_buf[x >> 16];
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
+
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- if (source_dx >= 0x20000) {
+- x = 32768;
+- }
+- for (int i = 0; i < width; i += 2) {
+- int y0 = y_buf[x >> 16];
+- int y1 = y_buf[(x >> 16) + 1];
+- int u0 = u_buf[(x >> 17)];
+- int u1 = u_buf[(x >> 17) + 1];
+- int v0 = v_buf[(x >> 17)];
+- int v1 = v_buf[(x >> 17) + 1];
+- int y_frac = (x & 65535);
+- int uv_frac = ((x >> 1) & 65535);
+- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y0 = y_buf[x >> 16];
+- y1 = y_buf[(x >> 16) + 1];
+- y_frac = (x & 65535);
+- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
+-
+-#endif // USE_MMX
+-} // extern "C"
+-
++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++#endif
++
++}
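
Every wrapper in this file now has the same shape: a runtime CPU capability
check, the assembly fast path, and the _C fallback. A self-contained sketch of
that dispatch pattern (supports_fast_path, row_fast and row_c are hypothetical
stand-ins for mozilla::supports_sse() and the _SSE/_C row implementations):

#include <cstdint>
#include <cstdio>

// Stand-in for mozilla::supports_sse(); a real build queries CPUID once
// and caches the result.
static bool supports_fast_path() { return false; }

static void row_fast(const uint8_t*, uint8_t*, int width) {
  std::printf("fast path, %d pixels\n", width);
}
static void row_c(const uint8_t*, uint8_t*, int width) {
  std::printf("C fallback, %d pixels\n", width);
}

// Same shape as the wrappers above: the early return guarantees exactly
// one implementation runs per call.
static void row(const uint8_t* src, uint8_t* dst, int width) {
  if (supports_fast_path()) {
    row_fast(src, dst, width);
    return;
  }
  row_c(src, dst, width);
}

int main() {
  uint8_t src[16] = {0};
  uint8_t dst[16] = {0};
  row(src, dst, 4);
  return 0;
}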
+diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
+--- a/gfx/ycbcr/yuv_row_table.cpp
++++ b/gfx/ycbcr/yuv_row_table.cpp
+@@ -1,13 +1,13 @@
+ // Copyright (c) 2010 The Chromium Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+
+-#include "media/base/yuv_row.h"
++#include "yuv_row.h"
+
+ extern "C" {
+
+ #define RGBY(i) { \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ 0 \
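
The RGBY() entries above are BT.601 luma terms pre-scaled by 64 (six
fractional bits), which is why every row kernel in this patch shifts its
accumulated sum right by 6 before packing. A quick numeric check of that
encoding (the sample value 235, nominal video white, is chosen here purely
for illustration):

#include <cstdint>
#include <cstdio>

int main() {
  const int i = 235;  // nominal 8-bit video white
  int16_t entry = static_cast<int16_t>(1.164 * 64 * (i - 16) + 0.5);
  std::printf("table entry: %d\n", entry);           // 16315
  std::printf("recovered value: %d\n", entry >> 6);  // 254, i.e. ~255 after packing
  return 0;
}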
+diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
+--- a/gfx/ycbcr/yuv_row_win.cpp
++++ b/gfx/ycbcr/yuv_row_win.cpp
+@@ -1,26 +1,27 @@
+ // Copyright (c) 2010 The Chromium Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+
+-#include "media/base/yuv_row.h"
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
+
+ #define kCoefficientsRgbU kCoefficientsRgbY + 2048
+ #define kCoefficientsRgbV kCoefficientsRgbY + 4096
+
+ extern "C" {
+
+-#if USE_MMX
+-__declspec(naked)
+-void FastConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width) {
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++__declspec(naked)
++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp convertend
+@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
+ convertdone :
+
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void ConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int step) {
++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int step) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ mov ebx, [esp + 32 + 24] // step
+@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
+ wdone :
+
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int ystep,
+- int uvstep) {
++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int ystep,
++ int uvstep) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
+ wdone :
+
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void DoubleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width) {
++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
+ jns wloop1
+ wdone :
+ popad
+ ret
+ }
+ }
+
+ // This version does general purpose scaling by any amount, up or down.
+-// The only thing it can not do it rotation by 90 or 270.
+-// For performance the chroma is under sampled, reducing cost of a 3x
++// The only thing it cannot do is rotation by 90 or 270.
++// For performance the chroma is under-sampled, reducing cost of a 3x
+ // 1080p scale from 8.4 ms to 5.4 ms.
+ __declspec(naked)
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ xor ebx, ebx // x
+@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+
+ scaledone :
+ popad
+ ret
+ }
+ }
+
+ __declspec(naked)
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+- const uint8* u_buf,
+- const uint8* v_buf,
+- uint8* rgb_buf,
+- int width,
+- int source_dx) {
++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ // [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ imul ecx, [esp + 32 + 24] // source_dx
+@@ -438,152 +439,60 @@ lscalelastpixel:
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ popad
+ ret
+ };
+ }
+-#else // USE_MMX
+-
+-// C reference code that mimic the YUV assembly.
+-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+- (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+-
+-static inline void YuvPixel(uint8 y,
+- uint8 u,
+- uint8 v,
+- uint8* rgb_buf) {
+-
+- int b = kCoefficientsRgbY[256+u][0];
+- int g = kCoefficientsRgbY[256+u][1];
+- int r = kCoefficientsRgbY[256+u][2];
+- int a = kCoefficientsRgbY[256+u][3];
+-
+- b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+- g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+- r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+- a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+-
+- b = paddsw(b, kCoefficientsRgbY[y][0]);
+- g = paddsw(g, kCoefficientsRgbY[y][1]);
+- r = paddsw(r, kCoefficientsRgbY[y][2]);
+- a = paddsw(a, kCoefficientsRgbY[y][3]);
+-
+- b >>= 6;
+- g >>= 6;
+- r >>= 6;
+- a >>= 6;
+-
+- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
+- (packuswb(g) << 8) |
+- (packuswb(r) << 16) |
+- (packuswb(a) << 24);
+-}
+-
+-#if TEST_MMX_YUV
+-static inline void YuvPixel(uint8 y,
+- uint8 u,
+- uint8 v,
+- uint8* rgb_buf) {
+-
+- __asm {
+- movzx eax, u
+- movq mm0, [kCoefficientsRgbY+2048 + 8 * eax]
+- movzx eax, v
+- paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax]
+- movzx eax, y
+- movq mm1, [kCoefficientsRgbY + 8 * eax]
+- paddsw mm1, mm0
+- psraw mm1, 6
+- packuswb mm1, mm1
+- mov eax, rgb_buf
+- movd [eax], mm1
+- emms
+- }
+-}
+-#endif
++#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+- for (int x = 0; x < width; x += 2) {
+- uint8 u = u_buf[x >> 1];
+- uint8 v = v_buf[x >> 1];
+- uint8 y0 = y_buf[x];
+- YuvPixel(y0, u, v, rgb_buf);
+- if ((x + 1) < width) {
+- uint8 y1 = y_buf[x + 1];
+- YuvPixel(y1, u, v, rgb_buf + 4);
+- }
+- rgb_buf += 8; // Advance 2 pixels.
+- }
+-}
+-
+-// 16.16 fixed point is used. A shift by 16 isolates the integer.
+-// A shift by 17 is used to further subsample the chrominence channels.
+-// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+-// for 1/65536 pixel accurate interpolation.
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++ if (mozilla::supports_sse()) {
++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
++ return;
++ }
++#endif
++
++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
++}
++
+ void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- for (int i = 0; i < width; i += 2) {
+- int y = y_buf[x >> 16];
+- int u = u_buf[(x >> 17)];
+- int v = v_buf[(x >> 17)];
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y = y_buf[x >> 16];
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
++
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++ if (mozilla::supports_sse()) {
++ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++ return;
++ }
++#endif
++
++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
+
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+- int x = 0;
+- if (source_dx >= 0x20000) {
+- x = 32768;
+- }
+- for (int i = 0; i < width; i += 2) {
+- int y0 = y_buf[x >> 16];
+- int y1 = y_buf[(x >> 16) + 1];
+- int u0 = u_buf[(x >> 17)];
+- int u1 = u_buf[(x >> 17) + 1];
+- int v0 = v_buf[(x >> 17)];
+- int v1 = v_buf[(x >> 17) + 1];
+- int y_frac = (x & 65535);
+- int uv_frac = ((x >> 1) & 65535);
+- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+- YuvPixel(y, u, v, rgb_buf);
+- x += source_dx;
+- if ((i + 1) < width) {
+- y0 = y_buf[x >> 16];
+- y1 = y_buf[(x >> 16) + 1];
+- y_frac = (x & 65535);
+- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+- YuvPixel(y, u, v, rgb_buf+4);
+- x += source_dx;
+- }
+- rgb_buf += 8;
+- }
+-}
+-
+-#endif // USE_MMX
+-} // extern "C"
+-
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
++ if (mozilla::supports_sse()) {
++ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
++ source_dx);
++ return;
++ }
++#endif
++
++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++} // extern "C"
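
The kCoefficientsRgbU/kCoefficientsRgbV macros at the top of yuv_row_win.cpp,
like the 2048(...)/4096(...) addressing in the GCC assembly earlier, assume
the Y, U and V lookup tables are packed back to back: 256 entries of four
int16 components each, so 2048 bytes per sub-table. A small sketch of that
layout assumption (the table values are elided; only the offsets matter):

#include <cstdint>
#include <cstdio>

int16_t kCoefficientsRgbY[256 * 3][4];  // values omitted; layout only

int main() {
  const uint8_t u = 128;  // hypothetical chroma sample
  // Byte offset of the U sub-table: 256 entries * 4 components * 2 bytes.
  std::printf("U table starts at byte %zu\n", 256 * sizeof(kCoefficientsRgbY[0]));
  // Indexing entry 256 + u is the same as the asm's 2048(table, u, 8).
  const int16_t* u_entry = kCoefficientsRgbY[256 + u];
  std::printf("entry offset: %td\n",
              reinterpret_cast<const char*>(u_entry) -
              reinterpret_cast<const char*>(kCoefficientsRgbY));  // 2048 + 128*8
  return 0;
}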
diff --git a/gfx/ycbcr/moz.build b/gfx/ycbcr/moz.build
new file mode 100644
index 000000000..04855e2e9
--- /dev/null
+++ b/gfx/ycbcr/moz.build
@@ -0,0 +1,65 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+EXPORTS += [
+ 'YCbCrUtils.h',
+]
+
+UNIFIED_SOURCES += [
+ 'scale_yuv_argb.cpp',
+ 'ycbcr_to_rgb565.cpp',
+ 'YCbCrUtils.cpp',
+ 'yuv_convert.cpp',
+ 'yuv_row_c.cpp',
+ 'yuv_row_table.cpp',
+]
+
+if CONFIG['INTEL_ARCHITECTURE']:
+ # These files use MMX and SSE2 intrinsics, so they need special compile flags
+ # on some compilers.
+ SOURCES += ['yuv_convert_sse2.cpp']
+ SOURCES['yuv_convert_sse2.cpp'].flags += CONFIG['SSE2_FLAGS']
+
+ # MSVC doesn't support MMX when targeting AMD64.
+ if CONFIG['_MSC_VER']:
+ if CONFIG['OS_TEST'] != 'x86_64':
+ SOURCES += [
+ 'yuv_convert_mmx.cpp',
+ ]
+ else:
+ SOURCES += ['yuv_convert_mmx.cpp']
+ SOURCES['yuv_convert_mmx.cpp'].flags += CONFIG['MMX_FLAGS']
+
+if CONFIG['_MSC_VER']:
+ if CONFIG['OS_TEST'] == 'x86_64':
+ SOURCES += [
+ 'yuv_row_win64.cpp',
+ ]
+ else:
+ SOURCES += [
+ 'yuv_row_win.cpp',
+ ]
+elif CONFIG['OS_ARCH'] in ('Linux', 'SunOS', 'Darwin', 'DragonFly',
+ 'FreeBSD', 'NetBSD', 'OpenBSD'):
+ SOURCES += [
+ 'yuv_row_posix.cpp',
+ ]
+else:
+ SOURCES += [
+ 'yuv_row_other.cpp',
+ ]
+
+if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['HAVE_ARM_NEON']:
+ SOURCES += [
+ 'yuv_row_arm.s',
+ ]
+ SOURCES += [
+ 'yuv_convert_arm.cpp',
+ ]
+
+LOCAL_INCLUDES += ['/media/libyuv/include']
+
+FINAL_LIBRARY = 'xul'
diff --git a/gfx/ycbcr/scale_yuv_argb.cpp b/gfx/ycbcr/scale_yuv_argb.cpp
new file mode 100644
index 000000000..91a96cb9f
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.cpp
@@ -0,0 +1,1126 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ * Copyright 2016 Mozilla Foundation
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// YUV to RGB conversion and scaling functions were implemented by referencing
+// scale_argb.cc
+//
+// libyuv already has ScaleYUVToARGBBilinearUp(), but its implementation is
+// not complete yet. The implementations of these functions are based on it.
+// At first, ScaleYUVToARGBBilinearUp() was implemented by modifying the
+// libyuv one; then all the other functions were implemented similarly.
+//
+// The function relationships between yuv_convert.cpp and scale_argb.cc are
+// as follows:
+// - ScaleYUVToARGBDown2() <-- ScaleARGBDown2()
+// - ScaleYUVToARGBDownEven() <-- ScaleARGBDownEven()
+// - ScaleYUVToARGBBilinearDown() <-- ScaleARGBBilinearDown()
+// - ScaleYUVToARGBBilinearUp() <-- ScaleARGBBilinearUp() and ScaleYUVToARGBBilinearUp() in libyuv
+// - ScaleYUVToARGBSimple() <-- ScaleARGBSimple()
+// - ScaleYUVToARGB() <-- ScaleARGB() // Removed some function calls for simplicity.
+// - YUVToARGBScale() <-- ARGBScale()
+//
+// The calls to and selection of InterpolateRow() and ScaleARGBFilterCols()
+// were kept as close to the originals as possible.
+//
+// The following changes were made to each scaling function:
+//
+// -[1] Allocate a YUV conversion buffer and use it as the source buffer for
+//      scaling. Its usage is borrowed from libyuv's ScaleYUVToARGBBilinearUp().
+// -[2] Conversion from YUV to RGB was abstracted as YUVBuferIter, which
+//      handles multiple YUV color formats.
+// -[3] The scaling functions were modified to handle the YUV conversion
+//      buffer and to use YUVBuferIter.
+// -[4] The color conversion function selection in YUVBuferIter was borrowed
+//      from I444ToARGBMatrix(), I422ToARGBMatrix() and I420ToARGBMatrix().
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+struct YUVBuferIter {
+ int src_width;
+ int src_height;
+ int src_stride_y;
+ int src_stride_u;
+ int src_stride_v;
+ const uint8* src_y;
+ const uint8* src_u;
+ const uint8* src_v;
+
+ uint32 src_fourcc;
+ const struct YuvConstants* yuvconstants;
+ int y_index;
+ const uint8* src_row_y;
+ const uint8* src_row_u;
+ const uint8* src_row_v;
+
+ void (*YUVToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+ void (*MoveTo)(YUVBuferIter& iter, int y_index);
+ void (*MoveToNextRow)(YUVBuferIter& iter);
+};
+
+void YUVBuferIter_InitI422(YUVBuferIter& iter) {
+ iter.YUVToARGBRow = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ iter.YUVToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(iter.src_width, 8)) {
+ iter.YUVToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ iter.YUVToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(iter.src_width, 16)) {
+ iter.YUVToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ iter.YUVToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(iter.src_width, 8)) {
+ iter.YUVToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(iter.src_width, 4) &&
+ IS_ALIGNED(iter.src_y, 4) && IS_ALIGNED(iter.src_stride_y, 4) &&
+ IS_ALIGNED(iter.src_u, 2) && IS_ALIGNED(iter.src_stride_u, 2) &&
+ IS_ALIGNED(iter.src_v, 2) && IS_ALIGNED(iter.src_stride_v, 2)) {
+ // Always satisfy IS_ALIGNED(argb_cnv_row, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)
+ iter.YUVToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+}
+
+void YUVBuferIter_InitI444(YUVBuferIter& iter) {
+ iter.YUVToARGBRow = I444ToARGBRow_C;
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ iter.YUVToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(iter.src_width, 8)) {
+ iter.YUVToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ iter.YUVToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(iter.src_width, 16)) {
+ iter.YUVToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ iter.YUVToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(iter.src_width, 8)) {
+ iter.YUVToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+}
+
+
+static void YUVBuferIter_MoveToForI444(YUVBuferIter& iter, int y_index) {
+ iter.y_index = y_index;
+ iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;
+ iter.src_row_u = iter.src_u + y_index * iter.src_stride_u;
+ iter.src_row_v = iter.src_v + y_index * iter.src_stride_v;
+}
+
+static void YUVBuferIter_MoveToNextRowForI444(YUVBuferIter& iter) {
+ iter.src_row_y += iter.src_stride_y;
+ iter.src_row_u += iter.src_stride_u;
+ iter.src_row_v += iter.src_stride_v;
+ iter.y_index++;
+}
+
+static void YUVBuferIter_MoveToForI422(YUVBuferIter& iter, int y_index) {
+ iter.y_index = y_index;
+ iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;
+ iter.src_row_u = iter.src_u + y_index * iter.src_stride_u;
+ iter.src_row_v = iter.src_v + y_index * iter.src_stride_v;
+}
+
+static void YUVBuferIter_MoveToNextRowForI422(YUVBuferIter& iter) {
+ iter.src_row_y += iter.src_stride_y;
+ iter.src_row_u += iter.src_stride_u;
+ iter.src_row_v += iter.src_stride_v;
+ iter.y_index++;
+}
+
+static void YUVBuferIter_MoveToForI420(YUVBuferIter& iter, int y_index) {
+ const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
+ int uv_y_index = y_index >> kYShift;
+
+ iter.y_index = y_index;
+ iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;
+ iter.src_row_u = iter.src_u + uv_y_index * iter.src_stride_u;
+ iter.src_row_v = iter.src_v + uv_y_index * iter.src_stride_v;
+}
+
+static void YUVBuferIter_MoveToNextRowForI420(YUVBuferIter& iter) {
+ iter.src_row_y += iter.src_stride_y;
+ if (iter.y_index & 1) {
+ iter.src_row_u += iter.src_stride_u;
+ iter.src_row_v += iter.src_stride_v;
+ }
+ iter.y_index++;
+}
+
+static __inline void YUVBuferIter_ConvertToARGBRow(YUVBuferIter& iter, uint8* argb_row) {
+ iter.YUVToARGBRow(iter.src_row_y, iter.src_row_u, iter.src_row_v, argb_row, iter.yuvconstants, iter.src_width);
+}
+
+void YUVBuferIter_Init(YUVBuferIter& iter, uint32 src_fourcc, mozilla::YUVColorSpace yuv_color_space) {
+ iter.src_fourcc = src_fourcc;
+ iter.y_index = 0;
+ iter.src_row_y = iter.src_y;
+ iter.src_row_u = iter.src_u;
+ iter.src_row_v = iter.src_v;
+ if (yuv_color_space == mozilla::YUVColorSpace::BT709) {
+ iter.yuvconstants = &kYuvH709Constants;
+ } else {
+ iter.yuvconstants = &kYuvI601Constants;
+ }
+
+ if (src_fourcc == FOURCC_I444) {
+ YUVBuferIter_InitI444(iter);
+ iter.MoveTo = YUVBuferIter_MoveToForI444;
+ iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI444;
+ } else if (src_fourcc == FOURCC_I422) {
+ YUVBuferIter_InitI422(iter);
+ iter.MoveTo = YUVBuferIter_MoveToForI422;
+ iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI422;
+ } else {
+ assert(src_fourcc == FOURCC_I420); // I420 reuses the I422 row converters
+ YUVBuferIter_InitI422(iter);
+ iter.MoveTo = YUVBuferIter_MoveToForI420;
+ iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI420;
+ }
+}
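+
+// A sketch of how a caller drives the iterator above: YUVBuferIter_Init
+// selects the row converter and row stepping, MoveTo seeks to an absolute
+// row, and ConvertToARGBRow emits one ARGB row. ExampleConvertRows and its
+// arguments are hypothetical (illustration only), and a BT601 enumerator is
+// assumed alongside the BT709 one used above.
+static void ExampleConvertRows(const uint8* src_y, const uint8* src_u,
+                               const uint8* src_v, int width, int height,
+                               int stride_y, int stride_uv,
+                               uint8* argb_row) {
+  YUVBuferIter iter;
+  iter.src_width = width;
+  iter.src_height = height;
+  iter.src_stride_y = stride_y;
+  iter.src_stride_u = stride_uv;
+  iter.src_stride_v = stride_uv;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, FOURCC_I420, mozilla::YUVColorSpace::BT601);
+  iter.MoveTo(iter, 0);  // seek to the top row
+  for (int j = 0; j < height; ++j) {
+    // Overwrites the same row buffer each pass; a real caller would scale
+    // or copy it out before advancing.
+    YUVBuferIter_ConvertToARGBRow(iter, argb_row);
+    if (j + 1 < height) {
+      iter.MoveToNextRow(iter);  // for I420, chroma advances every other row
+    }
+  }
+}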
+
+// Scale YUV to ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+static void ScaleYUVToARGBDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space) {
+ int j;
+
+ // Allocate 2 rows of ARGB for source conversion.
+ const int kRowSize = (src_width * 4 + 15) & ~15;
+ align_buffer_64(argb_cnv_row, kRowSize * 2);
+ uint8* argb_cnv_rowptr = argb_cnv_row;
+ int argb_cnv_rowstride = kRowSize;
+
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ int yi = y >> 16;
+ iter.MoveTo(iter, yi);
+ ptrdiff_t x_offset;
+ if (filtering == kFilterBilinear) {
+ x_offset = (x >> 16) * 4;
+ } else {
+ x_offset = ((x >> 16) - 1) * 4;
+ }
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+ ScaleARGBRowDown2Box_Any_SSE2);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
+ }
+ }
+
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+ ScaleARGBRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+ ScaleARGBRowDown2Box_NEON);
+ }
+ }
+#endif
+
+ const int dyi = dy >> 16;
+ int lastyi = yi;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ // Prepare next row if necessary
+ if (filtering != kFilterLinear) {
+ if ((yi + dyi) < (src_height - 1)) {
+ iter.MoveTo(iter, yi + dyi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ argb_cnv_rowstride = 0;
+ }
+ }
+
+ if (filtering == kFilterLinear) {
+ argb_cnv_rowstride = 0;
+ }
+ const int max_yi = src_height - 1;
+ const int max_yi_minus_dyi = max_yi - dyi;
+ for (j = 0; j < dst_height; ++j) {
+ if (yi != lastyi) {
+ if (yi > max_yi) {
+ yi = max_yi;
+ }
+ if (yi != lastyi) {
+ if (filtering == kFilterLinear) {
+ iter.MoveTo(iter, yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ lastyi = yi;
+ } else {
+ // Prepare current row
+ if (yi == iter.y_index) {
+ argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+ argb_cnv_rowstride = - argb_cnv_rowstride;
+ } else {
+ iter.MoveTo(iter, yi);
+ argb_cnv_rowptr = argb_cnv_row;
+ argb_cnv_rowstride = kRowSize;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ }
+ // Prepare next row if necessary
+ if (iter.y_index < max_yi) {
+ int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+ iter.MoveTo(iter, next_yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ argb_cnv_rowstride = 0;
+ }
+ lastyi = yi;
+ }
+ }
+ }
+ ScaleARGBRowDown2(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, dst_argb, dst_width);
+ dst_argb += dst_stride_argb;
+ yi += dyi;
+ }
+
+ free_aligned_buffer_64(argb_cnv_row);
+}
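+
+// The argb_cnv_rowptr/argb_cnv_rowstride juggling above is a two-row
+// ping-pong: when the scaler advances by exactly one converted row, the old
+// "next" row becomes "current" by adding the stride and then negating it,
+// so no row is converted twice. The trick in isolation (ExamplePingPong is
+// hypothetical, for illustration only):
+static void ExamplePingPong(uint8* two_rows, int row_size, int steps) {
+  uint8* rowptr = two_rows;       // current row
+  int rowstride = row_size;       // rowptr + rowstride is the other row
+  for (int i = 0; i < steps; ++i) {
+    // ... consume rowptr as "current" and rowptr + rowstride as "next" ...
+    rowptr = rowptr + rowstride;  // the old "next" becomes "current"
+    rowstride = -rowstride;       // the old "current" becomes the new "next"
+  }
+}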
+
+// Scale YUV to ARGB, even
+// This is an optimized version for scaling down an ARGB image to an even
+// multiple of its original size.
+static void ScaleYUVToARGBDownEven(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space) {
+ int j;
+ // Allocate 2 rows of ARGB for source conversion.
+ const int kRowSize = (src_width * 4 + 15) & ~15;
+ align_buffer_64(argb_cnv_row, kRowSize * 2);
+ uint8* argb_cnv_rowptr = argb_cnv_row;
+ int argb_cnv_rowstride = kRowSize;
+
+ int col_step = dx >> 16;
+ void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ int yi = y >> 16;
+ const ptrdiff_t x_offset = (x >> 16) * 4;
+
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+ ScaleARGBRowDownEven_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+ ScaleARGBRowDownEven_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+ ScaleARGBRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+ ScaleARGBRowDownEven_NEON;
+ }
+ }
+#endif
+
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+ const int dyi = dy >> 16;
+ int lastyi = yi;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ // Prepare next row if necessary
+ if (filtering != kFilterLinear) {
+ if ((yi + dyi) < (src_height - 1)) {
+ iter.MoveTo(iter, yi + dyi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ argb_cnv_rowstride = 0;
+ }
+ }
+
+ if (filtering == kFilterLinear) {
+ argb_cnv_rowstride = 0;
+ }
+ const int max_yi = src_height - 1;
+ const int max_yi_minus_dyi = max_yi - dyi;
+ for (j = 0; j < dst_height; ++j) {
+ if (yi != lastyi) {
+ if (yi > max_yi) {
+ yi = max_yi;
+ }
+ if (yi != lastyi) {
+ if (filtering == kFilterLinear) {
+ iter.MoveTo(iter, yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ lastyi = yi;
+ } else {
+ // Prepare current row
+ if (yi == iter.y_index) {
+ argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+ argb_cnv_rowstride = - argb_cnv_rowstride;
+ } else {
+ iter.MoveTo(iter, yi);
+ argb_cnv_rowptr = argb_cnv_row;
+ argb_cnv_rowstride = kRowSize;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ }
+ // Prepare next row if necessary
+ if (iter.y_index < max_yi) {
+ int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+ iter.MoveTo(iter, next_yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ argb_cnv_rowstride = 0;
+ }
+ lastyi = yi;
+ }
+ }
+ }
+ ScaleARGBRowDownEven(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, col_step, dst_argb, dst_width);
+ dst_argb += dst_stride_argb;
+ yi += dyi;
+ }
+ free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB down with bilinear interpolation.
+static void ScaleYUVToARGBBilinearDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+ int64 xlast = x + (int64)(dst_width - 1) * dx;
+ int64 xl = (dx >= 0) ? x : xlast;
+ int64 xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
+ const ptrdiff_t xl_offset = xl * 4;
+ x -= (int)(xl << 16);
+
+  // Allocate 2 rows of ARGB for source conversion.
+ const int kRowSize = (src_width * 4 + 15) & ~15;
+ align_buffer_64(argb_cnv_row, kRowSize * 2);
+ uint8* argb_cnv_rowptr = argb_cnv_row;
+ int argb_cnv_rowstride = kRowSize;
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(argb_cnv_row, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(clip_src_width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+
+ int yi = y >> 16;
+
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+ iter.MoveTo(iter, yi);
+
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of ARGB.
+ align_buffer_64(row, clip_src_width * 4);
+
+ int lastyi = yi;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ // Prepare next row if necessary
+ if (filtering != kFilterLinear) {
+ if ((yi + 1) < src_height) {
+ iter.MoveToNextRow(iter);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ argb_cnv_rowstride = 0;
+ }
+ }
+
+ const int max_y = (src_height - 1) << 16;
+ const int max_yi = src_height - 1;
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lastyi) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ }
+ if (yi != lastyi) {
+ if (filtering == kFilterLinear) {
+ iter.MoveTo(iter, yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ lastyi = yi;
+ } else {
+ // Prepare current row
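+          // (Same two-row ping-pong as in the down-even path: stepping the
+          // pointer by the stride and negating the stride swaps the
+          // "current" and "next" conversion rows without copying.)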
+ if (yi == iter.y_index) {
+          argb_cnv_rowptr += argb_cnv_rowstride;
+          argb_cnv_rowstride = -argb_cnv_rowstride;
+ } else {
+ iter.MoveTo(iter, yi);
+ argb_cnv_rowptr = argb_cnv_row;
+ argb_cnv_rowstride = kRowSize;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+ }
+ // Prepare next row if necessary
+ if (iter.y_index < max_yi) {
+ iter.MoveToNextRow(iter);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+ } else {
+ argb_cnv_rowstride = 0;
+ }
+ lastyi = yi;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ ScaleARGBFilterCols(dst_argb, argb_cnv_rowptr + xl_offset, dst_width, x, dx);
+ } else {
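+      // Blend the two prepared source rows vertically; yf is the fractional
+      // part of y expressed in 1/256 units.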
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, argb_cnv_rowptr + xl_offset, argb_cnv_rowstride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB up with bilinear interpolation.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ const int max_y = (src_height - 1) << 16;
+
+ // Allocate 1 row of ARGB for source conversion.
+ align_buffer_64(argb_cnv_row, src_width * 4);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ int yi = y >> 16;
+
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+ iter.MoveTo(iter, yi);
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lastyi = yi;
+
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+
+ if (filtering == kFilterLinear) {
+ rowstride = 0;
+ }
+ // Prepare next row if necessary
+ if (filtering != kFilterLinear) {
+ if ((yi + 1) < src_height) {
+ iter.MoveToNextRow(iter);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+    } else {
+ rowstride = 0;
+ }
+ }
+
+ const int max_yi = src_height - 1;
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lastyi) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ }
+ if (yi != lastyi) {
+ if (filtering == kFilterLinear) {
+ iter.MoveToNextRow(iter);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+ } else {
+ // Prepare next row if necessary
+ if (yi < max_yi) {
+ iter.MoveToNextRow(iter);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ // TODO(fbarchard): Convert the clipped region of row.
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+ } else {
+ rowstride = 0;
+ }
+ }
+ lastyi = yi;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
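+//
+// A minimal illustrative sketch of the 16.16 arithmetic (the values below
+// are made up for the example, not taken from any caller):
+//
+//   int x  = 3 << 16;          // source position 3.0 in 16.16
+//   int dx = 0x18000;          // step of 1.5 source pixels per dest pixel
+//   x += dx;                   // x is now 0x48000, i.e. 4.5
+//   int src_index = x >> 16;   // integer part: 4
+//   int frac = x & 0xffff;     // fractional part: 0x8000, i.e. 0.5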
+
+static void ScaleYUVToARGBSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space) {
+ int j;
+ void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+
+ // Allocate 1 row of ARGB for source conversion.
+ align_buffer_64(argb_cnv_row, src_width * 4);
+
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ int yi = y >> 16;
+
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+ iter.MoveTo(iter, yi);
+
+ int lasty = yi;
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ iter.MoveTo(iter, yi);
+ YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+ lasty = yi;
+ }
+ ScaleARGBCols(dst_argb, argb_cnv_row, dst_width, x, dx);
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(argb_cnv_row);
+}
+
+static void YUVToARGBCopy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space)
+{
+ YUVBuferIter iter;
+ iter.src_width = src_width;
+ iter.src_height = src_height;
+ iter.src_stride_y = src_stride_y;
+ iter.src_stride_u = src_stride_u;
+ iter.src_stride_v = src_stride_v;
+ iter.src_y = src_y;
+ iter.src_u = src_u;
+ iter.src_v = src_v;
+ YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+ for (int j = 0; j < dst_height; ++j) {
+ YUVBuferIter_ConvertToARGBRow(iter, dst_argb);
+ iter.MoveToNextRow(iter);
+ dst_argb += dst_stride_argb;
+ }
+}
+
+static void ScaleYUVToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space)
+{
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+  // ARGB does not support box filtering yet, but we allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+      // Optimized even scale down, i.e. 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleYUVToARGBDown2(src_width, src_height,
+ dst_width, dst_height,
+ src_stride_y,
+ src_stride_u,
+ src_stride_v,
+ dst_stride_argb,
+ src_y,
+ src_u,
+ src_v,
+ dst_argb,
+ x, dx, y, dy,
+ filtering,
+ src_fourcc,
+ yuv_color_space);
+ return;
+ }
+ ScaleYUVToARGBDownEven(src_width, src_height,
+ dst_width, dst_height,
+ src_stride_y,
+ src_stride_u,
+ src_stride_v,
+ dst_stride_argb,
+ src_y,
+ src_u,
+ src_v,
+ dst_argb,
+ x, dx, y, dy,
+ filtering,
+ src_fourcc,
+ yuv_color_space);
+ return;
+ }
+      // Optimized odd scale down, i.e. 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight conversion and copy.
+ YUVToARGBCopy(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ src_width, src_height,
+ dst_argb, dst_stride_argb,
+ dst_width, dst_height,
+ src_fourcc,
+ yuv_color_space);
+ return;
+ }
+ }
+ }
+ }
+ if (filtering && dy < 65536) {
+ ScaleYUVToARGBBilinearUp(src_width, src_height,
+ dst_width, dst_height,
+ src_stride_y,
+ src_stride_u,
+ src_stride_v,
+ dst_stride_argb,
+ src_y,
+ src_u,
+ src_v,
+ dst_argb,
+ x, dx, y, dy,
+ filtering,
+ src_fourcc,
+ yuv_color_space);
+ return;
+ }
+ if (filtering) {
+ ScaleYUVToARGBBilinearDown(src_width, src_height,
+ dst_width, dst_height,
+ src_stride_y,
+ src_stride_u,
+ src_stride_v,
+ dst_stride_argb,
+ src_y,
+ src_u,
+ src_v,
+ dst_argb,
+ x, dx, y, dy,
+ filtering,
+ src_fourcc,
+ yuv_color_space);
+ return;
+ }
+ ScaleYUVToARGBSimple(src_width, src_height,
+ dst_width, dst_height,
+ src_stride_y,
+ src_stride_u,
+ src_stride_v,
+ dst_stride_argb,
+ src_y,
+ src_u,
+ src_v,
+ dst_argb,
+ x, dx, y, dy,
+ src_fourcc,
+ yuv_color_space);
+}
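+
+// In summary, the dispatch above works out to: an exact 1:1 step is a
+// straight convert-and-copy; a 1/2 step uses the dedicated Down2 path;
+// other even integer steps use DownEven; fractional steps use BilinearUp
+// when dy < 1.0 (in 16.16) and BilinearDown otherwise; anything left,
+// including unfiltered scaling, falls through to Simple.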
+
+bool IsConvertSupported(uint32 src_fourcc)
+{
+ if (src_fourcc == FOURCC_I444 ||
+ src_fourcc == FOURCC_I422 ||
+ src_fourcc == FOURCC_I420) {
+ return true;
+ }
+ return false;
+}
+
+LIBYUV_API
+int YUVToARGBScale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering)
+{
+ if (!src_y || !src_u || !src_v ||
+ src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ if (!IsConvertSupported(src_fourcc)) {
+ return -1;
+ }
+ ScaleYUVToARGB(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ src_width, src_height,
+ dst_argb, dst_stride_argb,
+ dst_width, dst_height,
+ filtering,
+ src_fourcc,
+ yuv_color_space);
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/gfx/ycbcr/scale_yuv_argb.h b/gfx/ycbcr/scale_yuv_argb.h
new file mode 100644
index 000000000..d1a42db1b
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#include "ImageTypes.h" // For YUVColorSpace
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+int YUVToARGBScale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint32 src_fourcc,
+ mozilla::YUVColorSpace yuv_color_space,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_ NOLINT
diff --git a/gfx/ycbcr/update.sh b/gfx/ycbcr/update.sh
new file mode 100644
index 000000000..3a38fe81a
--- /dev/null
+++ b/gfx/ycbcr/update.sh
@@ -0,0 +1,12 @@
+# update.sh <chromium-src-directory>
+cp $1/media/base/yuv_convert.h .
+cp $1/media/base/yuv_convert.cc yuv_convert.cpp
+cp $1/media/base/yuv_row.h .
+cp $1/media/base/yuv_row_table.cc yuv_row_table.cpp
+cp $1/media/base/yuv_row_posix.cc yuv_row_posix.cpp
+cp $1/media/base/yuv_row_win.cc yuv_row_win.cpp
+cp $1/media/base/yuv_row_posix.cc yuv_row_c.cpp
+patch -p3 <convert.patch
+patch -p3 <win64.patch
+patch -p3 <TypeFromSize.patch
+patch -p3 <QuellGccWarnings.patch
diff --git a/gfx/ycbcr/win64.patch b/gfx/ycbcr/win64.patch
new file mode 100644
index 000000000..bdccf2784
--- /dev/null
+++ b/gfx/ycbcr/win64.patch
@@ -0,0 +1,210 @@
+diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_row_win64.cpp
+@@ -0,0 +1,205 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++#include "yuv_row.h"
++
++extern "C" {
++
++// The x64 compiler supports neither MMX nor inline assembly, so use SSE2
++// intrinsics instead.
++
++#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
++#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
++
++#include <emmintrin.h>
++
++static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width) {
++ __m128i xmm0, xmmY1, xmmY2;
++ __m128 xmmY;
++
++ while (width >= 2) {
++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
++
++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++ xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
++ xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++ 0x44);
++ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++ rgb_buf += 8;
++ width -= 2;
++ }
++
++ if (width) {
++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++ xmmY1 = _mm_srai_epi16(xmmY1, 6);
++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++ }
++}
++
++static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
++ __m128i xmm0, xmmY1, xmmY2;
++ __m128 xmmY;
++ uint8 u, v, y;
++ int x = 0;
++
++ while (width >= 2) {
++ u = u_buf[x >> 17];
++ v = v_buf[x >> 17];
++ y = y_buf[x >> 16];
++ x += source_dx;
++
++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++ y = y_buf[x >> 16];
++ x += source_dx;
++
++ xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++ xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++ 0x44);
++ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++ rgb_buf += 8;
++ width -= 2;
++ }
++
++ if (width) {
++ u = u_buf[x >> 17];
++ v = v_buf[x >> 17];
++ y = y_buf[x >> 16];
++
++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++ xmmY1 = _mm_srai_epi16(xmmY1, 6);
++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++ }
++}
++
++static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
++ __m128i xmm0, xmmY1, xmmY2;
++ __m128 xmmY;
++ uint8 u0, u1, v0, v1, y0, y1;
++ uint32 uv_frac, y_frac, u, v, y;
++ int x = 0;
++
++ if (source_dx >= 0x20000) {
++ x = 32768;
++ }
++
++  while (width >= 2) {
++ u0 = u_buf[x >> 17];
++ u1 = u_buf[(x >> 17) + 1];
++ v0 = v_buf[x >> 17];
++ v1 = v_buf[(x >> 17) + 1];
++ y0 = y_buf[x >> 16];
++ y1 = y_buf[(x >> 16) + 1];
++ uv_frac = (x & 0x1fffe);
++ y_frac = (x & 0xffff);
++ u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
++ v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
++ y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
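++    // frac ^ mask == mask - frac here, since frac never has bits outside
++    // the mask; this yields the complementary blend weight without a
++    // subtraction (used for both uv_frac/0x1fffe and y_frac/0xffff).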
++ x += source_dx;
++
++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++ y0 = y_buf[x >> 16];
++ y1 = y_buf[(x >> 16) + 1];
++ y_frac = (x & 0xffff);
++ y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
++ x += source_dx;
++
++ xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++ xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++ 0x44);
++ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++ rgb_buf += 8;
++ width -= 2;
++ }
++
++ if (width) {
++ u = u_buf[x >> 17];
++ v = v_buf[x >> 17];
++ y = y_buf[x >> 16];
++
++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++
++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++ xmmY1 = _mm_srai_epi16(xmmY1, 6);
++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++ }
++}
++
++void FastConvertYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width) {
++ FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
++}
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
++ ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
++ const uint8* u_buf,
++ const uint8* v_buf,
++ uint8* rgb_buf,
++ int width,
++ int source_dx) {
++ LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
++ source_dx);
++}
++
++} // extern "C"
diff --git a/gfx/ycbcr/ycbcr_to_rgb565.cpp b/gfx/ycbcr/ycbcr_to_rgb565.cpp
new file mode 100644
index 000000000..0572e3e09
--- /dev/null
+++ b/gfx/ycbcr/ycbcr_to_rgb565.cpp
@@ -0,0 +1,672 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdlib.h>
+#include <limits.h>
+#include "nsDebug.h"
+#include "ycbcr_to_rgb565.h"
+#include "nsAlgorithm.h"
+
+
+
+#ifdef HAVE_YCBCR_TO_RGB565
+
+namespace mozilla {
+
+namespace gfx {
+
+/*This contains all of the parameters that are needed to convert a row.
+ Passing them in a struct instead of as individual parameters saves the need
+ to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_bilinear_ctx{
+ uint16_t *rgb_row;
+ const uint8_t *y_row;
+ const uint8_t *u_row;
+ const uint8_t *v_row;
+ int y_yweight;
+ int y_pitch;
+ int width;
+ int source_x0_q16;
+ int source_dx_q16;
+ /*Not used for 4:4:4, except with chroma-nearest.*/
+ int source_uv_xoffs_q16;
+ /*Not used for 4:4:4 or chroma-nearest.*/
+ int uv_pitch;
+ /*Not used for 4:2:2, 4:4:4, or chroma-nearest.*/
+ int uv_yweight;
+};
+
+
+
+/*This contains all of the parameters that are needed to convert a row.
+ Passing them in a struct instead of as individual parameters saves the need
+ to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_nearest_ctx{
+ uint16_t *rgb_row;
+ const uint8_t *y_row;
+ const uint8_t *u_row;
+ const uint8_t *v_row;
+ int width;
+ int source_x0_q16;
+ int source_dx_q16;
+ /*Not used for 4:4:4.*/
+ int source_uv_xoffs_q16;
+};
+
+
+
+typedef void (*yuv2rgb565_row_scale_bilinear_func)(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+
+typedef void (*yuv2rgb565_row_scale_nearest_func)(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither);
+
+
+
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+
+extern "C" void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+
+void __attribute((noinline)) yuv42x_to_rgb565_row_neon(uint16 *dst,
+ const uint8 *y,
+ const uint8 *u,
+ const uint8 *v,
+ int n,
+ int oddflag);
+
+#endif
+
+
+
+/*Bilinear interpolation of a single value.
+ This uses the exact same formulas as the asm, even though it adds some extra
+ shifts that do nothing but reduce accuracy.*/
+static int bislerp(const uint8_t *row,
+ int pitch,
+ int source_x,
+ int xweight,
+ int yweight) {
+ int a;
+ int b;
+ int c;
+ int d;
+ a = row[source_x];
+ b = row[source_x+1];
+ c = row[source_x+pitch];
+ d = row[source_x+pitch+1];
+ a = ((a<<8)+(c-a)*yweight+128)>>8;
+ b = ((b<<8)+(d-b)*yweight+128)>>8;
+ return ((a<<8)+(b-a)*xweight+128)>>8;
+}
+
+/*Convert a single pixel from Y'CbCr to RGB565.
+ This uses the exact same formulas as the asm, even though we could make the
+ constants a lot more accurate with 32-bit wide registers.*/
+static uint16_t yu2rgb565(int y, int u, int v, int dither) {
+ /*This combines the constant offset that needs to be added during the Y'CbCr
+ conversion with a rounding offset that depends on the dither parameter.*/
+  static const int DITHER_BIAS[4][3] = {
+    {-14240,     8704,      -17696},
+    {-14240+128, 8704+64,   -17696+128},
+    {-14240+256, 8704+128,  -17696+256},
+    {-14240+384, 8704+192,  -17696+384}
+  };
+ int r;
+ int g;
+ int b;
+ r = clamped((74*y+102*v+DITHER_BIAS[dither][0])>>9, 0, 31);
+ g = clamped((74*y-25*u-52*v+DITHER_BIAS[dither][1])>>8, 0, 63);
+ b = clamped((74*y+129*u+DITHER_BIAS[dither][2])>>9, 0, 31);
+ return (uint16_t)(r<<11 | g<<5 | b);
+}
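+
+/*A worked example of the formulas above (dither index 0): video black
+  (y=16, u=v=128) yields r = (74*16+102*128-14240)>>9 = 0, while video
+  white (y=235, u=v=128) yields r = (74*235+102*128-14240)>>9 = 31,
+  g = (74*235-25*128-52*128+8704)>>8 = 63, and b = 31, i.e. full-scale
+  RGB565 white.*/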
+
+static void ScaleYCbCr420ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+ int x;
+ int source_x_q16;
+ source_x_q16 = ctx->source_x0_q16;
+ for (x = 0; x < ctx->width; x++) {
+ int source_x;
+ int xweight;
+ int y;
+ int u;
+ int v;
+ xweight = ((source_x_q16&0xFFFF)+128)>>8;
+ source_x = source_x_q16>>16;
+ y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+ xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9;
+ source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+ source_x_q16 += ctx->source_dx_q16;
+ u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight);
+ v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight);
+ ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+ dither ^= 3;
+ }
+}
+
+static void ScaleYCbCr422ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+ int x;
+ int source_x_q16;
+ source_x_q16 = ctx->source_x0_q16;
+ for (x = 0; x < ctx->width; x++) {
+ int source_x;
+ int xweight;
+ int y;
+ int u;
+ int v;
+ xweight = ((source_x_q16&0xFFFF)+128)>>8;
+ source_x = source_x_q16>>16;
+ y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+ xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9;
+ source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+ source_x_q16 += ctx->source_dx_q16;
+ u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight);
+ v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight);
+ ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+ dither ^= 3;
+ }
+}
+
+static void ScaleYCbCr444ToRGB565_Bilinear_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+ int x;
+ int source_x_q16;
+ source_x_q16 = ctx->source_x0_q16;
+ for (x = 0; x < ctx->width; x++) {
+ int source_x;
+ int xweight;
+ int y;
+ int u;
+ int v;
+ xweight = ((source_x_q16&0xFFFF)+128)>>8;
+ source_x = source_x_q16>>16;
+ source_x_q16 += ctx->source_dx_q16;
+ y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+ u = bislerp(ctx->u_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+ v = bislerp(ctx->v_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+ ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+ dither ^= 3;
+ }
+}
+
+static void ScaleYCbCr42xToRGB565_BilinearY_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+ int x;
+ int source_x_q16;
+ source_x_q16 = ctx->source_x0_q16;
+ for (x = 0; x < ctx->width; x++) {
+ int source_x;
+ int xweight;
+ int y;
+ int u;
+ int v;
+ xweight = ((source_x_q16&0xFFFF)+128)>>8;
+ source_x = source_x_q16>>16;
+ y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+ source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+ source_x_q16 += ctx->source_dx_q16;
+ u = ctx->u_row[source_x];
+ v = ctx->v_row[source_x];
+ ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+ dither ^= 3;
+ }
+}
+
+static void ScaleYCbCr444ToRGB565_BilinearY_Row_C(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+ int x;
+ int source_x_q16;
+ source_x_q16 = ctx->source_x0_q16;
+ for (x = 0; x < ctx->width; x++) {
+ int source_x;
+ int xweight;
+ int y;
+ int u;
+ int v;
+ xweight = ((source_x_q16&0xFFFF)+128)>>8;
+ source_x = source_x_q16>>16;
+ y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+ source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>16;
+ source_x_q16 += ctx->source_dx_q16;
+ u = ctx->u_row[source_x];
+ v = ctx->v_row[source_x];
+ ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+ dither ^= 3;
+ }
+}
+
+static void ScaleYCbCr42xToRGB565_Nearest_Row_C(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+ int y;
+ int u;
+ int v;
+ int x;
+ int source_x_q16;
+ int source_x;
+ source_x_q16 = ctx->source_x0_q16;
+ for (x = 0; x < ctx->width; x++) {
+ source_x = source_x_q16>>16;
+ y = ctx->y_row[source_x];
+ source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17;
+ source_x_q16 += ctx->source_dx_q16;
+ u = ctx->u_row[source_x];
+ v = ctx->v_row[source_x];
+ ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+ dither ^= 3;
+ }
+}
+
+static void ScaleYCbCr444ToRGB565_Nearest_Row_C(
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+ int y;
+ int u;
+ int v;
+ int x;
+ int source_x_q16;
+ int source_x;
+ source_x_q16 = ctx->source_x0_q16;
+ for (x = 0; x < ctx->width; x++) {
+ source_x = source_x_q16>>16;
+ source_x_q16 += ctx->source_dx_q16;
+ y = ctx->y_row[source_x];
+ u = ctx->u_row[source_x];
+ v = ctx->v_row[source_x];
+ ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+ dither ^= 3;
+ }
+}
+
+void ScaleYCbCrToRGB565(const uint8_t *y_buf,
+ const uint8_t *u_buf,
+ const uint8_t *v_buf,
+ uint8_t *rgb_buf,
+ int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type,
+ ScaleFilter filter) {
+ int source_x0_q16;
+ int source_y0_q16;
+ int source_dx_q16;
+ int source_dy_q16;
+ int source_uv_xoffs_q16;
+ int source_uv_yoffs_q16;
+ int x_shift;
+ int y_shift;
+ int ymin;
+ int ymax;
+ int uvmin;
+ int uvmax;
+ int dither;
+ /*We don't support negative destination rectangles (just flip the source
+ instead), and for empty ones there's nothing to do.*/
+ if (width <= 0 || height <= 0)
+ return;
+ /*These bounds are required to avoid 16.16 fixed-point overflow.*/
+ NS_ASSERTION(source_x0 > (INT_MIN>>16) && source_x0 < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source X offset out of bounds.");
+ NS_ASSERTION(source_x0+source_width > (INT_MIN>>16)
+ && source_x0+source_width < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source width out of bounds.");
+ NS_ASSERTION(source_y0 > (INT_MIN>>16) && source_y0 < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source Y offset out of bounds.");
+ NS_ASSERTION(source_y0+source_height > (INT_MIN>>16)
+ && source_y0+source_height < (INT_MAX>>16),
+ "ScaleYCbCrToRGB565 source height out of bounds.");
+ /*We require the same stride for Y' and Cb and Cr for 4:4:4 content.*/
+ NS_ASSERTION(yuv_type != YV24 || y_pitch == uv_pitch,
+ "ScaleYCbCrToRGB565 luma stride differs from chroma for 4:4:4 content.");
+ /*We assume we can read outside the bounds of the input, because it makes
+ the code much simpler (and in practice is true: both Theora and VP8 return
+ padded reference frames).
+ In practice, we do not even _have_ the actual bounds of the source, as
+ we are passed a crop rectangle from it, and not the dimensions of the full
+ image.
+ This assertion will not guarantee our out-of-bounds reads are safe, but it
+ should at least catch the simple case of passing in an unpadded buffer.*/
+ NS_ASSERTION(abs(y_pitch) >= abs(source_width)+16,
+ "ScaleYCbCrToRGB565 source image unpadded?");
+ /*The NEON code requires the pointers to be aligned to a 16-byte boundary at
+ the start of each row.
+ This should be true for all of our sources.
+ We could try to fix this up if it's not true by adjusting source_x0, but
+ that would require the mis-alignment to be the same for the U and V
+ planes.*/
+ NS_ASSERTION((y_pitch&15) == 0 && (uv_pitch&15) == 0 &&
+ ((y_buf-(uint8_t *)nullptr)&15) == 0 &&
+ ((u_buf-(uint8_t *)nullptr)&15) == 0 &&
+ ((v_buf-(uint8_t *)nullptr)&15) == 0,
+ "ScaleYCbCrToRGB565 source image unaligned");
+ /*We take an area-based approach to pixel coverage to avoid shifting by small
+ amounts (or not so small, when up-scaling or down-scaling by a large
+ factor).
+
+ An illustrative example: scaling 4:2:0 up by 2, using JPEG chroma cositing^.
+
+ + = RGB destination locations
+ * = Y' source locations
+ - = Cb, Cr source locations
+
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+ - -
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+ - -
+ + + + + + + + +
+ * * * *
+ + + + + + + + +
+
+ So, the coordinates of the upper-left + (first destination site) should
+ be (-0.25,-0.25) in the source Y' coordinate system.
+ Similarly, the coordinates should be (-0.375,-0.375) in the source Cb, Cr
+ coordinate system.
+ Note that the origin and scale of these two coordinate systems is not the
+ same!
+
+ ^JPEG cositing is required for Theora; VP8 doesn't specify cositing rules,
+ but nearly all software converters in existence (at least those that are
+ open source, and many that are not) use JPEG cositing instead of MPEG.*/
+ source_dx_q16 = (source_width<<16) / width;
+ source_x0_q16 = (source_x0<<16)+(source_dx_q16>>1)-0x8000;
+ source_dy_q16 = (source_height<<16) / height;
+ source_y0_q16 = (source_y0<<16)+(source_dy_q16>>1)-0x8000;
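+  /*Example, for the 2x 4:2:0 upscale diagrammed above: source_dx_q16 is
+    0x8000 (a step of 0.5), so source_x0_q16 starts at
+    (source_x0<<16)-0x4000, i.e. -0.25 in Y' coordinates, matching the
+    (-0.25,-0.25) origin derived above.*/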
+ x_shift = (yuv_type != YV24);
+ y_shift = (yuv_type == YV12);
+ /*These two variables hold the difference between the origins of the Y' and
+ the Cb, Cr coordinate systems, using the scale of the Y' coordinate
+ system.*/
+ source_uv_xoffs_q16 = -(x_shift<<15);
+ source_uv_yoffs_q16 = -(y_shift<<15);
+ /*Compute the range of source rows we'll actually use.
+ This doesn't guarantee we won't read outside this range.*/
+ ymin = source_height >= 0 ? source_y0 : source_y0+source_height-1;
+ ymax = source_height >= 0 ? source_y0+source_height-1 : source_y0;
+ uvmin = ymin>>y_shift;
+ uvmax = ((ymax+1+y_shift)>>y_shift)-1;
+ /*Pick a dithering pattern.
+ The "&3" at the end is just in case RAND_MAX is lying.*/
+ dither = (rand()/(RAND_MAX>>2))&3;
+ /*Nearest-neighbor scaling.*/
+ if (filter == FILTER_NONE) {
+ yuv2rgb565_row_scale_nearest_ctx ctx;
+ yuv2rgb565_row_scale_nearest_func scale_row;
+ int y;
+ /*Add rounding offsets once, in advance.*/
+ source_x0_q16 += 0x8000;
+ source_y0_q16 += 0x8000;
+ source_uv_xoffs_q16 += (x_shift<<15);
+ source_uv_yoffs_q16 += (y_shift<<15);
+ if (yuv_type == YV12)
+ scale_row = ScaleYCbCr42xToRGB565_Nearest_Row_C;
+ else
+ scale_row = ScaleYCbCr444ToRGB565_Nearest_Row_C;
+ ctx.width = width;
+ ctx.source_x0_q16 = source_x0_q16;
+ ctx.source_dx_q16 = source_dx_q16;
+ ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16;
+ for (y=0; y<height; y++) {
+ int source_y;
+ ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch);
+ source_y = source_y0_q16>>16;
+ source_y = clamped(source_y, ymin, ymax);
+ ctx.y_row = y_buf + source_y*y_pitch;
+ source_y = (source_y0_q16+source_uv_yoffs_q16)>>(16+y_shift);
+ source_y = clamped(source_y, uvmin, uvmax);
+ source_y0_q16 += source_dy_q16;
+ ctx.u_row = u_buf + source_y*uv_pitch;
+ ctx.v_row = v_buf + source_y*uv_pitch;
+ (*scale_row)(&ctx, dither);
+ dither ^= 2;
+ }
+ }
+ /*Bilinear scaling.*/
+ else {
+ yuv2rgb565_row_scale_bilinear_ctx ctx;
+ yuv2rgb565_row_scale_bilinear_func scale_row;
+ int uvxscale_min;
+ int uvxscale_max;
+ int uvyscale_min;
+ int uvyscale_max;
+ int y;
+ /*Check how close the chroma scaling is to unity.
+ If it's close enough, we can get away with nearest-neighbor chroma
+ sub-sampling, and only doing bilinear on luma.
+ If a given axis is subsampled, we use bounds on the luma step of
+ [0.67...2], which is equivalent to scaling chroma by [1...3].
+ If it's not subsampled, we use bounds of [0.5...1.33], which is
+ equivalent to scaling chroma by [0.75...2].
+ The lower bound is chosen as a trade-off between speed and how terrible
+ nearest neighbor looks when upscaling.*/
+# define CHROMA_NEAREST_SUBSAMP_STEP_MIN 0xAAAA
+# define CHROMA_NEAREST_NORMAL_STEP_MIN 0x8000
+# define CHROMA_NEAREST_SUBSAMP_STEP_MAX 0x20000
+# define CHROMA_NEAREST_NORMAL_STEP_MAX 0x15555
+ uvxscale_min = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvxscale_max = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ uvyscale_min = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvyscale_max = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ if (uvxscale_min <= abs(source_dx_q16)
+ && abs(source_dx_q16) <= uvxscale_max
+ && uvyscale_min <= abs(source_dy_q16)
+ && abs(source_dy_q16) <= uvyscale_max) {
+ /*Add the rounding offsets now.*/
+ source_uv_xoffs_q16 += 1<<(15+x_shift);
+ source_uv_yoffs_q16 += 1<<(15+y_shift);
+ if (yuv_type != YV24) {
+ scale_row =
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+ supports_neon() ? ScaleYCbCr42xToRGB565_BilinearY_Row_NEON :
+# endif
+ ScaleYCbCr42xToRGB565_BilinearY_Row_C;
+ }
+ else
+ scale_row = ScaleYCbCr444ToRGB565_BilinearY_Row_C;
+ }
+ else {
+ if (yuv_type == YV12)
+ scale_row = ScaleYCbCr420ToRGB565_Bilinear_Row_C;
+ else if (yuv_type == YV16)
+ scale_row = ScaleYCbCr422ToRGB565_Bilinear_Row_C;
+ else
+ scale_row = ScaleYCbCr444ToRGB565_Bilinear_Row_C;
+ }
+ ctx.width = width;
+ ctx.y_pitch = y_pitch;
+ ctx.source_x0_q16 = source_x0_q16;
+ ctx.source_dx_q16 = source_dx_q16;
+ ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16;
+ ctx.uv_pitch = uv_pitch;
+ for (y=0; y<height; y++) {
+ int source_y;
+ int yweight;
+ int uvweight;
+ ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch);
+ source_y = (source_y0_q16+128)>>16;
+ yweight = ((source_y0_q16+128)>>8)&0xFF;
+ if (source_y < ymin) {
+ source_y = ymin;
+ yweight = 0;
+ }
+ if (source_y > ymax) {
+ source_y = ymax;
+ yweight = 0;
+ }
+ ctx.y_row = y_buf + source_y*y_pitch;
+ source_y = source_y0_q16+source_uv_yoffs_q16+(128<<y_shift);
+ source_y0_q16 += source_dy_q16;
+ uvweight = source_y>>(8+y_shift)&0xFF;
+ source_y >>= 16+y_shift;
+ if (source_y < uvmin) {
+ source_y = uvmin;
+ uvweight = 0;
+ }
+ if (source_y > uvmax) {
+ source_y = uvmax;
+ uvweight = 0;
+ }
+ ctx.u_row = u_buf + source_y*uv_pitch;
+ ctx.v_row = v_buf + source_y*uv_pitch;
+ ctx.y_yweight = yweight;
+ ctx.uv_yweight = uvweight;
+ (*scale_row)(&ctx, dither);
+ dither ^= 2;
+ }
+ }
+}
+
+bool IsScaleYCbCrToRGB565Fast(int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ YUVType yuv_type,
+ ScaleFilter filter)
+{
+ // Very fast.
+ if (width <= 0 || height <= 0)
+ return true;
+# if defined(MOZILLA_MAY_SUPPORT_NEON)
+ if (filter != FILTER_NONE) {
+ int source_dx_q16;
+ int source_dy_q16;
+ int uvxscale_min;
+ int uvxscale_max;
+ int uvyscale_min;
+ int uvyscale_max;
+ source_dx_q16 = (source_width<<16) / width;
+ source_dy_q16 = (source_height<<16) / height;
+ uvxscale_min = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvxscale_max = yuv_type != YV24 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ uvyscale_min = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+ uvyscale_max = yuv_type == YV12 ?
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+ if (uvxscale_min <= abs(source_dx_q16)
+ && abs(source_dx_q16) <= uvxscale_max
+ && uvyscale_min <= abs(source_dy_q16)
+ && abs(source_dy_q16) <= uvyscale_max) {
+ if (yuv_type != YV24)
+ return supports_neon();
+ }
+ }
+# endif
+ return false;
+}
+
+
+
+void yuv_to_rgb565_row_c(uint16 *dst,
+ const uint8 *y,
+ const uint8 *u,
+ const uint8 *v,
+ int x_shift,
+ int pic_x,
+ int pic_width)
+{
+ int x;
+ for (x = 0; x < pic_width; x++)
+ {
+ dst[x] = yu2rgb565(y[pic_x+x],
+ u[(pic_x+x)>>x_shift],
+ v[(pic_x+x)>>x_shift],
+ 2); // Disable dithering for now.
+ }
+}
+
+void ConvertYCbCrToRGB565(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type)
+{
+ int x_shift;
+ int y_shift;
+ x_shift = yuv_type != YV24;
+ y_shift = yuv_type == YV12;
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+ if (yuv_type != YV24 && supports_neon())
+ {
+ for (int i = 0; i < pic_height; i++) {
+ int yoffs;
+ int uvoffs;
+ yoffs = y_pitch * (pic_y+i) + pic_x;
+ uvoffs = uv_pitch * ((pic_y+i)>>y_shift) + (pic_x>>x_shift);
+ yuv42x_to_rgb565_row_neon((uint16*)(rgb_buf + rgb_pitch * i),
+ y_buf + yoffs,
+ u_buf + uvoffs,
+ v_buf + uvoffs,
+ pic_width,
+ pic_x&x_shift);
+ }
+ }
+ else
+# endif
+ {
+ for (int i = 0; i < pic_height; i++) {
+ int yoffs;
+ int uvoffs;
+ yoffs = y_pitch * (pic_y+i);
+ uvoffs = uv_pitch * ((pic_y+i)>>y_shift);
+ yuv_to_rgb565_row_c((uint16*)(rgb_buf + rgb_pitch * i),
+ y_buf + yoffs,
+ u_buf + uvoffs,
+ v_buf + uvoffs,
+ x_shift,
+ pic_x,
+ pic_width);
+ }
+ }
+}
+
+bool IsConvertYCbCrToRGB565Fast(int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ YUVType yuv_type)
+{
+# if defined(MOZILLA_MAY_SUPPORT_NEON)
+ return (yuv_type != YV24 && supports_neon());
+# else
+ return false;
+# endif
+}
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // HAVE_YCBCR_TO_RGB565
diff --git a/gfx/ycbcr/ycbcr_to_rgb565.h b/gfx/ycbcr/ycbcr_to_rgb565.h
new file mode 100644
index 000000000..41272223b
--- /dev/null
+++ b/gfx/ycbcr/ycbcr_to_rgb565.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef MEDIA_BASE_YCBCR_TO_RGB565_H_
+#define MEDIA_BASE_YCBCR_TO_RGB565_H_
+#include "yuv_convert.h"
+#include "mozilla/arm.h"
+
+// It's currently only worth including this if we have NEON support.
+#ifdef MOZILLA_MAY_SUPPORT_NEON
+#define HAVE_YCBCR_TO_RGB565 1
+#endif
+
+namespace mozilla {
+
+namespace gfx {
+
+#ifdef HAVE_YCBCR_TO_RGB565
+// Convert a frame of YUV to 16 bit RGB565.
+void ConvertYCbCrToRGB565(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+// Used to test if we have an accelerated version.
+bool IsConvertYCbCrToRGB565Fast(int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ YUVType yuv_type);
+
+// Scale a frame of YUV to 16 bit RGB565.
+void ScaleYCbCrToRGB565(const uint8_t *yplane,
+ const uint8_t *uplane,
+ const uint8_t *vplane,
+ uint8_t *rgbframe,
+ int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ ScaleFilter filter);
+
+// Used to test if we have an accelerated version.
+bool IsScaleYCbCrToRGB565Fast(int source_x0,
+ int source_y0,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ YUVType yuv_type,
+ ScaleFilter filter);
+#endif // HAVE_YCBCR_TO_RGB565
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // MEDIA_BASE_YCBCR_TO_RGB565_H_
diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
new file mode 100644
index 000000000..78fd4ee89
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -0,0 +1,510 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This webpage shows layout of YV12 and other YUV formats
+// http://www.fourcc.org/yuv.php
+// The actual conversion is best described here
+// http://en.wikipedia.org/wiki/YUV
+// An article on optimizing YUV conversion using tables instead of multiplies
+// http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
+//
+// YV12 is a full plane of Y and half-height, half-width chroma planes.
+// YV16 is a full plane of Y and full-height, half-width chroma planes.
+// YV24 is a full plane of Y and full-height, full-width chroma planes.
+//
+// ARGB pixel format is output, which on little endian is stored as BGRA.
+// The alpha is set to 255, allowing the application to use RGBA or RGB32.
+
+#include "yuv_convert.h"
+
+#include "gfxPrefs.h"
+#include "libyuv.h"
+#include "scale_yuv_argb.h"
+// Header for low level row functions.
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+namespace mozilla {
+
+namespace gfx {
+
+// 16.16 fixed point arithmetic
+const int kFractionBits = 16;
+const int kFractionMax = 1 << kFractionBits;
+const int kFractionMask = ((1 << kFractionBits) - 1);
+
+YUVType TypeFromSize(int ywidth,
+ int yheight,
+ int cbcrwidth,
+ int cbcrheight)
+{
+ if (ywidth == cbcrwidth && yheight == cbcrheight) {
+ return YV24;
+ }
+ else if ((ywidth + 1) / 2 == cbcrwidth && yheight == cbcrheight) {
+ return YV16;
+ }
+ else {
+ return YV12;
+ }
+}
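+
+// For example, a 1920x1080 Y plane with 960x540 chroma planes yields
+// YV12, with 960x1080 chroma planes YV16, and with 1920x1080 chroma
+// planes YV24.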
+
+libyuv::FourCC FourCCFromYUVType(YUVType aYUVType)
+{
+ if (aYUVType == YV24) {
+ return libyuv::FOURCC_I444;
+ } else if (aYUVType == YV16) {
+ return libyuv::FOURCC_I422;
+ } else if (aYUVType == YV12) {
+ return libyuv::FOURCC_I420;
+ } else {
+ return libyuv::FOURCC_ANY;
+ }
+}
+
+// Convert a frame of YUV to 32 bit ARGB.
+void ConvertYCbCrToRGB32(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type,
+ YUVColorSpace yuv_color_space) {
+
+
+  // The deprecated function's conversion is accurate.
+  // libyuv's conversion is slightly inaccurate in exchange for performance:
+  // it computes RGB from YUV dynamically so that it can use SIMD. One
+  // conversion coefficient should be 129, but it is stored in a signed byte,
+  // so libyuv clamps it to 127, and only 6 fractional bits are kept during
+  // the dynamic calculation.
+  //
+  // The deprecated function is still fast on some old Intel chips.
+  // See Bug 1256475.
+ bool use_deprecated = gfxPrefs::YCbCrAccurateConversion() ||
+ (supports_mmx() && supports_sse() && !supports_sse3() &&
+ yuv_color_space == YUVColorSpace::BT601);
+  // The deprecated function only supports BT601.
+ // See Bug 1210357.
+ if (yuv_color_space != YUVColorSpace::BT601) {
+ use_deprecated = false;
+ }
+ if (use_deprecated) {
+ ConvertYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf, rgb_buf,
+ pic_x, pic_y, pic_width, pic_height,
+ y_pitch, uv_pitch, rgb_pitch, yuv_type);
+ return;
+ }
+
+ if (yuv_type == YV24) {
+ const uint8* src_y = y_buf + y_pitch * pic_y + pic_x;
+ const uint8* src_u = u_buf + uv_pitch * pic_y + pic_x;
+ const uint8* src_v = v_buf + uv_pitch * pic_y + pic_x;
+ DebugOnly<int> err = libyuv::I444ToARGB(src_y, y_pitch,
+ src_u, uv_pitch,
+ src_v, uv_pitch,
+ rgb_buf, rgb_pitch,
+ pic_width, pic_height);
+ MOZ_ASSERT(!err);
+ } else if (yuv_type == YV16) {
+ const uint8* src_y = y_buf + y_pitch * pic_y + pic_x;
+ const uint8* src_u = u_buf + uv_pitch * pic_y + pic_x / 2;
+ const uint8* src_v = v_buf + uv_pitch * pic_y + pic_x / 2;
+ DebugOnly<int> err = libyuv::I422ToARGB(src_y, y_pitch,
+ src_u, uv_pitch,
+ src_v, uv_pitch,
+ rgb_buf, rgb_pitch,
+ pic_width, pic_height);
+ MOZ_ASSERT(!err);
+ } else {
+ MOZ_ASSERT(yuv_type == YV12);
+ const uint8* src_y = y_buf + y_pitch * pic_y + pic_x;
+ const uint8* src_u = u_buf + (uv_pitch * pic_y + pic_x) / 2;
+ const uint8* src_v = v_buf + (uv_pitch * pic_y + pic_x) / 2;
+ if (yuv_color_space == YUVColorSpace::BT709) {
+ DebugOnly<int> err = libyuv::H420ToARGB(src_y, y_pitch,
+ src_u, uv_pitch,
+ src_v, uv_pitch,
+ rgb_buf, rgb_pitch,
+ pic_width, pic_height);
+ MOZ_ASSERT(!err);
+ } else {
+ MOZ_ASSERT(yuv_color_space == YUVColorSpace::BT601);
+ DebugOnly<int> err = libyuv::I420ToARGB(src_y, y_pitch,
+ src_u, uv_pitch,
+ src_v, uv_pitch,
+ rgb_buf, rgb_pitch,
+ pic_width, pic_height);
+ MOZ_ASSERT(!err);
+ }
+ }
+}
+
+// Convert a frame of YUV to 32 bit ARGB.
+void ConvertYCbCrToRGB32_deprecated(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type) {
+ unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+ unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
+ // Test for SSE because the optimized code uses movntq, which is not part of MMX.
+ bool has_sse = supports_mmx() && supports_sse();
+ // There is no optimized YV24 SSE routine so we check for this and
+ // fall back to the C code.
+ has_sse &= yuv_type != YV24;
+ bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
+ int x_width = odd_pic_x ? pic_width - 1 : pic_width;
+
+ for (int y = pic_y; y < pic_height + pic_y; ++y) {
+ uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
+ const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
+ const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
+ const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
+
+ if (odd_pic_x) {
+ // Handle the single odd pixel manually and use the
+ // fast routines for the remaining.
+ FastConvertYUVToRGB32Row_C(y_ptr++,
+ u_ptr++,
+ v_ptr++,
+ rgb_row,
+ 1,
+ x_shift);
+ rgb_row += 4;
+ }
+
+ if (has_sse) {
+ FastConvertYUVToRGB32Row(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ x_width);
+ }
+ else {
+ FastConvertYUVToRGB32Row_C(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ x_width,
+ x_shift);
+ }
+ }
+
+  // The MMX code used by FastConvertYUVToRGB32Row requires the emms instruction.
+ if (has_sse)
+ EMMS();
+}
+
+// C version does 8 at a time to mimic MMX code
+static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ uint8* end = ybuf + source_width;
+ do {
+ ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
+ ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
+ ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
+ ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
+ ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8;
+ ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
+ ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
+ ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
+ y0_ptr += 8;
+ y1_ptr += 8;
+ ybuf += 8;
+ } while (ybuf < end);
+}
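+
+// Note that, like the SIMD versions below, the loop above rounds the width
+// up to a multiple of 8 and so may write up to 7 pixels past source_width;
+// the caller pads its row buffers accordingly (see the comment above yuvbuf
+// in ScaleYCbCrToRGB32_deprecated).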
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+#endif
+
+static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
+ const uint8* y1_ptr, int source_width,
+ int source_y_fraction) {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+ if (mozilla::supports_sse2()) {
+ FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ return;
+ }
+#endif
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+ if (mozilla::supports_mmx()) {
+ FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ return;
+ }
+#endif
+
+ FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+}
+
+
+// Scale a frame of YUV to 32 bit ARGB.
+void ScaleYCbCrToRGB32(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type,
+ YUVColorSpace yuv_color_space,
+ ScaleFilter filter) {
+
+ bool use_deprecated = gfxPrefs::YCbCrAccurateConversion() ||
+#if defined(XP_WIN) && defined(_M_X64)
+ // libyuv does not support SIMD scaling on win 64bit. See Bug 1295927.
+ supports_sse3() ||
+#endif
+ (supports_mmx() && supports_sse() && !supports_sse3());
+  // The deprecated function only supports BT601.
+ // See Bug 1210357.
+ if (yuv_color_space != YUVColorSpace::BT601) {
+ use_deprecated = false;
+ }
+ if (use_deprecated) {
+ ScaleYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf,
+ rgb_buf,
+ source_width, source_height,
+ width, height,
+ y_pitch, uv_pitch,
+ rgb_pitch,
+ yuv_type,
+ ROTATE_0,
+ filter);
+ return;
+ }
+
+ DebugOnly<int> err =
+ libyuv::YUVToARGBScale(y_buf, y_pitch,
+ u_buf, uv_pitch,
+ v_buf, uv_pitch,
+ FourCCFromYUVType(yuv_type),
+ yuv_color_space,
+ source_width, source_height,
+ rgb_buf, rgb_pitch,
+ width, height,
+ libyuv::kFilterBilinear);
+ MOZ_ASSERT(!err);
+ return;
+}
+
+// Scale a frame of YUV to 32 bit ARGB.
+void ScaleYCbCrToRGB32_deprecated(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int y_pitch,
+ int uv_pitch,
+ int rgb_pitch,
+ YUVType yuv_type,
+ Rotate view_rotate,
+ ScaleFilter filter) {
+ bool has_mmx = supports_mmx();
+
+  // 4096 allows 3 buffers to fit in 12k.
+  // Helps performance on CPUs with a 16K L1 cache.
+  // Large enough for 3840x2160 video and 30" displays, which are 2560x1600.
+ const int kFilterBufferSize = 4096;
+ // Disable filtering if the screen is too big (to avoid buffer overflows).
+ // This should never happen to regular users: they don't have monitors
+ // wider than 4096 pixels.
+ // TODO(fbarchard): Allow rotated videos to filter.
+ if (source_width > kFilterBufferSize || view_rotate)
+ filter = FILTER_NONE;
+
+ unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
+ // Diagram showing origin and direction of source sampling.
+ // ->0 4<-
+ // 7 3
+ //
+ // 6 5
+ // ->1 2<-
+ // Rotations that start at right side of image.
+ if ((view_rotate == ROTATE_180) ||
+ (view_rotate == ROTATE_270) ||
+ (view_rotate == MIRROR_ROTATE_0) ||
+ (view_rotate == MIRROR_ROTATE_90)) {
+ y_buf += source_width - 1;
+ u_buf += source_width / 2 - 1;
+ v_buf += source_width / 2 - 1;
+ source_width = -source_width;
+ }
+ // Rotations that start at bottom of image.
+ if ((view_rotate == ROTATE_90) ||
+ (view_rotate == ROTATE_180) ||
+ (view_rotate == MIRROR_ROTATE_90) ||
+ (view_rotate == MIRROR_ROTATE_180)) {
+ y_buf += (source_height - 1) * y_pitch;
+ u_buf += ((source_height >> y_shift) - 1) * uv_pitch;
+ v_buf += ((source_height >> y_shift) - 1) * uv_pitch;
+ source_height = -source_height;
+ }
+
+ // Handle zero sized destination.
+ if (width == 0 || height == 0)
+ return;
+ int source_dx = source_width * kFractionMax / width;
+ int source_dy = source_height * kFractionMax / height;
+ int source_dx_uv = source_dx;
+
+ if ((view_rotate == ROTATE_90) ||
+ (view_rotate == ROTATE_270)) {
+ int tmp = height;
+ height = width;
+ width = tmp;
+ tmp = source_height;
+ source_height = source_width;
+ source_width = tmp;
+ int original_dx = source_dx;
+ int original_dy = source_dy;
+ source_dx = ((original_dy >> kFractionBits) * y_pitch) << kFractionBits;
+ source_dx_uv = ((original_dy >> kFractionBits) * uv_pitch) << kFractionBits;
+ source_dy = original_dx;
+ if (view_rotate == ROTATE_90) {
+ y_pitch = -1;
+ uv_pitch = -1;
+ source_height = -source_height;
+ } else {
+ y_pitch = 1;
+ uv_pitch = 1;
+ }
+ }
+
+  // Need padding because the SSE2 version of FilterRows() can write
+  // 1 to 16 extra pixels past the end of the row.
+ uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16];
+ uint8* ybuf =
+ reinterpret_cast<uint8*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15);
+ uint8* ubuf = ybuf + kFilterBufferSize;
+ uint8* vbuf = ubuf + kFilterBufferSize;
+ // TODO(fbarchard): Fixed point math is off by 1 on negatives.
+ int yscale_fixed = (source_height << kFractionBits) / height;
+
+ // TODO(fbarchard): Split this into separate function for better efficiency.
+ for (int y = 0; y < height; ++y) {
+ uint8* dest_pixel = rgb_buf + y * rgb_pitch;
+ int source_y_subpixel = (y * yscale_fixed);
+ if (yscale_fixed >= (kFractionMax * 2)) {
+ source_y_subpixel += kFractionMax / 2; // For 1/2 or less, center filter.
+ }
+ int source_y = source_y_subpixel >> kFractionBits;
+
+ const uint8* y0_ptr = y_buf + source_y * y_pitch;
+ const uint8* y1_ptr = y0_ptr + y_pitch;
+
+ const uint8* u0_ptr = u_buf + (source_y >> y_shift) * uv_pitch;
+ const uint8* u1_ptr = u0_ptr + uv_pitch;
+ const uint8* v0_ptr = v_buf + (source_y >> y_shift) * uv_pitch;
+ const uint8* v1_ptr = v0_ptr + uv_pitch;
+
+ // vertical scaler uses 16.8 fixed point
+ int source_y_fraction = (source_y_subpixel & kFractionMask) >> 8;
+ int source_uv_fraction =
+ ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
+
+ const uint8* y_ptr = y0_ptr;
+ const uint8* u_ptr = u0_ptr;
+ const uint8* v_ptr = v0_ptr;
+ // Apply vertical filtering if necessary.
+ // TODO(fbarchard): Remove memcpy when not necessary.
+ if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
+ if (yscale_fixed != kFractionMax &&
+ source_y_fraction && ((source_y + 1) < source_height)) {
+ FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ } else {
+ memcpy(ybuf, y0_ptr, source_width);
+ }
+ y_ptr = ybuf;
+ ybuf[source_width] = ybuf[source_width-1];
+ int uv_source_width = (source_width + 1) / 2;
+ if (yscale_fixed != kFractionMax &&
+ source_uv_fraction &&
+ (((source_y >> y_shift) + 1) < (source_height >> y_shift))) {
+ FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction);
+ FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction);
+ } else {
+ memcpy(ubuf, u0_ptr, uv_source_width);
+ memcpy(vbuf, v0_ptr, uv_source_width);
+ }
+ u_ptr = ubuf;
+ v_ptr = vbuf;
+ ubuf[uv_source_width] = ubuf[uv_source_width - 1];
+ vbuf[uv_source_width] = vbuf[uv_source_width - 1];
+ }
+ if (source_dx == kFractionMax) { // Not scaled
+ FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width);
+ } else if (filter & FILTER_BILINEAR_H) {
+ LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ } else {
+// Specialized scalers and rotation.
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
+    if (mozilla::supports_sse()) {
+ if (width == (source_width * 2)) {
+ DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width);
+ } else if ((source_dx & kFractionMask) == 0) {
+        // Scaling by an integer scale factor, i.e. half size.
+ ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width,
+ source_dx >> kFractionBits);
+ } else if (source_dx_uv == source_dx) { // Not rotated.
+ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ } else {
+ RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width,
+ source_dx >> kFractionBits,
+ source_dx_uv >> kFractionBits);
+ }
+ }
+ else {
+ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+ }
+#else
+ (void)source_dx_uv;
+ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+ dest_pixel, width, source_dx);
+#endif
+ }
+ }
+ // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
+ if (has_mmx)
+ EMMS();
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
new file mode 100644
index 000000000..266a23d45
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert.h
@@ -0,0 +1,110 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MEDIA_BASE_YUV_CONVERT_H_
+#define MEDIA_BASE_YUV_CONVERT_H_
+
+#include "chromium_types.h"
+#include "ImageTypes.h"
+
+namespace mozilla {
+
+namespace gfx {
+
+// Type of YUV surface.
+// The values of these enums matter, as they are used to shift vertical indices.
+enum YUVType {
+ YV12 = 0, // YV12 is half width and half height chroma channels.
+ YV16 = 1, // YV16 is half width and full height chroma channels.
+ YV24 = 2 // YV24 is full width and full height chroma channels.
+};
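+
+// For example, for a w x h luma plane the chroma planes are typically
+// (w/2) x (h/2) for YV12, (w/2) x h for YV16, and w x h for YV24 (widths
+// and heights rounded up for odd sizes); TypeFromSize() below maps such
+// dimensions back to a YUVType.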
+
+// Mirror means flip the image horizontally, as in looking in a mirror.
+// Rotate happens after mirroring.
+enum Rotate {
+ ROTATE_0, // Rotation off.
+ ROTATE_90, // Rotate clockwise.
+ ROTATE_180, // Rotate upside down.
+ ROTATE_270, // Rotate counter clockwise.
+ MIRROR_ROTATE_0, // Mirror horizontally.
+ MIRROR_ROTATE_90, // Mirror then Rotate clockwise.
+ MIRROR_ROTATE_180, // Mirror vertically.
+ MIRROR_ROTATE_270 // Transpose.
+};
+
+// Filter affects how scaling looks.
+enum ScaleFilter {
+ FILTER_NONE = 0, // No filter (point sampled).
+ FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
+ FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
+ FILTER_BILINEAR = 3 // Bilinear filter.
+};
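+
+// The values form a bit mask, so the two passes can be tested separately,
+// e.g. (filter & FILTER_BILINEAR_V) in ScaleYCbCrToRGB32_deprecated().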
+
+YUVType TypeFromSize(int ywidth, int yheight, int cbcrwidth, int cbcrheight);
+
+// Convert a frame of YUV to 32 bit ARGB.
+// Pass in YV16/YV12 depending on the source format.
+void ConvertYCbCrToRGB32(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ YUVColorSpace yuv_color_space);
+
+void ConvertYCbCrToRGB32_deprecated(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int pic_x,
+ int pic_y,
+ int pic_width,
+ int pic_height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+// Scale a frame of YUV to 32 bit ARGB.
+// Supports rotation and mirroring.
+void ScaleYCbCrToRGB32(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ YUVColorSpace yuv_color_space,
+ ScaleFilter filter);
+
+void ScaleYCbCrToRGB32_deprecated(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int source_width,
+ int source_height,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type,
+ Rotate view_rotate,
+ ScaleFilter filter);
+
+} // namespace gfx
+} // namespace mozilla
+
+#endif // MEDIA_BASE_YUV_CONVERT_H_
diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp
new file mode 100644
index 000000000..081343b0b
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_arm.cpp
@@ -0,0 +1,232 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
+
+#include "yuv_convert.h"
+#include "ycbcr_to_rgb565.h"
+
+
+
+#ifdef HAVE_YCBCR_TO_RGB565
+
+namespace mozilla {
+
+namespace gfx {
+
+# if defined(MOZILLA_MAY_SUPPORT_NEON)
+# if defined(__clang__)
+void __attribute((noinline))
+# else
+void __attribute((noinline,optimize("-fomit-frame-pointer")))
+# endif
+ yuv42x_to_rgb565_row_neon(uint16 *dst,
+ const uint8 *y,
+ const uint8 *u,
+ const uint8 *v,
+ int n,
+ int oddflag)
+{
+ static __attribute__((aligned(16))) uint16 acc_r[8] = {
+ 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
+ };
+ static __attribute__((aligned(16))) uint16 acc_g[8] = {
+ 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
+ };
+ static __attribute__((aligned(16))) uint16 acc_b[8] = {
+ 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
+ };
+ /*
+ * Registers:
+ * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data
+ * q2 : d4, d5 - are used for storing converted RGB data
+ * q3 : d6, d7 - are used for temporary storage
+ *
+ * q4-q7 - reserved
+ *
+ * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data
+ * q10 : d20, d21
+ * q11 : d22, d23
+ * q12 : d24, d25
+ * q13 : d26, d27
+ * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154)
+ */
+ asm volatile (
+".fpu neon\n"
+/* Allow building on targets that do not support NEON, and force the object
+ * file target to avoid bumping the final binary's target */
+".arch armv7-a\n"
+".object_arch armv4t\n"
+".macro convert_macroblock size\n"
+/* load up to 16 source pixels */
+ ".if \\size == 16\n"
+ "pld [%[y], #64]\n"
+ "pld [%[u], #64]\n"
+ "pld [%[v], #64]\n"
+ "vld1.8 {d1}, [%[y]]!\n"
+ "vld1.8 {d3}, [%[y]]!\n"
+ "vld1.8 {d0}, [%[u]]!\n"
+ "vld1.8 {d2}, [%[v]]!\n"
+ ".elseif \\size == 8\n"
+ "vld1.8 {d1}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d0[1]}, [%[u]]!\n"
+ "vld1.8 {d0[2]}, [%[u]]!\n"
+ "vld1.8 {d0[3]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ "vld1.8 {d2[1]}, [%[v]]!\n"
+ "vld1.8 {d2[2]}, [%[v]]!\n"
+ "vld1.8 {d2[3]}, [%[v]]!\n"
+ ".elseif \\size == 4\n"
+ "vld1.8 {d1[0]}, [%[y]]!\n"
+ "vld1.8 {d1[1]}, [%[y]]!\n"
+ "vld1.8 {d1[2]}, [%[y]]!\n"
+ "vld1.8 {d1[3]}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d0[1]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ "vld1.8 {d2[1]}, [%[v]]!\n"
+ ".elseif \\size == 2\n"
+ "vld1.8 {d1[0]}, [%[y]]!\n"
+ "vld1.8 {d1[1]}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ ".elseif \\size == 1\n"
+ "vld1.8 {d1[0]}, [%[y]]!\n"
+ "vld1.8 {d0[0]}, [%[u]]!\n"
+ "vld1.8 {d2[0]}, [%[v]]!\n"
+ ".else\n"
+ ".error \"unsupported macroblock size\"\n"
+ ".endif\n"
+
+ /* d1 - Y data (first 8 bytes) */
+ /* d3 - Y data (next 8 bytes) */
+ /* d0 - U data, d2 - V data */
+
+ /* split even and odd Y color components */
+ "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */
+ /* clip upper and lower boundaries */
+ "vqadd.u8 q0, q0, q4\n"
+ "vqadd.u8 q1, q1, q4\n"
+ "vqsub.u8 q0, q0, q5\n"
+ "vqsub.u8 q1, q1, q5\n"
+
+ "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */
+
+ "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */
+ "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */
+
+ "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
+ "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */
+ "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */
+ "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
+ "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */
+ "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 */
+ "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
+ "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */
+ "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */
+
+ "vhsub.s16 q3, q8, q10\n" /* calculate even red components */
+ "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */
+ "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */
+ "vqshrun.s16 d3, q10, #6\n" /* right shift, narrow and saturate odd red components */
+
+ "vhadd.s16 q3, q8, q11\n" /* calculate even green components */
+ "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */
+ "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */
+ "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */
+
+ "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */
+ "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */
+ "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */
+ "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */
+
+ "vzip.8 d0, d3\n" /* join even and odd red components */
+ "vzip.8 d1, d4\n" /* join even and odd green components */
+ "vzip.8 d2, d5\n" /* join even and odd blue components */
+
+ "vshll.u8 q3, d0, #8\n\t"
+ "vshll.u8 q8, d1, #8\n\t"
+ "vshll.u8 q9, d2, #8\n\t"
+ "vsri.u16 q3, q8, #5\t\n"
+ "vsri.u16 q3, q9, #11\t\n"
+ /* store pixel data to memory */
+ ".if \\size == 16\n"
+ " vst1.16 {d6, d7}, [%[dst]]!\n"
+ " vshll.u8 q3, d3, #8\n\t"
+ " vshll.u8 q8, d4, #8\n\t"
+ " vshll.u8 q9, d5, #8\n\t"
+ " vsri.u16 q3, q8, #5\t\n"
+ " vsri.u16 q3, q9, #11\t\n"
+ " vst1.16 {d6, d7}, [%[dst]]!\n"
+ ".elseif \\size == 8\n"
+ " vst1.16 {d6, d7}, [%[dst]]!\n"
+ ".elseif \\size == 4\n"
+ " vst1.16 {d6}, [%[dst]]!\n"
+ ".elseif \\size == 2\n"
+ " vst1.16 {d6[0]}, [%[dst]]!\n"
+ " vst1.16 {d6[1]}, [%[dst]]!\n"
+ ".elseif \\size == 1\n"
+ " vst1.16 {d6[0]}, [%[dst]]!\n"
+ ".endif\n"
+ ".endm\n"
+
+ "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */
+ "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */
+ "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */
+ "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */
+
+ "vmov.u8 d26, #16\n"
+ "vmov.u8 d27, #149\n"
+ "vmov.u8 d28, #204\n"
+ "vmov.u8 d29, #50\n"
+ "vmov.u8 d30, #104\n"
+ "vmov.u8 d31, #154\n"
+
+ "cmp %[oddflag], #0\n"
+ "beq 1f\n"
+ "convert_macroblock 1\n"
+ "sub %[n], %[n], #1\n"
+ "1:\n"
+ "subs %[n], %[n], #16\n"
+ "blt 2f\n"
+ "1:\n"
+ "convert_macroblock 16\n"
+ "subs %[n], %[n], #16\n"
+ "bge 1b\n"
+ "2:\n"
+ "tst %[n], #8\n"
+ "beq 3f\n"
+ "convert_macroblock 8\n"
+ "3:\n"
+ "tst %[n], #4\n"
+ "beq 4f\n"
+ "convert_macroblock 4\n"
+ "4:\n"
+ "tst %[n], #2\n"
+ "beq 5f\n"
+ "convert_macroblock 2\n"
+ "5:\n"
+ "tst %[n], #1\n"
+ "beq 6f\n"
+ "convert_macroblock 1\n"
+ "6:\n"
+ ".purgem convert_macroblock\n"
+ : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
+ : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
+ [oddflag] "r" (oddflag)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+ );
+}
+# endif // MOZILLA_MAY_SUPPORT_NEON
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // HAVE_YCBCR_TO_RGB565
diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
new file mode 100644
index 000000000..b5353e500
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_mmx.cpp
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <mmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation.
+// MMX version does 8 pixels at a time.
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m64 zero = _mm_setzero_si64();
+ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
+
+ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+ __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+
+ do {
+ __m64 y0 = *y0_ptr64++;
+ __m64 y1 = *y1_ptr64++;
+ __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+ __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+ y0 = _mm_unpacklo_pi8(y0, zero);
+ y1 = _mm_unpacklo_pi8(y1, zero);
+ y0 = _mm_mullo_pi16(y0, y0_fraction);
+ y1 = _mm_mullo_pi16(y1, y1_fraction);
+ y2 = _mm_mullo_pi16(y2, y0_fraction);
+ y3 = _mm_mullo_pi16(y3, y1_fraction);
+ y0 = _mm_add_pi16(y0, y1);
+ y2 = _mm_add_pi16(y2, y3);
+ y0 = _mm_srli_pi16(y0, 8);
+ y2 = _mm_srli_pi16(y2, 8);
+ y0 = _mm_packs_pu16(y0, y2);
+ *dest64++ = y0;
+ } while (dest64 < end64);
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
new file mode 100644
index 000000000..25fe20639
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_sse2.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <emmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows combines two rows of the image using linear interpolation.
+// SSE2 version does 16 pixels at a time.
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
+
+ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+ do {
+ __m128i y0 = _mm_loadu_si128(y0_ptr128);
+ __m128i y1 = _mm_loadu_si128(y1_ptr128);
+ __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+ __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+ y0 = _mm_unpacklo_epi8(y0, zero);
+ y1 = _mm_unpacklo_epi8(y1, zero);
+ y0 = _mm_mullo_epi16(y0, y0_fraction);
+ y1 = _mm_mullo_epi16(y1, y1_fraction);
+ y2 = _mm_mullo_epi16(y2, y0_fraction);
+ y3 = _mm_mullo_epi16(y3, y1_fraction);
+ y0 = _mm_add_epi16(y0, y1);
+ y2 = _mm_add_epi16(y2, y3);
+ y0 = _mm_srli_epi16(y0, 8);
+ y2 = _mm_srli_epi16(y2, 8);
+ y0 = _mm_packus_epi16(y0, y2);
+ *dest128++ = y0;
+ ++y0_ptr128;
+ ++y1_ptr128;
+ } while (dest128 < end128);
+}
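+
+// Note: the loop above rounds the row up to whole 16-byte vectors, so it
+// can store up to 15 bytes past source_width; callers are expected to
+// provide padded buffers (see the yuvbuf padding in
+// ScaleYCbCrToRGB32_deprecated()).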
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
new file mode 100644
index 000000000..c89f54b8f
--- /dev/null
+++ b/gfx/ycbcr/yuv_row.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// yuv_row internal functions to handle YUV conversion and scaling to RGB.
+// These functions are used from both yuv_convert.cc and yuv_scale.cc.
+
+// TODO(fbarchard): Write function that can handle rotation and scaling.
+
+#ifndef MEDIA_BASE_YUV_ROW_H_
+#define MEDIA_BASE_YUV_ROW_H_
+
+#include "chromium_types.h"
+
+extern "C" {
+// Can only do 1x.
+// This is the second fastest of the scalers.
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ unsigned int x_shift);
+
+// Can do 1x, half size or any scale down by an integer amount.
+// Step can be negative (mirroring, rotate 180).
+// This is the third fastest of the scalers.
+// Only defined on Windows x86-32.
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int step);
+
+// Rotate is like Convert, but applies a different step to Y versus U and V.
+// This allows rotation by 90 or 270, by stepping by stride.
+// This is the fourth fastest of the scalers.
+// Only defined on Windows x86-32.
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int ystep,
+ int uvstep);
+
+// Doubler does 4 pixels at a time. Each pixel is replicated.
+// This is the fastest of the scalers.
+// Only defined on Windows x86-32.
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+// Handles arbitrary scaling up or down.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is undersampled every 2 pixels for performance.
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+// Handles arbitrary scaling up or down with bilinear filtering.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is undersampled every 2 pixels for performance.
+// This is the slowest of the scalers.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
+
+// x64 uses MMX2 (SSE) so emms is not required.
+// Warning C4799: function has no EMMS instruction.
+// EMMS() is slow and should be called by the calling function once per image.
+#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
+#if defined(_MSC_VER)
+#define EMMS() __asm emms
+#pragma warning(disable: 4799)
+#else
+#define EMMS() asm("emms")
+#endif
+#else
+#define EMMS() ((void)0)
+#endif
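+
+// Typical usage (a sketch): batch the MMX row calls, then clear the MMX
+// state once for the whole image:
+//
+//   for (int y = 0; y < height; ++y)
+//     FastConvertYUVToRGB32Row(y_row, u_row, v_row, rgb_row, width);
+//   EMMS();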
+
+} // extern "C"
+
+#endif // MEDIA_BASE_YUV_ROW_H_
diff --git a/gfx/ycbcr/yuv_row_arm.s b/gfx/ycbcr/yuv_row_arm.s
new file mode 100644
index 000000000..6a6c81bee
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_arm.s
@@ -0,0 +1,304 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+ .arch armv7-a
+ .fpu neon
+/* Allow building on targets that do not support NEON, and force the object
+ * file target to avoid bumping the final binary's target */
+ .object_arch armv4t
+ .text
+ .align
+
+ .balign 64
+YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
+ .short -14240
+ .short -14240+384
+ .short 8672
+ .short 8672+192
+ .short -17696
+ .short -17696+384
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
+ .short -14240+128
+ .short -14240+256
+ .short 8672+64
+ .short 8672+128
+ .short -17696+128
+ .short -17696+256
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
+ .short -14240+256
+ .short -14240+128
+ .short 8672+128
+ .short 8672+64
+ .short -17696+256
+ .short -17696+128
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
+ .short -14240+384
+ .short -14240
+ .short 8672+192
+ .short 8672
+ .short -17696+384
+ .short -17696
+ .byte 102
+ .byte 25
+ .byte 52
+ .byte 129
+
+@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+@ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+@
+@ ctx = {
+@ uint16_t *rgb_row; /*r0*/
+@ const uint8_t *y_row; /*r1*/
+@ const uint8_t *u_row; /*r2*/
+@ const uint8_t *v_row; /*r3*/
+@ int y_yweight; /*r4*/
+@ int y_pitch; /*r5*/
+@ int width; /*r6*/
+@ int source_x0_q16; /*r7*/
+@ int source_dx_q16; /*r8*/
+@ int source_uv_xoffs_q16; /*r9*/
+@ };
+ .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+ .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
+ .balign 64
+ .fnstart
+ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
+ STMFD r13!,{r4-r9,r14} @ 8 words.
+ ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
+ VPUSH {Q4-Q7} @ 16 words.
+ ADD r14,r14,r1, LSL #4 @ Select the dither table to use
+ LDMIA r0, {r0-r9}
+ @ Set up image index registers.
+ ADD r12,r8, r8
+ VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16
+ VDUP.32 D17,r12
+ ADD r12,r12,r12
+  VTRN.32  D16,D17         @ Q8 = < 2| 0| 2| 0>*source_dx_q16
+ VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16
+ ADD r12,r12,r12
+ VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16
+ VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16
+ CMP r8, #0 @ If source_dx_q16 is negative...
+ VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16
+ ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block
+ VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
+ SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use)
+ VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
+  VDUP.I32 Q9, r8            @ Q9 = < 1| 1| 1| 1>*source_dx_q16
+ VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
+ VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
+ VLD1.64 {D30,D31},[r14,:128] @ Load some constants
+ VMOV.I8 D28,#52
+ VMOV.I8 D29,#129
+ @ The basic idea here is to do aligned loads of a block of data and then
+ @ index into it using VTBL to extract the data from the source X
+ @ coordinate corresponding to each destination pixel.
+ @ This is significantly less code and significantly fewer cycles than doing
+ @ a series of single-lane loads, but it means that the X step between
+ @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
+ @ that we could read 8 pixels from a single aligned 32-byte block of data.
+ @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
+ @ separated into even pixels and odd pixels to make extracting offsets and
+ @ weights easier.
+ @ We then pull out two bytes from the middle of each coordinate: the top
+ @ byte corresponds to the integer part of the X coordinate, and the bottom
+ @ byte corresponds to the weight to use for bilinear blending.
+ @ These are separated out into different registers with VTRN.
+ @ Then by subtracting the integer X coordinate of the first pixel in the
+ @ data block we loaded, we produce an index register suitable for use by
+ @ VTBL.
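+@ Concretely (a sketch): after the VRSHRN.S32 #8 below, each 16-bit lane
+@ holds bits 8-23 of a 16.16 coordinate, i.e. <integer x | 8-bit weight>;
+@ VTRN.8 then separates these into Q8 (weights) and Q9 (offsets), and
+@ subtracting the base of the loaded block turns Q9 into VTBL indices.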
+s42xbily_neon_loop:
+ @ Load the Y' data.
+ MOV r12,r7, ASR #16
+ VRSHRN.S32 D16,Q0, #8
+ AND r12,r12,#~15 @ Read 16-byte aligned blocks
+ VDUP.I8 D20,r12
+ ADD r12,r1, r12 @ r12 = y_row+(source_x&~7)
+ VRSHRN.S32 D17,Q1, #8
+ PLD [r12,#64]
+ VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row
+ ADD r14,r7, r8, LSL #3
+ VRSHRN.S32 D18,Q2, #8
+ MOV r14,r14,ASR #16
+ VRSHRN.S32 D19,Q3, #8
+ AND r14,r14,#~15 @ Read 16-byte aligned blocks
+ VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row
+ PLD [r12,#64]
+ VDUP.I8 D21,r14
+ ADD r14,r1, r14 @ r14 = y_row+(source_x&~7)
+ VMOV.I8 Q13,#1
+ PLD [r14,#64]
+ VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+ @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+ VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded.
+ @ First 8 Y' pixels
+ VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x
+ VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x
+ VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x
+ VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1
+ VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1
+ @ Next 8 Y' pixels
+ VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row
+ VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row
+ PLD [r14,#64]
+ VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x
+ VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x
+ VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1
+ VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1
+ @ Blend Y'.
+ VDUP.I16 Q9, r4 @ Load the y weights.
+ VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a
+ VSUBL.U8 Q5, D25,D21
+ VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b
+ VSUBL.U8 Q7, D27,D23
+ VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight
+ VMUL.S16 Q5, Q5, Q9
+ VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight
+ VMUL.S16 Q7, Q7, Q9
+ VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits.
+ VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW.
+ VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8
+ VRSHRN.S16 D9, Q5, #8
+ VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8
+ VRSHRN.S16 D13,Q7, #8
+ VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8)
+ VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8)
+ VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a
+ VSUBL.U8 Q5, D23,D21
+ VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight
+ VMUL.S16 Q5, Q5, Q13
+ VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8
+ ADD r12,r7, r9
+ VRSHRN.S16 D9, Q5, #8
+ MOV r12,r12,ASR #17
+ VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8)
+ @ Start extracting the chroma x coordinates, and load Cb and Cr.
+ AND r12,r12,#~15 @ Read 16-byte aligned blocks
+ VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4
+ ADD r14,r2, r12
+ VADD.I32 Q10,Q0, Q9
+ VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb
+ PLD [r14,#64]
+ VADD.I32 Q11,Q1, Q9
+ ADD r14,r3, r12
+ VADD.I32 Q12,Q2, Q9
+ VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr
+ PLD [r14,#64]
+ VADD.I32 Q13,Q3, Q9
+ VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
+ VRSHRN.S32 D21,Q11,#9
+ VDUP.I8 Q9, r12
+ VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
+ VRSHRN.S32 D23,Q13,#9
+ @ We don't actually need the x weights, but we get them for free.
+ @ Free ALU slot
+ VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+ @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+ VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded.
+ VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x
+ VMOV.I8 D24,#74
+ VTBL.8 D19,{D8, D9, D10,D11},D23
+ VMOV.I8 D26,#102
+ VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x
+ VMOV.I8 D27,#25
+ VTBL.8 D21,{D12,D13,D14,D15},D23
+ @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
+ @ We use VDUP to expand constants, because it's a permute instruction, so
+ @ it can dual issue on the A8.
+ SUBS r6, r6, #16 @ width -= 16
+ VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74
+ VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G
+ VMULL.U8 Q5, D17,D24
+ VDUP.32 Q7, D30[1]
+ VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G
+ VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R
+ VMLSL.U8 Q7, D19,D27
+ VDUP.32 Q12,D30[0]
+ VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R
+ VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B
+ VMLAL.U8 Q12,D21,D26
+ VDUP.32 Q13,D31[0]
+ VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B
+ VMLAL.U8 Q13,D19,D29
+ VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G
+ VMLSL.U8 Q7, D21,D28
+ VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R
+ VADD.S16 Q12,Q5, Q12
+  VQADD.S16 Q8, Q4, Q8     @ Q13:Q8  = 74*Y'+129*Cb+bias_B
+ VQADD.S16 Q13,Q5, Q13
+ VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G
+ VADD.S16 Q7, Q5, Q7
+ @ Push each value to the top of its word and saturate it.
+ VQSHLU.S16 Q11,Q11,#2
+ VQSHLU.S16 Q12,Q12,#2
+ VQSHLU.S16 Q6, Q6, #2
+ VQSHLU.S16 Q7, Q7, #2
+ VQSHLU.S16 Q8, Q8, #2
+ VQSHLU.S16 Q13,Q13,#2
+ @ Merge G and B into R.
+ VSRI.U16 Q11,Q6, #5
+ VSRI.U16 Q12,Q7, #5
+ VSRI.U16 Q11,Q8, #11
+ MOV r14,r8, LSL #4
+ VSRI.U16 Q12,Q13,#11
+ BLT s42xbily_neon_tail
+ VDUP.I32 Q13,r14
+ @ Store the result.
+ VST1.16 {D22,D23,D24,D25},[r0]!
+ BEQ s42xbily_neon_done
+ @ Advance the x coordinates.
+ VADD.I32 Q0, Q0, Q13
+ VADD.I32 Q1, Q1, Q13
+ ADD r7, r14
+ VADD.I32 Q2, Q2, Q13
+ VADD.I32 Q3, Q3, Q13
+ B s42xbily_neon_loop
+s42xbily_neon_tail:
+ @ We have between 1 and 15 pixels left to write.
+ @ -r6 == the number of pixels we need to skip writing.
+ @ Adjust r0 to point to the last one we need to write, because we're going
+ @ to write them in reverse order.
+ ADD r0, r0, r6, LSL #1
+ MOV r14,#-2
+ ADD r0, r0, #30
+ @ Skip past the ones we don't need to write.
+ SUB PC, PC, r6, LSL #2
+ ORR r0, r0, r0
+ VST1.16 {D25[3]},[r0,:16],r14
+ VST1.16 {D25[2]},[r0,:16],r14
+ VST1.16 {D25[1]},[r0,:16],r14
+ VST1.16 {D25[0]},[r0,:16],r14
+ VST1.16 {D24[3]},[r0,:16],r14
+ VST1.16 {D24[2]},[r0,:16],r14
+ VST1.16 {D24[1]},[r0,:16],r14
+ VST1.16 {D24[0]},[r0,:16],r14
+ VST1.16 {D23[3]},[r0,:16],r14
+ VST1.16 {D23[2]},[r0,:16],r14
+ VST1.16 {D23[1]},[r0,:16],r14
+ VST1.16 {D23[0]},[r0,:16],r14
+ VST1.16 {D22[3]},[r0,:16],r14
+ VST1.16 {D22[2]},[r0,:16],r14
+ VST1.16 {D22[1]},[r0,:16],r14
+ VST1.16 {D22[0]},[r0,:16]
+s42xbily_neon_done:
+ VPOP {Q4-Q7} @ 16 words.
+ LDMFD r13!,{r4-r9,PC} @ 8 words.
+ .fnend
+ .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+
+#if defined(__ELF__)&&defined(__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
new file mode 100644
index 000000000..d327f854e
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_c.cpp
@@ -0,0 +1,133 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+#define DCHECK(a)
+
+extern "C" {
+
+// C reference code that mimics the YUV assembly.
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+ (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+
+static inline void YuvPixel(uint8 y,
+ uint8 u,
+ uint8 v,
+ uint8* rgb_buf) {
+
+ int b = kCoefficientsRgbY[256+u][0];
+ int g = kCoefficientsRgbY[256+u][1];
+ int r = kCoefficientsRgbY[256+u][2];
+ int a = kCoefficientsRgbY[256+u][3];
+
+ b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+ g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+ r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+ a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+
+ b = paddsw(b, kCoefficientsRgbY[y][0]);
+ g = paddsw(g, kCoefficientsRgbY[y][1]);
+ r = paddsw(r, kCoefficientsRgbY[y][2]);
+ a = paddsw(a, kCoefficientsRgbY[y][3]);
+
+ b >>= 6;
+ g >>= 6;
+ r >>= 6;
+ a >>= 6;
+
+ *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
+ (packuswb(g) << 8) |
+ (packuswb(r) << 16) |
+ (packuswb(a) << 24);
+}
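+
+// The table above packs per-channel contributions in fixed point with 6
+// fractional bits: rows 0-255 hold the Y terms, 256-511 the U terms, and
+// 512-767 the V terms, so saturating adds across the three lookups plus a
+// final >> 6 yield each 8-bit channel.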
+
+void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ unsigned int x_shift) {
+ for (int x = 0; x < width; x += 2) {
+ uint8 u = u_buf[x >> x_shift];
+ uint8 v = v_buf[x >> x_shift];
+ uint8 y0 = y_buf[x];
+ YuvPixel(y0, u, v, rgb_buf);
+ if ((x + 1) < width) {
+ uint8 y1 = y_buf[x + 1];
+ if (x_shift == 0) {
+ u = u_buf[x + 1];
+ v = v_buf[x + 1];
+ }
+ YuvPixel(y1, u, v, rgb_buf + 4);
+ }
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+}
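+
+// x_shift selects the chroma subsampling: 1 shares each U/V sample across
+// two pixels (YV12/YV16 rows), 0 reads fresh chroma for every pixel (YV24).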
+
+// 16.16 fixed point is used. A shift by 16 isolates the integer.
+// A shift by 17 is used to further subsample the chrominance channels.
+// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+// for 1/65536 pixel accurate interpolation.
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ int x = 0;
+ for (int i = 0; i < width; i += 2) {
+ int y = y_buf[x >> 16];
+ int u = u_buf[(x >> 17)];
+ int v = v_buf[(x >> 17)];
+ YuvPixel(y, u, v, rgb_buf);
+ x += source_dx;
+ if ((i + 1) < width) {
+ y = y_buf[x >> 16];
+ YuvPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+}
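+
+// Worked example (numbers assumed for illustration): halving 640 source
+// pixels to 320 destination pixels gives source_dx = (640 << 16) / 320 =
+// 0x20000, so x >> 16 visits luma samples 0, 2, 4, ... and the per-pair
+// chroma index x >> 17 visits samples 0, 2, 4, ...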
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ int x = 0;
+ if (source_dx >= 0x20000) {
+ x = 32768;
+ }
+ for (int i = 0; i < width; i += 2) {
+ int y0 = y_buf[x >> 16];
+ int y1 = y_buf[(x >> 16) + 1];
+ int u0 = u_buf[(x >> 17)];
+ int u1 = u_buf[(x >> 17) + 1];
+ int v0 = v_buf[(x >> 17)];
+ int v1 = v_buf[(x >> 17) + 1];
+ int y_frac = (x & 65535);
+ int uv_frac = ((x >> 1) & 65535);
+ int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+ int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+ int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+ YuvPixel(y, u, v, rgb_buf);
+ x += source_dx;
+ if ((i + 1) < width) {
+ y0 = y_buf[x >> 16];
+ y1 = y_buf[(x >> 16) + 1];
+ y_frac = (x & 65535);
+ y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+ YuvPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+}
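+
+// Note on the weights above: the fractions are masked to 16 bits, so
+// (frac ^ 65535) equals (65535 - frac), giving the complementary bilinear
+// weight without a subtraction.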
+
+} // extern "C"
+
diff --git a/gfx/ycbcr/yuv_row_other.cpp b/gfx/ycbcr/yuv_row_other.cpp
new file mode 100644
index 000000000..c351139f9
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_other.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+}
diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
new file mode 100644
index 000000000..a84792d96
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_posix.cpp
@@ -0,0 +1,917 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+#define DCHECK(a)
+
+extern "C" {
+
+#if defined(ARCH_CPU_X86_64)
+
+// We don't need CPUID guards here, since x86-64 implies SSE2.
+
+// The AMD64 ABI uses register parameters.
+void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm(
+ "jmp 1f\n"
+"0:"
+ "movzb (%1),%%r10\n"
+ "add $0x1,%1\n"
+ "movzb (%2),%%r11\n"
+ "add $0x1,%2\n"
+ "movq 2048(%5,%%r10,8),%%xmm0\n"
+ "movzb (%0),%%r10\n"
+ "movq 4096(%5,%%r11,8),%%xmm1\n"
+ "movzb 0x1(%0),%%r11\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "movq (%5,%%r10,8),%%xmm2\n"
+ "add $0x2,%0\n"
+ "movq (%5,%%r11,8),%%xmm3\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "paddsw %%xmm0,%%xmm3\n"
+ "shufps $0x44,%%xmm3,%%xmm2\n"
+ "psraw $0x6,%%xmm2\n"
+ "packuswb %%xmm2,%%xmm2\n"
+ "movq %%xmm2,0x0(%3)\n"
+ "add $0x8,%3\n"
+"1:"
+ "sub $0x2,%4\n"
+ "jns 0b\n"
+
+"2:"
+ "add $0x1,%4\n"
+ "js 3f\n"
+
+ "movzb (%1),%%r10\n"
+ "movq 2048(%5,%%r10,8),%%xmm0\n"
+ "movzb (%2),%%r10\n"
+ "movq 4096(%5,%%r10,8),%%xmm1\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "movzb (%0),%%r10\n"
+ "movq (%5,%%r10,8),%%xmm1\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%3)\n"
+"3:"
+ :
+ : "r"(y_buf), // %0
+ "r"(u_buf), // %1
+ "r"(v_buf), // %2
+ "r"(rgb_buf), // %3
+ "r"(width), // %4
+ "r" (kCoefficientsRgbY) // %5
+ : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width, // r8
+ int source_dx) { // r9
+ asm(
+ "xor %%r11,%%r11\n"
+ "sub $0x2,%4\n"
+ "js 1f\n"
+
+"0:"
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+ "movzb (%1,%%r10,1),%%rax\n"
+ "movq 2048(%5,%%rax,8),%%xmm0\n"
+ "movzb (%2,%%r10,1),%%rax\n"
+ "movq 4096(%5,%%rax,8),%%xmm1\n"
+ "lea (%%r11,%6),%%r10\n"
+ "sar $0x10,%%r11\n"
+ "movzb (%0,%%r11,1),%%rax\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "movq (%5,%%rax,8),%%xmm1\n"
+ "lea (%%r10,%6),%%r11\n"
+ "sar $0x10,%%r10\n"
+ "movzb (%0,%%r10,1),%%rax\n"
+ "movq (%5,%%rax,8),%%xmm2\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "shufps $0x44,%%xmm2,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movq %%xmm1,0x0(%3)\n"
+ "add $0x8,%3\n"
+ "sub $0x2,%4\n"
+ "jns 0b\n"
+
+"1:"
+ "add $0x1,%4\n"
+ "js 2f\n"
+
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+ "movzb (%1,%%r10,1),%%rax\n"
+ "movq 2048(%5,%%rax,8),%%xmm0\n"
+ "movzb (%2,%%r10,1),%%rax\n"
+ "movq 4096(%5,%%rax,8),%%xmm1\n"
+ "paddsw %%xmm1,%%xmm0\n"
+ "sar $0x10,%%r11\n"
+ "movzb (%0,%%r11,1),%%rax\n"
+ "movq (%5,%%rax,8),%%xmm1\n"
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%3)\n"
+
+"2:"
+ :
+ : "r"(y_buf), // %0
+ "r"(u_buf), // %1
+ "r"(v_buf), // %2
+ "r"(rgb_buf), // %3
+ "r"(width), // %4
+ "r" (kCoefficientsRgbY), // %5
+ "r"(static_cast<long>(source_dx)) // %6
+ : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ asm(
+ "xor %%r11,%%r11\n" // x = 0
+ "sub $0x2,%4\n"
+ "js 2f\n"
+ "cmp $0x20000,%6\n" // if source_dx >= 2.0
+ "jl 0f\n"
+ "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
+"0:"
+
+"1:"
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+
+ "movzb (%1, %%r10, 1), %%r13 \n"
+ "movzb 1(%1, %%r10, 1), %%r14 \n"
+ "mov %%r11, %%rax \n"
+ "and $0x1fffe, %%rax \n"
+ "imul %%rax, %%r14 \n"
+ "xor $0x1fffe, %%rax \n"
+ "imul %%rax, %%r13 \n"
+ "add %%r14, %%r13 \n"
+ "shr $17, %%r13 \n"
+ "movq 2048(%5,%%r13,8), %%xmm0\n"
+
+ "movzb (%2, %%r10, 1), %%r13 \n"
+ "movzb 1(%2, %%r10, 1), %%r14 \n"
+ "mov %%r11, %%rax \n"
+ "and $0x1fffe, %%rax \n"
+ "imul %%rax, %%r14 \n"
+ "xor $0x1fffe, %%rax \n"
+ "imul %%rax, %%r13 \n"
+ "add %%r14, %%r13 \n"
+ "shr $17, %%r13 \n"
+ "movq 4096(%5,%%r13,8), %%xmm1\n"
+
+ "mov %%r11, %%rax \n"
+ "lea (%%r11,%6),%%r10\n"
+ "sar $0x10,%%r11\n"
+ "paddsw %%xmm1,%%xmm0\n"
+
+ "movzb (%0, %%r11, 1), %%r13 \n"
+ "movzb 1(%0, %%r11, 1), %%r14 \n"
+ "and $0xffff, %%rax \n"
+ "imul %%rax, %%r14 \n"
+ "xor $0xffff, %%rax \n"
+ "imul %%rax, %%r13 \n"
+ "add %%r14, %%r13 \n"
+ "shr $16, %%r13 \n"
+ "movq (%5,%%r13,8),%%xmm1\n"
+
+ "mov %%r10, %%rax \n"
+ "lea (%%r10,%6),%%r11\n"
+ "sar $0x10,%%r10\n"
+
+ "movzb (%0,%%r10,1), %%r13 \n"
+ "movzb 1(%0,%%r10,1), %%r14 \n"
+ "and $0xffff, %%rax \n"
+ "imul %%rax, %%r14 \n"
+ "xor $0xffff, %%rax \n"
+ "imul %%rax, %%r13 \n"
+ "add %%r14, %%r13 \n"
+ "shr $16, %%r13 \n"
+ "movq (%5,%%r13,8),%%xmm2\n"
+
+ "paddsw %%xmm0,%%xmm1\n"
+ "paddsw %%xmm0,%%xmm2\n"
+ "shufps $0x44,%%xmm2,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movq %%xmm1,0x0(%3)\n"
+ "add $0x8,%3\n"
+ "sub $0x2,%4\n"
+ "jns 1b\n"
+
+"2:"
+ "add $0x1,%4\n"
+ "js 3f\n"
+
+ "mov %%r11,%%r10\n"
+ "sar $0x11,%%r10\n"
+
+ "movzb (%1,%%r10,1), %%r13 \n"
+ "movq 2048(%5,%%r13,8),%%xmm0\n"
+
+ "movzb (%2,%%r10,1), %%r13 \n"
+ "movq 4096(%5,%%r13,8),%%xmm1\n"
+
+ "paddsw %%xmm1,%%xmm0\n"
+ "sar $0x10,%%r11\n"
+
+ "movzb (%0,%%r11,1), %%r13 \n"
+ "movq (%5,%%r13,8),%%xmm1\n"
+
+ "paddsw %%xmm0,%%xmm1\n"
+ "psraw $0x6,%%xmm1\n"
+ "packuswb %%xmm1,%%xmm1\n"
+ "movd %%xmm1,0x0(%3)\n"
+
+"3:"
+ :
+ : "r"(y_buf), // %0
+ "r"(u_buf), // %1
+ "r"(v_buf), // %2
+ "r"(rgb_buf), // %3
+ "r"(width), // %4
+ "r" (kCoefficientsRgbY), // %5
+ "r"(static_cast<long>(source_dx)) // %6
+ : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
+
+// The PIC version is slower because fewer registers are available, so
+// non-PIC is used on platforms where it is possible.
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+ asm(
+ ".text\n"
+ ".global FastConvertYUVToRGB32Row_SSE\n"
+ ".type FastConvertYUVToRGB32Row_SSE, @function\n"
+"FastConvertYUVToRGB32Row_SSE:\n"
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+ "jmp 1f\n"
+
+"0:"
+ "movzbl (%edi),%eax\n"
+ "add $0x1,%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "add $0x1,%esi\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "add $0x2,%edx\n"
+ "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+"1:"
+ "sub $0x2,%ecx\n"
+ "jns 0b\n"
+
+ "and $0x1,%ecx\n"
+ "je 2f\n"
+
+ "movzbl (%edi),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "movzbl (%esi),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+"2:"
+ "popa\n"
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width)
+{
+ if (mozilla::supports_sse()) {
+ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+ return;
+ }
+
+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+ asm(
+ ".text\n"
+ ".global ScaleYUVToRGB32Row_SSE\n"
+ ".type ScaleYUVToRGB32Row_SSE, @function\n"
+"ScaleYUVToRGB32Row_SSE:\n"
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+ "xor %ebx,%ebx\n"
+ "jmp 1f\n"
+
+"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+"1:"
+ "sub $0x2,%ecx\n"
+ "jns 0b\n"
+
+ "and $0x1,%ecx\n"
+ "je 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+
+"2:"
+ "popa\n"
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx)
+{
+ if (mozilla::supports_sse()) {
+ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+ width, source_dx);
+ return;
+ }
+
+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+ width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+ asm(
+ ".text\n"
+ ".global LinearScaleYUVToRGB32Row_SSE\n"
+ ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
+"LinearScaleYUVToRGB32Row_SSE:\n"
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x30(%esp),%ebp\n"
+
+ // source_width = width * source_dx + ebx
+ "mov 0x34(%esp), %ecx\n"
+ "imull 0x38(%esp), %ecx\n"
+ "mov %ecx, 0x34(%esp)\n"
+
+ "mov 0x38(%esp), %ecx\n"
+ "xor %ebx,%ebx\n" // x = 0
+ "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+ "jl 1f\n"
+ "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+ "jmp 1f\n"
+
+"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%edi,%eax,1),%ecx\n"
+ "movzbl 1(%edi,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"
+ "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
+
+ "mov 0x2c(%esp),%esi\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%esi,%eax,1),%ecx\n"
+ "movzbl 1(%esi,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"
+ "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "andl $0xffff, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
+
+ "cmp 0x34(%esp), %ebx\n"
+ "jge 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "andl $0xffff, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
+
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+
+"1:"
+ "cmp 0x34(%esp), %ebx\n"
+ "jl 0b\n"
+ "popa\n"
+ "ret\n"
+
+"2:"
+ "paddsw %mm0, %mm1\n"
+ "psraw $6, %mm1\n"
+ "packuswb %mm1, %mm1\n"
+ "movd %mm1, (%ebp)\n"
+ "popa\n"
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx)
+{
+ if (mozilla::supports_sse()) {
+ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+ width, source_dx);
+ return;
+ }
+
+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+ width, source_dx);
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
+
+void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ const int16 *kCoefficientsRgbY);
+
+ asm(
+ ".text\n"
+#if defined(XP_MACOSX)
+"_PICConvertYUVToRGB32Row_SSE:\n"
+#else
+"PICConvertYUVToRGB32Row_SSE:\n"
+#endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x38(%esp),%ecx\n"
+
+ "jmp 1f\n"
+
+"0:"
+ "movzbl (%edi),%eax\n"
+ "add $0x1,%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "add $0x1,%esi\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw 4096(%ecx,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "add $0x2,%edx\n"
+ "movq 0(%ecx,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+"1:"
+ "subl $0x2,0x34(%esp)\n"
+ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"
+ "je 2f\n"
+
+ "movzbl (%edi),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "movzbl (%esi),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+"2:"
+ "popa\n"
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width)
+{
+ if (mozilla::supports_sse()) {
+ PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+ &kCoefficientsRgbY[0][0]);
+ return;
+ }
+
+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx,
+ const int16 *kCoefficientsRgbY);
+
+ asm(
+ ".text\n"
+#if defined(XP_MACOSX)
+"_PICScaleYUVToRGB32Row_SSE:\n"
+#else
+"PICScaleYUVToRGB32Row_SSE:\n"
+#endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x3c(%esp),%ecx\n"
+ "xor %ebx,%ebx\n"
+ "jmp 1f\n"
+
+"0:"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+"1:"
+ "subl $0x2,0x34(%esp)\n"
+ "jns 0b\n"
+
+ "andl $0x1,0x34(%esp)\n"
+ "je 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%edi,%eax,1),%eax\n"
+ "movq 2048(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+ "movzbl (%esi,%eax,1),%eax\n"
+ "paddsw 4096(%ecx,%eax,8),%mm0\n"
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%eax\n"
+ "movq 0(%ecx,%eax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n"
+ "movd %mm1,0x0(%ebp)\n"
+
+"2:"
+ "popa\n"
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx)
+{
+ if (mozilla::supports_sse()) {
+ PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+ &kCoefficientsRgbY[0][0]);
+ return;
+ }
+
+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx,
+ const int16 *kCoefficientsRgbY);
+
+ asm(
+ ".text\n"
+#if defined(XP_MACOSX)
+"_PICLinearScaleYUVToRGB32Row_SSE:\n"
+#else
+"PICLinearScaleYUVToRGB32Row_SSE:\n"
+#endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+ "mov 0x3c(%esp),%edi\n"
+ "xor %ebx,%ebx\n"
+
+  // loop bound: source_width = width * source_dx (x in %ebx runs up to it)
+ "mov 0x34(%esp), %ecx\n"
+ "imull 0x38(%esp), %ecx\n"
+ "mov %ecx, 0x34(%esp)\n"
+
+ "mov 0x38(%esp), %ecx\n"
+ "xor %ebx,%ebx\n" // x = 0
+ "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
+ "jl 1f\n"
+ "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
+ "jmp 1f\n"
+
+"0:"
+ "mov 0x28(%esp),%esi\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%esi,%eax,1),%ecx\n"
+ "movzbl 1(%esi,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"
+ "movq 2048(%edi,%ecx,8),%mm0\n"
+
+ "mov 0x2c(%esp),%esi\n"
+ "mov %ebx,%eax\n"
+ "sar $0x11,%eax\n"
+
+ "movzbl (%esi,%eax,1),%ecx\n"
+ "movzbl 1(%esi,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "andl $0x1fffe, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0x1fffe, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $17, %ecx \n"
+ "paddsw 4096(%edi,%ecx,8),%mm0\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "andl $0xffff, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq (%edi,%ecx,8),%mm1\n"
+
+ "cmp 0x34(%esp), %ebx\n"
+ "jge 2f\n"
+
+ "mov %ebx,%eax\n"
+ "sar $0x10,%eax\n"
+ "movzbl (%edx,%eax,1),%ecx\n"
+ "movzbl 1(%edx,%eax,1),%esi\n"
+ "mov %ebx,%eax\n"
+ "add 0x38(%esp),%ebx\n"
+ "andl $0xffff, %eax \n"
+ "imul %eax, %esi \n"
+ "xorl $0xffff, %eax \n"
+ "imul %eax, %ecx \n"
+ "addl %esi, %ecx \n"
+ "shrl $16, %ecx \n"
+ "movq (%edi,%ecx,8),%mm2\n"
+
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movntq %mm1,0x0(%ebp)\n"
+ "add $0x8,%ebp\n"
+
+"1:"
+ "cmp %ebx, 0x34(%esp)\n"
+ "jg 0b\n"
+ "popa\n"
+ "ret\n"
+
+"2:"
+ "paddsw %mm0, %mm1\n"
+ "psraw $6, %mm1\n"
+ "packuswb %mm1, %mm1\n"
+ "movd %mm1, (%ebp)\n"
+ "popa\n"
+ "ret\n"
+#if !defined(XP_MACOSX)
+ ".previous\n"
+#endif
+);
+
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx)
+{
+ if (mozilla::supports_sse()) {
+ PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+ source_dx, &kCoefficientsRgbY[0][0]);
+ return;
+ }
+
+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
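
The linear variant blends each sample with its right-hand neighbor using the fraction bits of x; XOR with the fraction mask yields the complementary weight (frac ^ 0xffff == 0xffff - frac, so the two weights sum to 65535 rather than 65536, a one-count shortcut the original code accepts). A sketch under the same 16.16 layout, with illustrative names; note the real rows special-case the final pixel so the +1 reads stay inside the row:

    static uint8 LerpLuma(const uint8* buf, int x) {
      uint32 frac = x & 0xffff;       // 16-bit fraction
      return (uint8)((frac * buf[(x >> 16) + 1] +
                      (frac ^ 0xffff) * buf[x >> 16]) >> 16);
    }

    static uint8 LerpChroma(const uint8* buf, int x) {
      uint32 frac = x & 0x1fffe;      // 17-bit fraction, low bit dropped
      return (uint8)((frac * buf[(x >> 17) + 1] +
                      (frac ^ 0x1fffe) * buf[x >> 17]) >> 17);
    }
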
+#else
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+#endif
+
+}
diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
new file mode 100644
index 000000000..c531b60c2
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_table.cpp
@@ -0,0 +1,233 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+#define RGBY(i) { \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ 0 \
+}
+
+#define RGBU(i) { \
+ static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+ 0, \
+ static_cast<int16>(256 * 64 - 1) \
+}
+
+#define RGBV(i) { \
+ 0, \
+ static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+ 0 \
+}
+
+SIMD_ALIGNED(const int16 kCoefficientsRgbY[256 * 3][4]) = {
+ RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03),
+ RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07),
+ RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B),
+ RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F),
+ RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13),
+ RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17),
+ RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B),
+ RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F),
+ RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23),
+ RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27),
+ RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B),
+ RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F),
+ RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33),
+ RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37),
+ RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B),
+ RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F),
+ RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43),
+ RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47),
+ RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B),
+ RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F),
+ RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53),
+ RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57),
+ RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B),
+ RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F),
+ RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63),
+ RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67),
+ RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B),
+ RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F),
+ RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73),
+ RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77),
+ RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B),
+ RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F),
+ RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83),
+ RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87),
+ RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B),
+ RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F),
+ RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93),
+ RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97),
+ RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B),
+ RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F),
+ RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3),
+ RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7),
+ RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB),
+ RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF),
+ RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3),
+ RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7),
+ RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB),
+ RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF),
+ RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3),
+ RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7),
+ RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB),
+ RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF),
+ RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3),
+ RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7),
+ RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB),
+ RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF),
+ RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3),
+ RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7),
+ RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB),
+ RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF),
+ RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3),
+ RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7),
+ RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB),
+ RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF),
+
+ // Chroma U table.
+ RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03),
+ RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07),
+ RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B),
+ RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F),
+ RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13),
+ RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17),
+ RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B),
+ RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F),
+ RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23),
+ RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27),
+ RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B),
+ RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F),
+ RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33),
+ RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37),
+ RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B),
+ RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F),
+ RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43),
+ RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47),
+ RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B),
+ RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F),
+ RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53),
+ RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57),
+ RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B),
+ RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F),
+ RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63),
+ RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67),
+ RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B),
+ RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F),
+ RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73),
+ RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77),
+ RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B),
+ RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F),
+ RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83),
+ RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87),
+ RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B),
+ RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F),
+ RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93),
+ RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97),
+ RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B),
+ RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F),
+ RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3),
+ RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7),
+ RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB),
+ RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF),
+ RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3),
+ RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7),
+ RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB),
+ RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF),
+ RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3),
+ RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7),
+ RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB),
+ RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF),
+ RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3),
+ RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7),
+ RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB),
+ RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF),
+ RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3),
+ RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7),
+ RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB),
+ RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF),
+ RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3),
+ RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7),
+ RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB),
+ RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF),
+
+ // Chroma V table.
+ RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03),
+ RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07),
+ RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B),
+ RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F),
+ RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13),
+ RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17),
+ RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B),
+ RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F),
+ RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23),
+ RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27),
+ RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B),
+ RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F),
+ RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33),
+ RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37),
+ RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B),
+ RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F),
+ RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43),
+ RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47),
+ RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B),
+ RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F),
+ RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53),
+ RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57),
+ RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B),
+ RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F),
+ RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63),
+ RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67),
+ RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B),
+ RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F),
+ RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73),
+ RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77),
+ RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B),
+ RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F),
+ RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83),
+ RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87),
+ RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B),
+ RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F),
+ RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93),
+ RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97),
+ RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B),
+ RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F),
+ RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3),
+ RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7),
+ RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB),
+ RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF),
+ RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3),
+ RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7),
+ RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB),
+ RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF),
+ RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3),
+ RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7),
+ RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB),
+ RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF),
+ RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3),
+ RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7),
+ RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB),
+ RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF),
+ RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3),
+ RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7),
+ RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB),
+ RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF),
+ RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3),
+ RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7),
+ RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB),
+ RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF),
+};
+
+#undef RGBY
+#undef RGBU
+#undef RGBV
+
+} // extern "C"
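
The RGBY/RGBU/RGBV macros bake the scalar BT.601 conversion, premultiplied by 64 (six fixed-point bits), into the three stacked tables above: R = 1.164(Y-16) + 1.596(V-128); G = 1.164(Y-16) - 0.391(U-128) - 0.813(V-128); B = 1.164(Y-16) + 2.018(U-128). Only the U table carries a non-zero alpha term (256*64 - 1), which is why the packed sum comes out fully opaque. A hypothetical self-check, not part of the tree, that rebuilds a luma entry:

    #include <assert.h>

    static void CheckLumaEntry(int i) {             // i in [0, 255]
      // The same expression RGBY(i) expands to.
      int16 expected = static_cast<int16>(1.164 * 64 * (i - 16) + 0.5);
      assert(kCoefficientsRgbY[i][0] == expected);  // blue term
      assert(kCoefficientsRgbY[i][1] == expected);  // green term
      assert(kCoefficientsRgbY[i][2] == expected);  // red term
      assert(kCoefficientsRgbY[i][3] == 0);         // alpha lives in RGBU
    }
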
diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
new file mode 100644
index 000000000..5cd931139
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -0,0 +1,498 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+#define kCoefficientsRgbU kCoefficientsRgbY + 2048
+#define kCoefficientsRgbV kCoefficientsRgbY + 4096
+
+extern "C" {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+__declspec(naked)
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ pushad
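+    // pushad saves eight 32-bit registers (32 bytes), so the first
+    // C argument lives at [esp + 32 + 4].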
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp convertend
+
+ convertloop :
+ movzx eax, byte ptr [edi]
+ add edi, 1
+ movzx ebx, byte ptr [esi]
+ add esi, 1
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [edx]
+ paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
+ movzx ebx, byte ptr [edx + 1]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ add edx, 2
+ movq mm2, [kCoefficientsRgbY + 8 * ebx]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ movntq [ebp], mm1
+ add ebp, 8
+ convertend :
+ sub ecx, 2
+ jns convertloop
+
+ and ecx, 1 // odd number of pixels?
+ jz convertdone
+
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ convertdone :
+
+ popad
+ ret
+ }
+}
+
+__declspec(naked)
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int step) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ mov ebx, [esp + 32 + 24] // step
+ jmp wend
+
+ wloop :
+ movzx eax, byte ptr [edi]
+ add edi, ebx
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ add esi, ebx
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ add edx, ebx
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ movzx eax, byte ptr [edx]
+ add edx, ebx
+ movq mm2, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ movntq [ebp], mm1
+ add ebp, 8
+ wend :
+ sub ecx, 2
+ jns wloop
+
+ and ecx, 1 // odd number of pixels?
+ jz wdone
+
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ wdone :
+
+ popad
+ ret
+ }
+}
+
+__declspec(naked)
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int ystep,
+ int uvstep) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+
+ wloop :
+ movzx eax, byte ptr [edi]
+ mov ebx, [esp + 32 + 28] // uvstep
+ add edi, ebx
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ add esi, ebx
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ mov ebx, [esp + 32 + 24] // ystep
+ add edx, ebx
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ movzx eax, byte ptr [edx]
+ add edx, ebx
+ movq mm2, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ movntq [ebp], mm1
+ add ebp, 8
+ wend :
+ sub ecx, 2
+ jns wloop
+
+ and ecx, 1 // odd number of pixels?
+ jz wdone
+
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ wdone :
+
+ popad
+ ret
+ }
+}
+
+__declspec(naked)
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ jmp wend
+
+ wloop :
+ movzx eax, byte ptr [edi]
+ add edi, 1
+ movzx ebx, byte ptr [esi]
+ add esi, 1
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [edx]
+ paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ punpckldq mm1, mm1
+ movntq [ebp], mm1
+
+ movzx ebx, byte ptr [edx + 1]
+ add edx, 2
+ paddsw mm0, [kCoefficientsRgbY + 8 * ebx]
+ psraw mm0, 6
+ packuswb mm0, mm0
+ punpckldq mm0, mm0
+ movntq [ebp+8], mm0
+ add ebp, 16
+ wend :
+ sub ecx, 4
+ jns wloop
+
+ add ecx, 4
+ jz wdone
+
+ movzx eax, byte ptr [edi]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ movzx eax, byte ptr [esi]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ movzx eax, byte ptr [edx]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ jmp wend1
+
+ wloop1 :
+ movd [ebp], mm1
+ add ebp, 4
+ wend1 :
+ sub ecx, 1
+ jns wloop1
+ wdone :
+ popad
+ ret
+ }
+}
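
DoubleYUVToRGB32Row_SSE emits each converted pixel twice, doubling the row horizontally: punpckldq mm1, mm1 replicates the 32-bit ARGB value into both halves of the MMX register so a single movntq stores the pair. Per source sample, in sketch form (YuvPixel being the hypothetical helper from the earlier sketch):

    uint32 p = YuvPixel(y, u, v);
    *argb_buf++ = p;  // punpckldq duplicates the pixel in-register,
    *argb_buf++ = p;  // so one 8-byte movntq writes both copies
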
+
+// This version does general purpose scaling by any amount, up or down.
+// The only thing it cannot do is rotation by 90 or 270.
+// For performance the chroma is under-sampled, reducing cost of a 3x
+// 1080p scale from 8.4 ms to 5.4 ms.
+__declspec(naked)
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ mov esi, [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ xor ebx, ebx // x
+ jmp scaleend
+
+ scaleloop :
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [edi + eax]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [esi + eax]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24] // x += source_dx
+ sar eax, 16
+ movzx eax, byte ptr [edx + eax]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24] // x += source_dx
+ sar eax, 16
+ movzx eax, byte ptr [edx + eax]
+ movq mm2, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ movntq [ebp], mm1
+ add ebp, 8
+ scaleend :
+ sub ecx, 2
+ jns scaleloop
+
+ and ecx, 1 // odd number of pixels?
+ jz scaledone
+
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [edi + eax]
+ movq mm0, [kCoefficientsRgbU + 8 * eax]
+ mov eax, ebx
+ sar eax, 17
+ movzx eax, byte ptr [esi + eax]
+ paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+ mov eax, ebx
+ sar eax, 16
+ movzx eax, byte ptr [edx + eax]
+ movq mm1, [kCoefficientsRgbY + 8 * eax]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+
+ scaledone :
+ popad
+ ret
+ }
+}
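
The chroma under-sampling mentioned in the comment before this function is visible in the pair loop: one U/V table lookup (mm0) serves two output pixels, halving the chroma traffic. In sketch form, again using the hypothetical YuvPixel helper (the SSE2 port in yuv_row_win64.cpp later in this patch spells the same structure out in intrinsics):

    while (width >= 2) {
      uint8 u = u_buf[x >> 17];          // one chroma fetch per pair
      uint8 v = v_buf[x >> 17];
      uint8 y0 = y_buf[x >> 16]; x += source_dx;
      uint8 y1 = y_buf[x >> 16]; x += source_dx;
      *argb_buf++ = YuvPixel(y0, u, v);
      *argb_buf++ = YuvPixel(y1, u, v);
      width -= 2;
    }
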
+
+__declspec(naked)
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ __asm {
+ pushad
+ mov edx, [esp + 32 + 4] // Y
+ mov edi, [esp + 32 + 8] // U
+ // [esp + 32 + 12] // V
+ mov ebp, [esp + 32 + 16] // rgb
+ mov ecx, [esp + 32 + 20] // width
+ imul ecx, [esp + 32 + 24] // source_dx
+ mov [esp + 32 + 20], ecx // source_width = width * source_dx
+ mov ecx, [esp + 32 + 24] // source_dx
+ xor ebx, ebx // x = 0
+ cmp ecx, 0x20000
+ jl lscaleend
+    mov ebx, 0x8000            // start x at 0.5 when downscaling to half size or smaller
+ jmp lscaleend
+lscaleloop:
+ mov eax, ebx
+ sar eax, 0x11
+
+ movzx ecx, byte ptr [edi + eax]
+ movzx esi, byte ptr [edi + eax + 1]
+ mov eax, ebx
+ and eax, 0x1fffe
+ imul esi, eax
+ xor eax, 0x1fffe
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 17
+ movq mm0, [kCoefficientsRgbU + 8 * ecx]
+
+ mov esi, [esp + 32 + 12]
+ mov eax, ebx
+ sar eax, 0x11
+
+ movzx ecx, byte ptr [esi + eax]
+ movzx esi, byte ptr [esi + eax + 1]
+ mov eax, ebx
+ and eax, 0x1fffe
+ imul esi, eax
+ xor eax, 0x1fffe
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 17
+ paddsw mm0, [kCoefficientsRgbV + 8 * ecx]
+
+ mov eax, ebx
+ sar eax, 0x10
+ movzx ecx, byte ptr [edx + eax]
+ movzx esi, byte ptr [1 + edx + eax]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24]
+ and eax, 0xffff
+ imul esi, eax
+ xor eax, 0xffff
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 16
+ movq mm1, [kCoefficientsRgbY + 8 * ecx]
+
+ cmp ebx, [esp + 32 + 20]
+ jge lscalelastpixel
+
+ mov eax, ebx
+ sar eax, 0x10
+ movzx ecx, byte ptr [edx + eax]
+ movzx esi, byte ptr [edx + eax + 1]
+ mov eax, ebx
+ add ebx, [esp + 32 + 24]
+ and eax, 0xffff
+ imul esi, eax
+ xor eax, 0xffff
+ imul ecx, eax
+ add ecx, esi
+ shr ecx, 16
+ movq mm2, [kCoefficientsRgbY + 8 * ecx]
+
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 0x6
+ psraw mm2, 0x6
+ packuswb mm1, mm2
+ movntq [ebp], mm1
+ add ebp, 0x8
+
+lscaleend:
+ cmp ebx, [esp + 32 + 20]
+ jl lscaleloop
+ popad
+ ret
+
+lscalelastpixel:
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ebp], mm1
+ popad
+ ret
+ };
+}
+#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+ if (mozilla::supports_sse()) {
+ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+ return;
+ }
+#endif
+
+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+ if (mozilla::supports_sse()) {
+ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+ return;
+ }
+#endif
+
+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+ if (mozilla::supports_sse()) {
+ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+ source_dx);
+ return;
+ }
+#endif
+
+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+} // extern "C"
diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
new file mode 100644
index 000000000..6a34f840a
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win64.cpp
@@ -0,0 +1,205 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+// The x64 MSVC compiler supports neither MMX nor inline assembly, so these
+// rows use SSE2 intrinsics instead.
+
+#define kCoefficientsRgbU (reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 2048)
+#define kCoefficientsRgbV (reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 4096)
+
+#include <emmintrin.h>
+
+static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __m128i xmm0, xmmY1, xmmY2;
+ __m128 xmmY;
+
+ while (width >= 2) {
+ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
+
+ xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
+ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+ xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
+ xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
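+    // 0x44 == _MM_SHUFFLE(1, 0, 1, 0): the float shuffle concatenates the
+    // low 64 bits of xmmY1 and xmmY2, pairing both pixels in one register
+    // with plain SSE2 (no integer lane shuffle needed).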
+ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+ 0x44);
+ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+ rgb_buf += 8;
+ width -= 2;
+ }
+
+ if (width) {
+ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
+ xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
+ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+ xmmY1 = _mm_srai_epi16(xmmY1, 6);
+ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+ }
+}
+
+static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ __m128i xmm0, xmmY1, xmmY2;
+ __m128 xmmY;
+ uint8 u, v, y;
+ int x = 0;
+
+ while (width >= 2) {
+ u = u_buf[x >> 17];
+ v = v_buf[x >> 17];
+ y = y_buf[x >> 16];
+ x += source_dx;
+
+ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
+ xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y));
+ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+ y = y_buf[x >> 16];
+ x += source_dx;
+
+ xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y));
+ xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+ 0x44);
+ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+ rgb_buf += 8;
+ width -= 2;
+ }
+
+ if (width) {
+ u = u_buf[x >> 17];
+ v = v_buf[x >> 17];
+ y = y_buf[x >> 16];
+
+ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
+ xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y));
+ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+ xmmY1 = _mm_srai_epi16(xmmY1, 6);
+ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+ }
+}
+
+static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ __m128i xmm0, xmmY1, xmmY2;
+ __m128 xmmY;
+ uint8 u0, u1, v0, v1, y0, y1;
+ uint32 uv_frac, y_frac, u, v, y;
+ int x = 0;
+
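+  // 32768 is 0.5 in 16.16 fixed point: when downscaling to half size or
+  // smaller, start half a source pixel in, as the assembly rows in
+  // yuv_row_posix.cpp and yuv_row_win.cpp do.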
+ if (source_dx >= 0x20000) {
+ x = 32768;
+ }
+
+  while (width >= 2) {
+ u0 = u_buf[x >> 17];
+ u1 = u_buf[(x >> 17) + 1];
+ v0 = v_buf[x >> 17];
+ v1 = v_buf[(x >> 17) + 1];
+ y0 = y_buf[x >> 16];
+ y1 = y_buf[(x >> 16) + 1];
+ uv_frac = (x & 0x1fffe);
+ y_frac = (x & 0xffff);
+ u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
+ v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
+ y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
+ x += source_dx;
+
+ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
+ xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y));
+ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+ y0 = y_buf[x >> 16];
+ y1 = y_buf[(x >> 16) + 1];
+ y_frac = (x & 0xffff);
+ y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
+ x += source_dx;
+
+ xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y));
+ xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+ 0x44);
+ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+ rgb_buf += 8;
+ width -= 2;
+ }
+
+ if (width) {
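+    // The odd tail pixel is sampled nearest-neighbor (no interpolation).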
+ u = u_buf[x >> 17];
+ v = v_buf[x >> 17];
+ y = y_buf[x >> 16];
+
+ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
+ xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y));
+
+ xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+ xmmY1 = _mm_srai_epi16(xmmY1, 6);
+ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+ }
+}
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
+ source_dx);
+}
+
+} // extern "C"