author     Matt A. Tobin <mattatobin@localhost.localdomain>    2018-02-02 04:16:08 -0500
committer  Matt A. Tobin <mattatobin@localhost.localdomain>    2018-02-02 04:16:08 -0500
commit     5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree       10027f336435511475e392454359edea8e25895d /gfx/ycbcr
parent     49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
Add m-esr52 at 52.6.0
Diffstat (limited to 'gfx/ycbcr')
28 files changed, 9140 insertions, 0 deletions
diff --git a/gfx/ycbcr/LICENSE b/gfx/ycbcr/LICENSE
new file mode 100644
index 000000000..8dc35041d
--- /dev/null
+++ b/gfx/ycbcr/LICENSE
@@ -0,0 +1,27 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/gfx/ycbcr/QuellGccWarnings.patch b/gfx/ycbcr/QuellGccWarnings.patch
new file mode 100644
index 000000000..d580ac981
--- /dev/null
+++ b/gfx/ycbcr/QuellGccWarnings.patch
@@ -0,0 +1,40 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -337,16 +337,17 @@ void ScaleYCbCrToRGB32(const uint* yplan
+                         source_dx_uv >> kFractionBits);
+     }
+   }
+   else {
+     ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                          dest_pixel, width, source_dx);
+   }
+ #else
++  (void)source_dx_uv;
+   ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                      dest_pixel, width, source_dx);
+ #endif
+   }
+ }
+   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
+   if (has_mmx)
+     EMMS();
+diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
+--- a/gfx/ycbcr/yuv_row.h
++++ b/gfx/ycbcr/yuv_row.h
+@@ -129,14 +129,14 @@ extern SIMD_ALIGNED(int16 kCoefficientsR
+ #if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
+ #if defined(_MSC_VER)
+ #define EMMS() __asm emms
+ #pragma warning(disable: 4799)
+ #else
+ #define EMMS() asm("emms")
+ #endif
+ #else
+-#define EMMS()
++#define EMMS() ((void)0)
+ #endif
+ 
+ } // extern "C"
+ 
+ #endif // MEDIA_BASE_YUV_ROW_H_
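A note on the EMMS() change above: a function-like macro that expands to nothing leaves a bare semicolon at each call site, so a guarded call such as "if (has_mmx) EMMS();" becomes an if with an empty body, which GCC reports under -Wempty-body. Expanding to ((void)0) keeps the call a well-formed no-op statement. A minimal sketch of the pitfall (hypothetical names, not part of the patch):

    #define EMMS_EMPTY()            // old definition: expands to nothing
    #define EMMS_NOOP()  ((void)0)  // patched definition: a genuine no-op statement

    void Flush(bool has_mmx) {
      if (has_mmx)
        EMMS_EMPTY();  // expands to "if (has_mmx) ;" and GCC warns about the empty body
      if (has_mmx)
        EMMS_NOOP();   // still does nothing, but is a complete statement, so no warning
    }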
diff --git a/gfx/ycbcr/README b/gfx/ycbcr/README
new file mode 100644
index 000000000..a951bc83a
--- /dev/null
+++ b/gfx/ycbcr/README
@@ -0,0 +1,29 @@
+This color conversion code is from the Chromium open source project available here:
+
+http://code.google.com/chromium/
+
+The code comes from svn revision 63840 on 2010-10-26.
+
+If you just want to check out this individual directory, use:
+
+svn co -r 63840 http://src.chromium.org/svn/trunk/src/media/base
+
+The code was copied from a Chromium svn checkout using the 'update.sh' script which then applies patches for our build and to add dynamic CPU detection.
+
+convert.patch contains the following changes:
+
+  * Change Chromium code to build using Mozilla build system.
+  * Add runtime CPU detection for MMX
+  * Move default C implementation to work on all platforms.
+  * Change Chromium code to allow a picture region.
+  * The YUV conversion will convert within this picture region only.
+  * Add YCbCr 4:4:4 support
+  * Bug 619178 - Update CPU detection in yuv_convert to new SSE.h interface.
+  * Bug 616778 - Split yuv_convert FilterRows vectorized code into separate files so it can
+    be properly guarded with cpuid() calls.
+
+win64.patch: SSE2 optimization for Microsoft Visual C++ x64 version
+
+TypeFromSize.patch: Bug 656185 - Add a method to detect YUVType from plane sizes.
+
+QuellGccWarnings.patch: Bug 711895 - Avoid some GCC compilation warnings.
diff --git a/gfx/ycbcr/TypeFromSize.patch b/gfx/ycbcr/TypeFromSize.patch
new file mode 100644
index 000000000..d08a19690
--- /dev/null
+++ b/gfx/ycbcr/TypeFromSize.patch
@@ -0,0 +1,58 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -26,16 +26,32 @@ namespace mozilla {
+ 
+ namespace gfx {
+ 
+ // 16.16 fixed point arithmetic
+ const int kFractionBits = 16;
+ const int kFractionMax = 1 << kFractionBits;
+ const int kFractionMask = ((1 << kFractionBits) - 1);
+ 
++YUVType TypeFromSize(int ywidth,
++                     int yheight,
++                     int cbcrwidth,
++                     int cbcrheight)
++{
++  if (ywidth == cbcrwidth && yheight == cbcrheight) {
++    return YV24;
++  }
++  else if (ywidth / 2 == cbcrwidth && yheight == cbcrheight) {
++    return YV16;
++  }
++  else {
++    return YV12;
++  }
++}
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ void ConvertYCbCrToRGB32(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int pic_x,
+                          int pic_y,
+                          int pic_width,
+diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
+--- a/gfx/ycbcr/yuv_convert.h
++++ b/gfx/ycbcr/yuv_convert.h
+@@ -36,16 +36,18 @@ enum Rotate {
+ // Filter affects how scaling looks.
+ enum ScaleFilter {
+   FILTER_NONE = 0,        // No filter (point sampled).
+   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
+   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
+   FILTER_BILINEAR = 3     // Bilinear filter.
+ };
+ 
++YUVType TypeFromSize(int ywidth, int yheight, int cbcrwidth, int cbcrheight);
++
+ // Convert a frame of YUV to 32 bit ARGB.
+ // Pass in YV16/YV12 depending on source format
+ void ConvertYCbCrToRGB32(const uint8* yplane,
+                          const uint8* uplane,
+                          const uint8* vplane,
+                          uint8* rgbframe,
+                          int pic_x,
+                          int pic_y,
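The mapping that TypeFromSize adds above falls out of the plane geometry: chroma planes matching the luma plane mean 4:4:4 (YV24), half-width with full height means 4:2:2 (YV16), and anything else is treated as 4:2:0 (YV12). A quick sketch of the behaviour, with plane sizes chosen purely for illustration:

    using namespace mozilla::gfx;
    // For a 640x480 luma plane:
    YUVType a = TypeFromSize(640, 480, 640, 480); // YV24: chroma matches luma
    YUVType b = TypeFromSize(640, 480, 320, 480); // YV16: half-width chroma
    YUVType c = TypeFromSize(640, 480, 320, 240); // YV12: the fallback for everything else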
diff --git a/gfx/ycbcr/YCbCrUtils.cpp b/gfx/ycbcr/YCbCrUtils.cpp
new file mode 100644
index 000000000..882197857
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.cpp
@@ -0,0 +1,157 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "gfx2DGlue.h"
+
+#include "YCbCrUtils.h"
+#include "yuv_convert.h"
+#include "ycbcr_to_rgb565.h"
+
+namespace mozilla {
+namespace gfx {
+
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+                               SurfaceFormat& aSuggestedFormat,
+                               IntSize& aSuggestedSize)
+{
+  YUVType yuvtype =
+    TypeFromSize(aData.mYSize.width,
+                 aData.mYSize.height,
+                 aData.mCbCrSize.width,
+                 aData.mCbCrSize.height);
+
+  // 'prescale' is true if the scaling is to be done as part of the
+  // YCbCr to RGB conversion rather than on the RGB data when rendered.
+  bool prescale = aSuggestedSize.width > 0 && aSuggestedSize.height > 0 &&
+                  aSuggestedSize != aData.mPicSize;
+
+  if (aSuggestedFormat == SurfaceFormat::R5G6B5_UINT16) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+    if (prescale &&
+        !IsScaleYCbCrToRGB565Fast(aData.mPicX,
+                                  aData.mPicY,
+                                  aData.mPicSize.width,
+                                  aData.mPicSize.height,
+                                  aSuggestedSize.width,
+                                  aSuggestedSize.height,
+                                  yuvtype,
+                                  FILTER_BILINEAR) &&
+        IsConvertYCbCrToRGB565Fast(aData.mPicX,
+                                   aData.mPicY,
+                                   aData.mPicSize.width,
+                                   aData.mPicSize.height,
+                                   yuvtype)) {
+      prescale = false;
+    }
+#else
+    // yuv2rgb16 function not available
+    aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+#endif
+  }
+  else if (aSuggestedFormat != SurfaceFormat::B8G8R8X8) {
+    // No other formats are currently supported.
+    aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+  }
+  if (aSuggestedFormat == SurfaceFormat::B8G8R8X8) {
+    /* ScaleYCbCrToRGB32 does not support a picture offset, nor 4:4:4 data.
+       See bugs 639415 and 640073. */
+    if (aData.mPicX != 0 || aData.mPicY != 0 || yuvtype == YV24)
+      prescale = false;
+  }
+  if (!prescale) {
+    aSuggestedSize = aData.mPicSize;
+  }
+}
+
+void
+ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+                  const SurfaceFormat& aDestFormat,
+                  const IntSize& aDestSize,
+                  unsigned char* aDestBuffer,
+                  int32_t aStride)
+{
+  // ConvertYCbCrToRGB et al. assume the chroma planes are rounded up if the
+  // luma plane is odd sized.
+  MOZ_ASSERT((aData.mCbCrSize.width == aData.mYSize.width ||
+              aData.mCbCrSize.width == (aData.mYSize.width + 1) >> 1) &&
+             (aData.mCbCrSize.height == aData.mYSize.height ||
+              aData.mCbCrSize.height == (aData.mYSize.height + 1) >> 1));
+  YUVType yuvtype =
+    TypeFromSize(aData.mYSize.width,
+                 aData.mYSize.height,
+                 aData.mCbCrSize.width,
+                 aData.mCbCrSize.height);
+
+  // Convert from YCbCr to RGB now, scaling the image if needed.
+  if (aDestSize != aData.mPicSize) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+    if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+      ScaleYCbCrToRGB565(aData.mYChannel,
+                         aData.mCbChannel,
+                         aData.mCrChannel,
+                         aDestBuffer,
+                         aData.mPicX,
+                         aData.mPicY,
+                         aData.mPicSize.width,
+                         aData.mPicSize.height,
+                         aDestSize.width,
+                         aDestSize.height,
+                         aData.mYStride,
+                         aData.mCbCrStride,
+                         aStride,
+                         yuvtype,
+                         FILTER_BILINEAR);
+    } else
+#endif
+      ScaleYCbCrToRGB32(aData.mYChannel, //
+                        aData.mCbChannel,
+                        aData.mCrChannel,
+                        aDestBuffer,
+                        aData.mPicSize.width,
+                        aData.mPicSize.height,
+                        aDestSize.width,
+                        aDestSize.height,
+                        aData.mYStride,
+                        aData.mCbCrStride,
+                        aStride,
+                        yuvtype,
+                        aData.mYUVColorSpace,
+                        FILTER_BILINEAR);
+  } else { // no prescale
+#if defined(HAVE_YCBCR_TO_RGB565)
+    if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+      ConvertYCbCrToRGB565(aData.mYChannel,
+                           aData.mCbChannel,
+                           aData.mCrChannel,
+                           aDestBuffer,
+                           aData.mPicX,
+                           aData.mPicY,
+                           aData.mPicSize.width,
+                           aData.mPicSize.height,
+                           aData.mYStride,
+                           aData.mCbCrStride,
+                           aStride,
+                           yuvtype);
+    } else // aDestFormat != SurfaceFormat::R5G6B5_UINT16
+#endif
+      ConvertYCbCrToRGB32(aData.mYChannel, //
+                          aData.mCbChannel,
+                          aData.mCrChannel,
+                          aDestBuffer,
+                          aData.mPicX,
+                          aData.mPicY,
+                          aData.mPicSize.width,
+                          aData.mPicSize.height,
+                          aData.mYStride,
+                          aData.mCbCrStride,
+                          aStride,
+                          yuvtype,
+                          aData.mYUVColorSpace);
+  }
+}
+
+} // namespace gfx
+} // namespace mozilla
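A note on how the two functions above fit together: GetYCbCrToRGBDestFormatAndSize negotiates its in/out parameters in place. It downgrades aSuggestedFormat to B8G8R8X8 whenever the RGB565 fast paths are unavailable or not worthwhile, and collapses aSuggestedSize back to the picture size whenever prescaling is rejected (picture offsets, 4:4:4 data, or no actual scaling requested); ConvertYCbCrToRGB then dispatches purely on those negotiated values. A sketch of a typical caller (hypothetical names, not code from this commit):

    // Ask what the converter can actually produce for this frame.
    SurfaceFormat format = SurfaceFormat::B8G8R8X8;  // caller's preferred format
    IntSize size = desiredSize;                      // caller's preferred output size
    GetYCbCrToRGBDestFormatAndSize(data, format, size);
    // If prescaling was rejected, 'size' now equals data.mPicSize and any
    // remaining scaling has to happen later, when the RGB surface is drawn.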
diff --git a/gfx/ycbcr/YCbCrUtils.h b/gfx/ycbcr/YCbCrUtils.h
new file mode 100644
index 000000000..1cd2e1c4f
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.h
@@ -0,0 +1,30 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_GFX_UTILS_H_
+#define MOZILLA_GFX_UTILS_H_
+
+#include "mozilla/gfx/Types.h"
+#include "ImageContainer.h"
+
+namespace mozilla {
+namespace gfx {
+
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+                               SurfaceFormat& aSuggestedFormat,
+                               IntSize& aSuggestedSize);
+
+void
+ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+                  const SurfaceFormat& aDestFormat,
+                  const IntSize& aDestSize,
+                  unsigned char* aDestBuffer,
+                  int32_t aStride);
+
+} // namespace gfx
+} // namespace mozilla
+
+#endif /* MOZILLA_GFX_UTILS_H_ */
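The header above is the whole public surface of this directory: negotiate a format and size, allocate a destination, convert. A minimal end-to-end sketch, with the allocation simplified for illustration (the raw new[] buffer, BytesPerPixel, and UniquePtr usage here are assumptions; real callers typically draw into a DataSourceSurface):

    SurfaceFormat format = SurfaceFormat::B8G8R8X8;
    IntSize size = data.mPicSize;
    GetYCbCrToRGBDestFormatAndSize(data, format, size);
    int32_t stride = size.width * BytesPerPixel(format);  // 4 bytes per pixel for B8G8R8X8
    UniquePtr<uint8_t[]> buf(new uint8_t[stride * size.height]);
    ConvertYCbCrToRGB(data, format, size, buf.get(), stride);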
diff --git a/gfx/ycbcr/chromium_types.h b/gfx/ycbcr/chromium_types.h
new file mode 100644
index 000000000..dceac4766
--- /dev/null
+++ b/gfx/ycbcr/chromium_types.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef GFX_CHROMIUMTYPES_H
+#define GFX_CHROMIUMTYPES_H
+
+#include <stdint.h>
+
+#include "libyuv/basic_types.h"
+
+// From Chromium build_config.h:
+// Processor architecture detection.  For more info on what's defined, see:
+//   http://msdn.microsoft.com/en-us/library/b0084kay.aspx
+//   http://www.agner.org/optimize/calling_conventions.pdf
+//   or with gcc, run: "echo | gcc -E -dM -"
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_64 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_32 1
+#define ARCH_CPU_X86 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARMEL 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ppc__) || defined(__powerpc) || defined(__PPC__)
+#define ARCH_CPU_PPC_FAMILY 1
+#define ARCH_CPU_PPC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__sparc)
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__sparcv9)
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(__aarch64__)
+#define ARCH_CPU_AARCH64_FAMILY 1
+#define ARCH_CPU_AARCH64 1
+#define ARCH_CPU_64_BITS 1
+#else
+#warning Please add support for your architecture in chromium_types.h
+#endif
+
+#endif // GFX_CHROMIUMTYPES_H
diff --git a/gfx/ycbcr/convert.patch b/gfx/ycbcr/convert.patch
new file mode 100644
index 000000000..e39f923b3
--- /dev/null
+++ b/gfx/ycbcr/convert.patch
@@ -0,0 +1,3143 @@
+diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
+--- a/gfx/ycbcr/yuv_convert.cpp
++++ b/gfx/ycbcr/yuv_convert.cpp
+@@ -6,145 +6,102 @@
+ // http://www.fourcc.org/yuv.php
+ // The actual conversion is best described here
+ // http://en.wikipedia.org/wiki/YUV
+ // An article on optimizing YUV conversion using tables instead of multiplies
+ //   http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
+ //
+ // YV12 is a full plane of Y and a half height, half width chroma planes
+ // YV16 is a full plane of Y and a full height, half width chroma planes
++// YV24 is a full plane of Y and a full height, full width chroma planes
+ //
+ // ARGB pixel format is output, which on little endian is stored as BGRA.
+ // The alpha is set to 255, allowing the application to use RGBA or RGB32.
+ 
+-#include "media/base/yuv_convert.h"
++#include "yuv_convert.h"
+ 
+ // Header for low level row functions.
+-#include "media/base/yuv_row.h"
+-
+-#if USE_MMX
+-#if defined(_MSC_VER)
+-#include <intrin.h>
+-#else
+-#include <mmintrin.h>
+-#endif
+-#endif
+-
+-#if USE_SSE2
+-#include <emmintrin.h>
+-#endif
+-
+-namespace media {
+-
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
++
++namespace mozilla {
++
++namespace gfx {
++
+ // 16.16 fixed point arithmetic
+ const int kFractionBits = 16;
+ const int kFractionMax = 1 << kFractionBits;
+ const int kFractionMask = ((1 << kFractionBits) - 1);
+ 
+ // Convert a frame of YUV to 32 bit ARGB.
+-void ConvertYUVToRGB32(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int height, +- int y_pitch, +- int uv_pitch, +- int rgb_pitch, +- YUVType yuv_type) { +- unsigned int y_shift = yuv_type; +- for (int y = 0; y < height; ++y) { +- uint8* rgb_row = rgb_buf + y * rgb_pitch; +- const uint8* y_ptr = y_buf + y * y_pitch; +- const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch; +- const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch; +- +- FastConvertYUVToRGB32Row(y_ptr, +- u_ptr, +- v_ptr, +- rgb_row, +- width); +- } ++void ConvertYCbCrToRGB32(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int pic_x, ++ int pic_y, ++ int pic_width, ++ int pic_height, ++ int y_pitch, ++ int uv_pitch, ++ int rgb_pitch, ++ YUVType yuv_type) { ++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0; ++ unsigned int x_shift = yuv_type == YV24 ? 0 : 1; ++ // Test for SSE because the optimized code uses movntq, which is not part of MMX. ++ bool has_sse = supports_mmx() && supports_sse(); ++ // There is no optimized YV24 SSE routine so we check for this and ++ // fall back to the C code. ++ has_sse &= yuv_type != YV24; ++ bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0; ++ int x_width = odd_pic_x ? pic_width - 1 : pic_width; ++ ++ for (int y = pic_y; y < pic_height + pic_y; ++y) { ++ uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch; ++ const uint8* y_ptr = y_buf + y * y_pitch + pic_x; ++ const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); ++ const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); ++ ++ if (odd_pic_x) { ++ // Handle the single odd pixel manually and use the ++ // fast routines for the remaining. ++ FastConvertYUVToRGB32Row_C(y_ptr++, ++ u_ptr++, ++ v_ptr++, ++ rgb_row, ++ 1, ++ x_shift); ++ rgb_row += 4; ++ } ++ ++ if (has_sse) { ++ FastConvertYUVToRGB32Row(y_ptr, ++ u_ptr, ++ v_ptr, ++ rgb_row, ++ x_width); ++ } ++ else { ++ FastConvertYUVToRGB32Row_C(y_ptr, ++ u_ptr, ++ v_ptr, ++ rgb_row, ++ x_width, ++ x_shift); ++ } ++ } + + // MMX used for FastConvertYUVToRGB32Row requires emms instruction. +- EMMS(); +-} +- +-#if USE_SSE2 +-// FilterRows combines two rows of the image using linear interpolation. 
+-// SSE2 version does 16 pixels at a time +- +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, +- int source_width, int source_y_fraction) { +- __m128i zero = _mm_setzero_si128(); +- __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); +- __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); +- +- const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); +- const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); +- __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); +- __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); +- +- do { +- __m128i y0 = _mm_loadu_si128(y0_ptr128); +- __m128i y1 = _mm_loadu_si128(y1_ptr128); +- __m128i y2 = _mm_unpackhi_epi8(y0, zero); +- __m128i y3 = _mm_unpackhi_epi8(y1, zero); +- y0 = _mm_unpacklo_epi8(y0, zero); +- y1 = _mm_unpacklo_epi8(y1, zero); +- y0 = _mm_mullo_epi16(y0, y0_fraction); +- y1 = _mm_mullo_epi16(y1, y1_fraction); +- y2 = _mm_mullo_epi16(y2, y0_fraction); +- y3 = _mm_mullo_epi16(y3, y1_fraction); +- y0 = _mm_add_epi16(y0, y1); +- y2 = _mm_add_epi16(y2, y3); +- y0 = _mm_srli_epi16(y0, 8); +- y2 = _mm_srli_epi16(y2, 8); +- y0 = _mm_packus_epi16(y0, y2); +- *dest128++ = y0; +- ++y0_ptr128; +- ++y1_ptr128; +- } while (dest128 < end128); +-} +-#elif USE_MMX +-// MMX version does 8 pixels at a time +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, +- int source_width, int source_y_fraction) { +- __m64 zero = _mm_setzero_si64(); +- __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); +- __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); +- +- const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); +- const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); +- __m64* dest64 = reinterpret_cast<__m64*>(ybuf); +- __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); +- +- do { +- __m64 y0 = *y0_ptr64++; +- __m64 y1 = *y1_ptr64++; +- __m64 y2 = _mm_unpackhi_pi8(y0, zero); +- __m64 y3 = _mm_unpackhi_pi8(y1, zero); +- y0 = _mm_unpacklo_pi8(y0, zero); +- y1 = _mm_unpacklo_pi8(y1, zero); +- y0 = _mm_mullo_pi16(y0, y0_fraction); +- y1 = _mm_mullo_pi16(y1, y1_fraction); +- y2 = _mm_mullo_pi16(y2, y0_fraction); +- y3 = _mm_mullo_pi16(y3, y1_fraction); +- y0 = _mm_add_pi16(y0, y1); +- y2 = _mm_add_pi16(y2, y3); +- y0 = _mm_srli_pi16(y0, 8); +- y2 = _mm_srli_pi16(y2, 8); +- y0 = _mm_packs_pu16(y0, y2); +- *dest64++ = y0; +- } while (dest64 < end64); +-} +-#else // no MMX or SSE2 ++ if (has_sse) ++ EMMS(); ++} ++ + // C version does 8 at a time to mimic MMX code +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, +- int source_width, int source_y_fraction) { ++static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + uint8* end = ybuf + source_width; + do { + ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; + ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; + ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; + ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; +@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons + ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; + ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; + ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; + } 
while (ybuf < end); + } +-#endif ++ ++#ifdef MOZILLA_MAY_SUPPORT_MMX ++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction); ++#endif ++ ++#ifdef MOZILLA_MAY_SUPPORT_SSE2 ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction); ++#endif ++ ++static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr, ++ const uint8* y1_ptr, int source_width, ++ int source_y_fraction) { ++#ifdef MOZILLA_MAY_SUPPORT_SSE2 ++ if (mozilla::supports_sse2()) { ++ FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); ++ return; ++ } ++#endif ++ ++#ifdef MOZILLA_MAY_SUPPORT_MMX ++ if (mozilla::supports_mmx()) { ++ FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); ++ return; ++ } ++#endif ++ ++ FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); ++} + + + // Scale a frame of YUV to 32 bit ARGB. +-void ScaleYUVToRGB32(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int source_width, +- int source_height, +- int width, +- int height, +- int y_pitch, +- int uv_pitch, +- int rgb_pitch, +- YUVType yuv_type, +- Rotate view_rotate, +- ScaleFilter filter) { ++void ScaleYCbCrToRGB32(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int source_width, ++ int source_height, ++ int width, ++ int height, ++ int y_pitch, ++ int uv_pitch, ++ int rgb_pitch, ++ YUVType yuv_type, ++ Rotate view_rotate, ++ ScaleFilter filter) { ++ bool has_mmx = supports_mmx(); ++ + // 4096 allows 3 buffers to fit in 12k. + // Helps performance on CPU with 16K L1 cache. + // Large enough for 3830x2160 and 30" displays which are 2560x1600. + const int kFilterBufferSize = 4096; + // Disable filtering if the screen is too big (to avoid buffer overflows). + // This should never happen to regular users: they don't have monitors + // wider than 4096 pixels. + // TODO(fbarchard): Allow rotated videos to filter. + if (source_width > kFilterBufferSize || view_rotate) + filter = FILTER_NONE; + +- unsigned int y_shift = yuv_type; ++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0; + // Diagram showing origin and direction of source sampling. + // ->0 4<- + // 7 3 + // + // 6 5 + // ->1 2<- + // Rotations that start at right side of image. + if ((view_rotate == ROTATE_180) || +@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf, + int source_uv_fraction = + ((source_y_subpixel >> y_shift) & kFractionMask) >> 8; + + const uint8* y_ptr = y0_ptr; + const uint8* u_ptr = u0_ptr; + const uint8* v_ptr = v0_ptr; + // Apply vertical filtering if necessary. + // TODO(fbarchard): Remove memcpy when not necessary. 
+- if (filter & media::FILTER_BILINEAR_V) { ++ if (filter & mozilla::gfx::FILTER_BILINEAR_V) { + if (yscale_fixed != kFractionMax && + source_y_fraction && ((source_y + 1) < source_height)) { + FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + } else { + memcpy(ybuf, y0_ptr, source_width); + } + y_ptr = ybuf; + ybuf[source_width] = ybuf[source_width-1]; +@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf, + u_ptr = ubuf; + v_ptr = vbuf; + ubuf[uv_source_width] = ubuf[uv_source_width - 1]; + vbuf[uv_source_width] = vbuf[uv_source_width - 1]; + } + if (source_dx == kFractionMax) { // Not scaled + FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width); +- } else { +- if (filter & FILTER_BILINEAR_H) { ++ } else if (filter & FILTER_BILINEAR_H) { + LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { + // Specialized scalers and rotation. +-#if USE_MMX && defined(_MSC_VER) ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) ++ if(mozilla::supports_sse()) { + if (width == (source_width * 2)) { +- DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width); ++ DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width); + } else if ((source_dx & kFractionMask) == 0) { + // Scaling by integer scale factor. ie half. +- ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width, +- source_dx >> kFractionBits); ++ ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, ++ source_dx >> kFractionBits); + } else if (source_dx_uv == source_dx) { // Not rotated. + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { +- RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width, +- source_dx >> kFractionBits, +- source_dx_uv >> kFractionBits); ++ RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, ++ source_dx >> kFractionBits, ++ source_dx_uv >> kFractionBits); + } ++ } ++ else { ++ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, source_dx); ++ } + #else +- ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, +- dest_pixel, width, source_dx); +-#endif +- } ++ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, width, source_dx); ++#endif + } + } + // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. +- EMMS(); +-} +- +-} // namespace media ++ if (has_mmx) ++ EMMS(); ++} ++ ++} // namespace gfx ++} // namespace mozilla +diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h +--- a/gfx/ycbcr/yuv_convert.h ++++ b/gfx/ycbcr/yuv_convert.h +@@ -1,72 +1,79 @@ + // Copyright (c) 2010 The Chromium Authors. All rights reserved. + // Use of this source code is governed by a BSD-style license that can be + // found in the LICENSE file. + + #ifndef MEDIA_BASE_YUV_CONVERT_H_ + #define MEDIA_BASE_YUV_CONVERT_H_ + +-#include "base/basictypes.h" +- +-namespace media { +- ++#include "chromium_types.h" ++#include "gfxCore.h" ++ ++namespace mozilla { ++ ++namespace gfx { ++ + // Type of YUV surface. + // The value of these enums matter as they are used to shift vertical indices. + enum YUVType { +- YV16 = 0, // YV16 is half width and full height chroma channels. +- YV12 = 1, // YV12 is half width and half height chroma channels. ++ YV12 = 0, // YV12 is half width and half height chroma channels. ++ YV16 = 1, // YV16 is half width and full height chroma channels. ++ YV24 = 2 // YV24 is full width and full height chroma channels. 
+ }; + + // Mirror means flip the image horizontally, as in looking in a mirror. + // Rotate happens after mirroring. + enum Rotate { + ROTATE_0, // Rotation off. + ROTATE_90, // Rotate clockwise. + ROTATE_180, // Rotate upside down. + ROTATE_270, // Rotate counter clockwise. + MIRROR_ROTATE_0, // Mirror horizontally. + MIRROR_ROTATE_90, // Mirror then Rotate clockwise. + MIRROR_ROTATE_180, // Mirror vertically. +- MIRROR_ROTATE_270, // Transpose. ++ MIRROR_ROTATE_270 // Transpose. + }; + + // Filter affects how scaling looks. + enum ScaleFilter { + FILTER_NONE = 0, // No filter (point sampled). + FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. + FILTER_BILINEAR_V = 2, // Bilinear vertical filter. +- FILTER_BILINEAR = 3, // Bilinear filter. ++ FILTER_BILINEAR = 3 // Bilinear filter. + }; + + // Convert a frame of YUV to 32 bit ARGB. + // Pass in YV16/YV12 depending on source format +-void ConvertYUVToRGB32(const uint8* yplane, +- const uint8* uplane, +- const uint8* vplane, +- uint8* rgbframe, +- int width, +- int height, +- int ystride, +- int uvstride, +- int rgbstride, +- YUVType yuv_type); ++void ConvertYCbCrToRGB32(const uint8* yplane, ++ const uint8* uplane, ++ const uint8* vplane, ++ uint8* rgbframe, ++ int pic_x, ++ int pic_y, ++ int pic_width, ++ int pic_height, ++ int ystride, ++ int uvstride, ++ int rgbstride, ++ YUVType yuv_type); + + // Scale a frame of YUV to 32 bit ARGB. + // Supports rotation and mirroring. +-void ScaleYUVToRGB32(const uint8* yplane, +- const uint8* uplane, +- const uint8* vplane, +- uint8* rgbframe, +- int source_width, +- int source_height, +- int width, +- int height, +- int ystride, +- int uvstride, +- int rgbstride, +- YUVType yuv_type, +- Rotate view_rotate, +- ScaleFilter filter); +- +-} // namespace media +- ++void ScaleYCbCrToRGB32(const uint8* yplane, ++ const uint8* uplane, ++ const uint8* vplane, ++ uint8* rgbframe, ++ int source_width, ++ int source_height, ++ int width, ++ int height, ++ int ystride, ++ int uvstride, ++ int rgbstride, ++ YUVType yuv_type, ++ Rotate view_rotate, ++ ScaleFilter filter); ++ ++} // namespace gfx ++} // namespace mozilla ++ + #endif // MEDIA_BASE_YUV_CONVERT_H_ +diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp +new file mode 100644 +--- /dev/null ++++ b/gfx/ycbcr/yuv_convert_mmx.cpp +@@ -0,0 +1,45 @@ ++// Copyright (c) 2010 The Chromium Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style license that can be ++// found in the LICENSE file. ++ ++#include <mmintrin.h> ++#include "yuv_row.h" ++ ++namespace mozilla { ++namespace gfx { ++ ++// FilterRows combines two rows of the image using linear interpolation. ++// MMX version does 8 pixels at a time. 
++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction) { ++ __m64 zero = _mm_setzero_si64(); ++ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); ++ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); ++ ++ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); ++ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); ++ __m64* dest64 = reinterpret_cast<__m64*>(ybuf); ++ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); ++ ++ do { ++ __m64 y0 = *y0_ptr64++; ++ __m64 y1 = *y1_ptr64++; ++ __m64 y2 = _mm_unpackhi_pi8(y0, zero); ++ __m64 y3 = _mm_unpackhi_pi8(y1, zero); ++ y0 = _mm_unpacklo_pi8(y0, zero); ++ y1 = _mm_unpacklo_pi8(y1, zero); ++ y0 = _mm_mullo_pi16(y0, y0_fraction); ++ y1 = _mm_mullo_pi16(y1, y1_fraction); ++ y2 = _mm_mullo_pi16(y2, y0_fraction); ++ y3 = _mm_mullo_pi16(y3, y1_fraction); ++ y0 = _mm_add_pi16(y0, y1); ++ y2 = _mm_add_pi16(y2, y3); ++ y0 = _mm_srli_pi16(y0, 8); ++ y2 = _mm_srli_pi16(y2, 8); ++ y0 = _mm_packs_pu16(y0, y2); ++ *dest64++ = y0; ++ } while (dest64 < end64); ++} ++ ++} ++} +diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp +new file mode 100644 +--- /dev/null ++++ b/gfx/ycbcr/yuv_convert_sse2.cpp +@@ -0,0 +1,47 @@ ++// Copyright (c) 2010 The Chromium Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style license that can be ++// found in the LICENSE file. ++ ++#include <emmintrin.h> ++#include "yuv_row.h" ++ ++namespace mozilla { ++namespace gfx { ++ ++// FilterRows combines two rows of the image using linear interpolation. ++// SSE2 version does 16 pixels at a time. ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, ++ int source_width, int source_y_fraction) { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); ++ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); ++ ++ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); ++ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); ++ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); ++ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); ++ ++ do { ++ __m128i y0 = _mm_loadu_si128(y0_ptr128); ++ __m128i y1 = _mm_loadu_si128(y1_ptr128); ++ __m128i y2 = _mm_unpackhi_epi8(y0, zero); ++ __m128i y3 = _mm_unpackhi_epi8(y1, zero); ++ y0 = _mm_unpacklo_epi8(y0, zero); ++ y1 = _mm_unpacklo_epi8(y1, zero); ++ y0 = _mm_mullo_epi16(y0, y0_fraction); ++ y1 = _mm_mullo_epi16(y1, y1_fraction); ++ y2 = _mm_mullo_epi16(y2, y0_fraction); ++ y3 = _mm_mullo_epi16(y3, y1_fraction); ++ y0 = _mm_add_epi16(y0, y1); ++ y2 = _mm_add_epi16(y2, y3); ++ y0 = _mm_srli_epi16(y0, 8); ++ y2 = _mm_srli_epi16(y2, 8); ++ y0 = _mm_packus_epi16(y0, y2); ++ *dest128++ = y0; ++ ++y0_ptr128; ++ ++y1_ptr128; ++ } while (dest128 < end128); ++} ++ ++} ++} +diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h +--- a/gfx/ycbcr/yuv_row.h ++++ b/gfx/ycbcr/yuv_row.h +@@ -5,109 +5,133 @@ + // yuv_row internal functions to handle YUV conversion and scaling to RGB. + // These functions are used from both yuv_convert.cc and yuv_scale.cc. + + // TODO(fbarchard): Write function that can handle rotation and scaling. + + #ifndef MEDIA_BASE_YUV_ROW_H_ + #define MEDIA_BASE_YUV_ROW_H_ + +-#include "base/basictypes.h" ++#include "chromium_types.h" + + extern "C" { + // Can only do 1x. + // This is the second fastest of the scalers. 
+ void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +-// Can do 1x, half size or any scale down by an integer amount. +-// Step can be negative (mirroring, rotate 180). +-// This is the third fastest of the scalers. +-void ConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int step); +- +-// Rotate is like Convert, but applies different step to Y versus U and V. +-// This allows rotation by 90 or 270, by stepping by stride. +-// This is the forth fastest of the scalers. +-void RotateConvertYUVToRGB32Row(const uint8* y_buf, ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, +- int ystep, +- int uvstep); ++ unsigned int x_shift); ++ ++void FastConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width); ++ ++// Can do 1x, half size or any scale down by an integer amount. ++// Step can be negative (mirroring, rotate 180). ++// This is the third fastest of the scalers. ++// Only defined on Windows x86-32. ++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int step); ++ ++// Rotate is like Convert, but applies different step to Y versus U and V. ++// This allows rotation by 90 or 270, by stepping by stride. ++// This is the forth fastest of the scalers. ++// Only defined on Windows x86-32. ++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int ystep, ++ int uvstep); + + // Doubler does 4 pixels at a time. Each pixel is replicated. + // This is the fastest of the scalers. +-void DoubleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width); ++// Only defined on Windows x86-32. ++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width); + + // Handles arbitrary scaling up or down. + // Mirroring is supported, but not 90 or 270 degree rotation. + // Chroma is under sampled every 2 pixels for performance. + void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ ++void ScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ + // Handles arbitrary scaling up or down with bilinear filtering. + // Mirroring is supported, but not 90 or 270 degree rotation. + // Chroma is under sampled every 2 pixels for performance. + // This is the slowest of the scalers. 
+ void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + ++void LinearScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ ++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx); ++ ++ + #if defined(_MSC_VER) + #define SIMD_ALIGNED(var) __declspec(align(16)) var + #else + #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) + #endif + extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]); + +-// Method to force C version. +-//#define USE_MMX 0 +-//#define USE_SSE2 0 +- +-#if !defined(USE_MMX) +-// Windows, Mac and Linux/BSD use MMX +-#if defined(__MMX__) || defined(_MSC_VER) +-#define USE_MMX 1 +-#else +-#define USE_MMX 0 +-#endif +-#endif +- +-#if !defined(USE_SSE2) +-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 +-#define USE_SSE2 1 +-#else +-#define USE_SSE2 0 +-#endif +-#endif +- + // x64 uses MMX2 (SSE) so emms is not required. + // Warning C4799: function has no EMMS instruction. + // EMMS() is slow and should be called by the calling function once per image. +-#if USE_MMX && !defined(ARCH_CPU_X86_64) ++#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64) + #if defined(_MSC_VER) + #define EMMS() __asm emms + #pragma warning(disable: 4799) + #else + #define EMMS() asm("emms") + #endif + #else + #define EMMS() +diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp +--- a/gfx/ycbcr/yuv_row_c.cpp ++++ b/gfx/ycbcr/yuv_row_c.cpp +@@ -1,812 +1,18 @@ + // Copyright (c) 2010 The Chromium Authors. All rights reserved. + // Use of this source code is governed by a BSD-style license that can be + // found in the LICENSE file. + +-#include "media/base/yuv_row.h" +- +-#ifdef _DEBUG +-#include "base/logging.h" +-#else ++#include "yuv_row.h" ++ + #define DCHECK(a) +-#endif + + extern "C" { + +-#if USE_SSE2 && defined(ARCH_CPU_X86_64) +- +-// AMD64 ABI uses register paremters. 
+-void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi +- const uint8* u_buf, // rsi +- const uint8* v_buf, // rdx +- uint8* rgb_buf, // rcx +- int width) { // r8 +- asm( +- "jmp convertend\n" +-"convertloop:" +- "movzb (%1),%%r10\n" +- "add $0x1,%1\n" +- "movzb (%2),%%r11\n" +- "add $0x1,%2\n" +- "movq 2048(%5,%%r10,8),%%xmm0\n" +- "movzb (%0),%%r10\n" +- "movq 4096(%5,%%r11,8),%%xmm1\n" +- "movzb 0x1(%0),%%r11\n" +- "paddsw %%xmm1,%%xmm0\n" +- "movq (%5,%%r10,8),%%xmm2\n" +- "add $0x2,%0\n" +- "movq (%5,%%r11,8),%%xmm3\n" +- "paddsw %%xmm0,%%xmm2\n" +- "paddsw %%xmm0,%%xmm3\n" +- "shufps $0x44,%%xmm3,%%xmm2\n" +- "psraw $0x6,%%xmm2\n" +- "packuswb %%xmm2,%%xmm2\n" +- "movq %%xmm2,0x0(%3)\n" +- "add $0x8,%3\n" +-"convertend:" +- "sub $0x2,%4\n" +- "jns convertloop\n" +- +-"convertnext:" +- "add $0x1,%4\n" +- "js convertdone\n" +- +- "movzb (%1),%%r10\n" +- "movq 2048(%5,%%r10,8),%%xmm0\n" +- "movzb (%2),%%r10\n" +- "movq 4096(%5,%%r10,8),%%xmm1\n" +- "paddsw %%xmm1,%%xmm0\n" +- "movzb (%0),%%r10\n" +- "movq (%5,%%r10,8),%%xmm1\n" +- "paddsw %%xmm0,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movd %%xmm1,0x0(%3)\n" +-"convertdone:" +- : +- : "r"(y_buf), // %0 +- "r"(u_buf), // %1 +- "r"(v_buf), // %2 +- "r"(rgb_buf), // %3 +- "r"(width), // %4 +- "r" (kCoefficientsRgbY) // %5 +- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +-); +-} +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi +- const uint8* u_buf, // rsi +- const uint8* v_buf, // rdx +- uint8* rgb_buf, // rcx +- int width, // r8 +- int source_dx) { // r9 +- asm( +- "xor %%r11,%%r11\n" +- "sub $0x2,%4\n" +- "js scalenext\n" +- +-"scaleloop:" +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- "movzb (%1,%%r10,1),%%rax\n" +- "movq 2048(%5,%%rax,8),%%xmm0\n" +- "movzb (%2,%%r10,1),%%rax\n" +- "movq 4096(%5,%%rax,8),%%xmm1\n" +- "lea (%%r11,%6),%%r10\n" +- "sar $0x10,%%r11\n" +- "movzb (%0,%%r11,1),%%rax\n" +- "paddsw %%xmm1,%%xmm0\n" +- "movq (%5,%%rax,8),%%xmm1\n" +- "lea (%%r10,%6),%%r11\n" +- "sar $0x10,%%r10\n" +- "movzb (%0,%%r10,1),%%rax\n" +- "movq (%5,%%rax,8),%%xmm2\n" +- "paddsw %%xmm0,%%xmm1\n" +- "paddsw %%xmm0,%%xmm2\n" +- "shufps $0x44,%%xmm2,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movq %%xmm1,0x0(%3)\n" +- "add $0x8,%3\n" +- "sub $0x2,%4\n" +- "jns scaleloop\n" +- +-"scalenext:" +- "add $0x1,%4\n" +- "js scaledone\n" +- +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- "movzb (%1,%%r10,1),%%rax\n" +- "movq 2048(%5,%%rax,8),%%xmm0\n" +- "movzb (%2,%%r10,1),%%rax\n" +- "movq 4096(%5,%%rax,8),%%xmm1\n" +- "paddsw %%xmm1,%%xmm0\n" +- "sar $0x10,%%r11\n" +- "movzb (%0,%%r11,1),%%rax\n" +- "movq (%5,%%rax,8),%%xmm1\n" +- "paddsw %%xmm0,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movd %%xmm1,0x0(%3)\n" +- +-"scaledone:" +- : +- : "r"(y_buf), // %0 +- "r"(u_buf), // %1 +- "r"(v_buf), // %2 +- "r"(rgb_buf), // %3 +- "r"(width), // %4 +- "r" (kCoefficientsRgbY), // %5 +- "r"(static_cast<long>(source_dx)) // %6 +- : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" +-); +-} +- +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { +- asm( +- "xor %%r11,%%r11\n" // x = 0 +- "sub $0x2,%4\n" +- "js .lscalenext\n" +- "cmp $0x20000,%6\n" // if source_dx >= 2.0 +- "jl .lscalehalf\n" +- "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less +-".lscalehalf:" +- +-".lscaleloop:" +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- +- "movzb (%1, %%r10, 1), 
%%r13 \n" +- "movzb 1(%1, %%r10, 1), %%r14 \n" +- "mov %%r11, %%rax \n" +- "and $0x1fffe, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0x1fffe, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $17, %%r13 \n" +- "movq 2048(%5,%%r13,8), %%xmm0\n" +- +- "movzb (%2, %%r10, 1), %%r13 \n" +- "movzb 1(%2, %%r10, 1), %%r14 \n" +- "mov %%r11, %%rax \n" +- "and $0x1fffe, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0x1fffe, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $17, %%r13 \n" +- "movq 4096(%5,%%r13,8), %%xmm1\n" +- +- "mov %%r11, %%rax \n" +- "lea (%%r11,%6),%%r10\n" +- "sar $0x10,%%r11\n" +- "paddsw %%xmm1,%%xmm0\n" +- +- "movzb (%0, %%r11, 1), %%r13 \n" +- "movzb 1(%0, %%r11, 1), %%r14 \n" +- "and $0xffff, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0xffff, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $16, %%r13 \n" +- "movq (%5,%%r13,8),%%xmm1\n" +- +- "mov %%r10, %%rax \n" +- "lea (%%r10,%6),%%r11\n" +- "sar $0x10,%%r10\n" +- +- "movzb (%0,%%r10,1), %%r13 \n" +- "movzb 1(%0,%%r10,1), %%r14 \n" +- "and $0xffff, %%rax \n" +- "imul %%rax, %%r14 \n" +- "xor $0xffff, %%rax \n" +- "imul %%rax, %%r13 \n" +- "add %%r14, %%r13 \n" +- "shr $16, %%r13 \n" +- "movq (%5,%%r13,8),%%xmm2\n" +- +- "paddsw %%xmm0,%%xmm1\n" +- "paddsw %%xmm0,%%xmm2\n" +- "shufps $0x44,%%xmm2,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movq %%xmm1,0x0(%3)\n" +- "add $0x8,%3\n" +- "sub $0x2,%4\n" +- "jns .lscaleloop\n" +- +-".lscalenext:" +- "add $0x1,%4\n" +- "js .lscaledone\n" +- +- "mov %%r11,%%r10\n" +- "sar $0x11,%%r10\n" +- +- "movzb (%1,%%r10,1), %%r13 \n" +- "movq 2048(%5,%%r13,8),%%xmm0\n" +- +- "movzb (%2,%%r10,1), %%r13 \n" +- "movq 4096(%5,%%r13,8),%%xmm1\n" +- +- "paddsw %%xmm1,%%xmm0\n" +- "sar $0x10,%%r11\n" +- +- "movzb (%0,%%r11,1), %%r13 \n" +- "movq (%5,%%r13,8),%%xmm1\n" +- +- "paddsw %%xmm0,%%xmm1\n" +- "psraw $0x6,%%xmm1\n" +- "packuswb %%xmm1,%%xmm1\n" +- "movd %%xmm1,0x0(%3)\n" +- +-".lscaledone:" +- : +- : "r"(y_buf), // %0 +- "r"(u_buf), // %1 +- "r"(v_buf), // %2 +- "r"(rgb_buf), // %3 +- "r"(width), // %4 +- "r" (kCoefficientsRgbY), // %5 +- "r"(static_cast<long>(source_dx)) // %6 +- : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" +-); +-} +- +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) +- +-// PIC version is slower because less registers are available, so +-// non-PIC is used on platforms where it is possible. 
+- +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width); +- asm( +- ".text\n" +- ".global FastConvertYUVToRGB32Row\n" +-"FastConvertYUVToRGB32Row:\n" +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x34(%esp),%ecx\n" +- "jmp convertend\n" +- +-"convertloop:" +- "movzbl (%edi),%eax\n" +- "add $0x1,%edi\n" +- "movzbl (%esi),%ebx\n" +- "add $0x1,%esi\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" +- "movzbl 0x1(%edx),%ebx\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "add $0x2,%edx\n" +- "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-"convertend:" +- "sub $0x2,%ecx\n" +- "jns convertloop\n" +- +- "and $0x1,%ecx\n" +- "je convertdone\n" +- +- "movzbl (%edi),%eax\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "movzbl (%esi),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +-"convertdone:" +- "popa\n" +- "ret\n" +-); +- +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx); +- asm( +- ".text\n" +- ".global ScaleYUVToRGB32Row\n" +-"ScaleYUVToRGB32Row:\n" +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x34(%esp),%ecx\n" +- "xor %ebx,%ebx\n" +- "jmp scaleend\n" +- +-"scaleloop:" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-"scaleend:" +- "sub $0x2,%ecx\n" +- "jns scaleloop\n" +- +- "and $0x1,%ecx\n" +- "je scaledone\n" +- +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +- +-"scaledone:" +- "popa\n" +- "ret\n" +-); +- +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx); +- asm( +- ".text\n" +- ".global LinearScaleYUVToRGB32Row\n" +-"LinearScaleYUVToRGB32Row:\n" +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x30(%esp),%ebp\n" +- +- // 
source_width = width * source_dx + ebx +- "mov 0x34(%esp), %ecx\n" +- "imull 0x38(%esp), %ecx\n" +- "mov %ecx, 0x34(%esp)\n" +- +- "mov 0x38(%esp), %ecx\n" +- "xor %ebx,%ebx\n" // x = 0 +- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 +- "jl .lscaleend\n" +- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less +- "jmp .lscaleend\n" +- +-".lscaleloop:" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%edi,%eax,1),%ecx\n" +- "movzbl 1(%edi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $17, %ecx \n" +- "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" +- +- "mov 0x2c(%esp),%esi\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%esi,%eax,1),%ecx\n" +- "movzbl 1(%esi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $17, %ecx \n" +- "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" +- +- "cmp 0x34(%esp), %ebx\n" +- "jge .lscalelastpixel\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" +- +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +- +-".lscaleend:" +- "cmp 0x34(%esp), %ebx\n" +- "jl .lscaleloop\n" +- "popa\n" +- "ret\n" +- +-".lscalelastpixel:" +- "paddsw %mm0, %mm1\n" +- "psraw $6, %mm1\n" +- "packuswb %mm1, %mm1\n" +- "movd %mm1, (%ebp)\n" +- "popa\n" +- "ret\n" +-); +- +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) +- +-extern void PICConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int16 *kCoefficientsRgbY); +- asm( +- ".text\n" +-#if defined(OS_MACOSX) +-"_PICConvertYUVToRGB32Row:\n" +-#else +-"PICConvertYUVToRGB32Row:\n" +-#endif +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x38(%esp),%ecx\n" +- +- "jmp .Lconvertend\n" +- +-".Lconvertloop:" +- "movzbl (%edi),%eax\n" +- "add $0x1,%edi\n" +- "movzbl (%esi),%ebx\n" +- "add $0x1,%esi\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- "paddsw 4096(%ecx,%ebx,8),%mm0\n" +- "movzbl 0x1(%edx),%ebx\n" +- "movq 0(%ecx,%eax,8),%mm1\n" +- "add $0x2,%edx\n" +- "movq 0(%ecx,%ebx,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-".Lconvertend:" +- "subl $0x2,0x34(%esp)\n" +- "jns .Lconvertloop\n" +- +- "andl $0x1,0x34(%esp)\n" +- "je .Lconvertdone\n" +- +- "movzbl (%edi),%eax\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "movzbl (%esi),%eax\n" +- "paddsw 4096(%ecx,%eax,8),%mm0\n" +- "movzbl (%edx),%eax\n" +- 
"movq 0(%ecx,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +-".Lconvertdone:\n" +- "popa\n" +- "ret\n" +-); +- +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width) { +- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, +- &kCoefficientsRgbY[0][0]); +-} +- +-extern void PICScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx, +- int16 *kCoefficientsRgbY); +- +- asm( +- ".text\n" +-#if defined(OS_MACOSX) +-"_PICScaleYUVToRGB32Row:\n" +-#else +-"PICScaleYUVToRGB32Row:\n" +-#endif +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x28(%esp),%edi\n" +- "mov 0x2c(%esp),%esi\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x3c(%esp),%ecx\n" +- "xor %ebx,%ebx\n" +- "jmp Lscaleend\n" +- +-"Lscaleloop:" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw 4096(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq 0(%ecx,%eax,8),%mm1\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq 0(%ecx,%eax,8),%mm2\n" +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +-"Lscaleend:" +- "subl $0x2,0x34(%esp)\n" +- "jns Lscaleloop\n" +- +- "andl $0x1,0x34(%esp)\n" +- "je Lscaledone\n" +- +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%edi,%eax,1),%eax\n" +- "movq 2048(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- "movzbl (%esi,%eax,1),%eax\n" +- "paddsw 4096(%ecx,%eax,8),%mm0\n" +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%eax\n" +- "movq 0(%ecx,%eax,8),%mm1\n" +- "paddsw %mm0,%mm1\n" +- "psraw $0x6,%mm1\n" +- "packuswb %mm1,%mm1\n" +- "movd %mm1,0x0(%ebp)\n" +- +-"Lscaledone:" +- "popa\n" +- "ret\n" +-); +- +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { +- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, +- &kCoefficientsRgbY[0][0]); +-} +- +-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx, +- int16 *kCoefficientsRgbY); +- asm( +- ".text\n" +-#if defined(OS_MACOSX) +-"_PICLinearScaleYUVToRGB32Row:\n" +-#else +-"PICLinearScaleYUVToRGB32Row:\n" +-#endif +- "pusha\n" +- "mov 0x24(%esp),%edx\n" +- "mov 0x30(%esp),%ebp\n" +- "mov 0x34(%esp),%ecx\n" +- "mov 0x3c(%esp),%edi\n" +- "xor %ebx,%ebx\n" +- +- // source_width = width * source_dx + ebx +- "mov 0x34(%esp), %ecx\n" +- "imull 0x38(%esp), %ecx\n" +- "mov %ecx, 0x34(%esp)\n" +- +- "mov 0x38(%esp), %ecx\n" +- "xor %ebx,%ebx\n" // x = 0 +- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 +- "jl .lscaleend\n" +- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less +- "jmp .lscaleend\n" +- +-".lscaleloop:" +- "mov 0x28(%esp),%esi\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%esi,%eax,1),%ecx\n" +- "movzbl 1(%esi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, 
%ecx \n" +- "shrl $17, %ecx \n" +- "movq 2048(%edi,%ecx,8),%mm0\n" +- +- "mov 0x2c(%esp),%esi\n" +- "mov %ebx,%eax\n" +- "sar $0x11,%eax\n" +- +- "movzbl (%esi,%eax,1),%ecx\n" +- "movzbl 1(%esi,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "andl $0x1fffe, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0x1fffe, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $17, %ecx \n" +- "paddsw 4096(%edi,%ecx,8),%mm0\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq (%edi,%ecx,8),%mm1\n" +- +- "cmp 0x34(%esp), %ebx\n" +- "jge .lscalelastpixel\n" +- +- "mov %ebx,%eax\n" +- "sar $0x10,%eax\n" +- "movzbl (%edx,%eax,1),%ecx\n" +- "movzbl 1(%edx,%eax,1),%esi\n" +- "mov %ebx,%eax\n" +- "add 0x38(%esp),%ebx\n" +- "andl $0xffff, %eax \n" +- "imul %eax, %esi \n" +- "xorl $0xffff, %eax \n" +- "imul %eax, %ecx \n" +- "addl %esi, %ecx \n" +- "shrl $16, %ecx \n" +- "movq (%edi,%ecx,8),%mm2\n" +- +- "paddsw %mm0,%mm1\n" +- "paddsw %mm0,%mm2\n" +- "psraw $0x6,%mm1\n" +- "psraw $0x6,%mm2\n" +- "packuswb %mm2,%mm1\n" +- "movntq %mm1,0x0(%ebp)\n" +- "add $0x8,%ebp\n" +- +-".lscaleend:" +- "cmp %ebx, 0x34(%esp)\n" +- "jg .lscaleloop\n" +- "popa\n" +- "ret\n" +- +-".lscalelastpixel:" +- "paddsw %mm0, %mm1\n" +- "psraw $6, %mm1\n" +- "packuswb %mm1, %mm1\n" +- "movd %mm1, (%ebp)\n" +- "popa\n" +- "ret\n" +-); +- +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { +- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, +- &kCoefficientsRgbY[0][0]); +-} +- +-#else // USE_MMX +- + // C reference code that mimic the YUV assembly. + #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) + #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + + static inline void YuvPixel(uint8 y, + uint8 u, + uint8 v, +@@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y, + a >>= 6; + + *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | + (packuswb(g) << 8) | + (packuswb(r) << 16) | + (packuswb(a) << 24); + } + +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width) { ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ unsigned int x_shift) { + for (int x = 0; x < width; x += 2) { +- uint8 u = u_buf[x >> 1]; +- uint8 v = v_buf[x >> 1]; ++ uint8 u = u_buf[x >> x_shift]; ++ uint8 v = v_buf[x >> x_shift]; + uint8 y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; ++ if (x_shift == 0) { ++ u = u_buf[x + 1]; ++ v = v_buf[x + 1]; ++ } + YuvPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } + } + + // 16.16 fixed point is used. A shift by 16 isolates the integer. + // A shift by 17 is used to further subsample the chrominence channels. + // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, + // for 1/65536 pixel accurate interpolation. 
+-void ScaleYUVToRGB32Row(const uint8* y_buf,
+-                        const uint8* u_buf,
+-                        const uint8* v_buf,
+-                        uint8* rgb_buf,
+-                        int width,
+-                        int source_dx) {
++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
++                          const uint8* u_buf,
++                          const uint8* v_buf,
++                          uint8* rgb_buf,
++                          int width,
++                          int source_dx) {
+   int x = 0;
+   for (int i = 0; i < width; i += 2) {
+     int y = y_buf[x >> 16];
+     int u = u_buf[(x >> 17)];
+     int v = v_buf[(x >> 17)];
+     YuvPixel(y, u, v, rgb_buf);
+     x += source_dx;
+     if ((i + 1) < width) {
+       y = y_buf[x >> 16];
+       YuvPixel(y, u, v, rgb_buf+4);
+       x += source_dx;
+     }
+     rgb_buf += 8;
+   }
+ }
+
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+-                              const uint8* u_buf,
+-                              const uint8* v_buf,
+-                              uint8* rgb_buf,
+-                              int width,
+-                              int source_dx) {
++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
++                                const uint8* u_buf,
++                                const uint8* v_buf,
++                                uint8* rgb_buf,
++                                int width,
++                                int source_dx) {
+   int x = 0;
+   if (source_dx >= 0x20000) {
+     x = 32768;
+   }
+   for (int i = 0; i < width; i += 2) {
+     int y0 = y_buf[x >> 16];
+     int y1 = y_buf[(x >> 16) + 1];
+     int u0 = u_buf[(x >> 17)];
+@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
+     y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+     YuvPixel(y, u, v, rgb_buf+4);
+     x += source_dx;
+     }
+     rgb_buf += 8;
+   }
+ }
+
+-#endif // USE_MMX
+ } // extern "C"
+
+diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
+--- a/gfx/ycbcr/yuv_row_posix.cpp
++++ b/gfx/ycbcr/yuv_row_posix.cpp
+@@ -1,33 +1,32 @@
+ // Copyright (c) 2010 The Chromium Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style license that can be
+ // found in the LICENSE file.
+
+-#include "media/base/yuv_row.h"
+-
+-#ifdef _DEBUG
+-#include "base/logging.h"
+-#else
++#include "yuv_row.h"
++#include "mozilla/SSE.h"
++
+ #define DCHECK(a)
+-#endif
+
+ extern "C" {
+
+-#if USE_SSE2 && defined(ARCH_CPU_X86_64)
++#if defined(ARCH_CPU_X86_64)
++
++// We don't need CPUID guards here, since x86-64 implies SSE2.
+
+ // AMD64 ABI uses register parameters.
+ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm( +- "jmp convertend\n" +-"convertloop:" ++ "jmp 1f\n" ++"0:" + "movzb (%1),%%r10\n" + "add $0x1,%1\n" + "movzb (%2),%%r11\n" + "add $0x1,%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "movzb 0x1(%0),%%r11\n" +@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint + "movq (%5,%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%3)\n" + "add $0x8,%3\n" +-"convertend:" ++"1:" + "sub $0x2,%4\n" +- "jns convertloop\n" +- +-"convertnext:" ++ "jns 0b\n" ++ ++"2:" + "add $0x1,%4\n" +- "js convertdone\n" ++ "js 3f\n" + + "movzb (%1),%%r10\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%2),%%r10\n" + "movq 4096(%5,%%r10,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "movzb (%0),%%r10\n" + "movq (%5,%%r10,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" +-"convertdone:" ++"3:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width, // r8 + int source_dx) { // r9 + asm( + "xor %%r11,%%r11\n" + "sub $0x2,%4\n" +- "js scalenext\n" +- +-"scaleloop:" ++ "js 1f\n" ++ ++"0:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + "movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "lea (%%r11,%6),%%r10\n" + "sar $0x10,%%r11\n" +@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%3)\n" + "add $0x8,%3\n" + "sub $0x2,%4\n" +- "jns scaleloop\n" +- +-"scalenext:" ++ "jns 0b\n" ++ ++"1:" + "add $0x1,%4\n" +- "js scaledone\n" ++ "js 2f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + "movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "sar $0x10,%%r11\n" + "movzb (%0,%%r11,1),%%rax\n" + "movq (%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" + +-"scaledone:" ++"2:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY), // %5 + "r"(static_cast<long>(source_dx)) // %6 +@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + asm( + "xor %%r11,%%r11\n" // x = 0 + "sub $0x2,%4\n" +- "js .lscalenext\n" ++ "js 2f\n" + "cmp $0x20000,%6\n" // if source_dx >= 2.0 +- "jl .lscalehalf\n" ++ "jl 0f\n" + "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less +-".lscalehalf:" +- +-".lscaleloop:" ++"0:" ++ ++"1:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%1, %%r10, 1), %%r13 \n" + "movzb 1(%1, %%r10, 1), %%r14 \n" + "mov %%r11, %%rax \n" + "and $0x1fffe, %%rax \n" + "imul %%rax, %%r14 \n" +@@ -215,21 +214,21 @@ 
void LinearScaleYUVToRGB32Row(const uint + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%3)\n" + "add $0x8,%3\n" + "sub $0x2,%4\n" +- "jns .lscaleloop\n" +- +-".lscalenext:" ++ "jns 1b\n" ++ ++"2:" + "add $0x1,%4\n" +- "js .lscaledone\n" ++ "js 3f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%1,%%r10,1), %%r13 \n" + "movq 2048(%5,%%r13,8),%%xmm0\n" + + "movzb (%2,%%r10,1), %%r13 \n" +@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint + "movzb (%0,%%r11,1), %%r13 \n" + "movq (%5,%%r13,8),%%xmm1\n" + + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" + +-".lscaledone:" ++"3:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY), // %5 + "r"(static_cast<long>(source_dx)) // %6 + : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" + ); + } + +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) ++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) + + // PIC version is slower because less registers are available, so + // non-PIC is used on platforms where it is possible. +- +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width); ++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width); + asm( + ".text\n" +- ".global FastConvertYUVToRGB32Row\n" +-"FastConvertYUVToRGB32Row:\n" ++ ".global FastConvertYUVToRGB32Row_SSE\n" ++ ".type FastConvertYUVToRGB32Row_SSE, @function\n" ++"FastConvertYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" +- "jmp convertend\n" +- +-"convertloop:" ++ "jmp 1f\n" ++ ++"0:" + "movzbl (%edi),%eax\n" + "add $0x1,%edi\n" + "movzbl (%esi),%ebx\n" + "add $0x1,%esi\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" +@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint + "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +-"convertend:" ++"1:" + "sub $0x2,%ecx\n" +- "jns convertloop\n" ++ "jns 0b\n" + + "and $0x1,%ecx\n" +- "je convertdone\n" ++ "je 2f\n" + + "movzbl (%edi),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%esi),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" +-"convertdone:" ++"2:" + "popa\n" + "ret\n" ++#if !defined(XP_MACOSX) ++ ".previous\n" ++#endif + ); + +- +-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx); ++void FastConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width) ++{ ++ if (mozilla::supports_sse()) { ++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); ++ return; ++ } ++ ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, 
rgb_buf, width, 1);
++}
++
++
++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++                            const uint8* u_buf,
++                            const uint8* v_buf,
++                            uint8* rgb_buf,
++                            int width,
++                            int source_dx);
+ asm(
+   ".text\n"
+-  ".global ScaleYUVToRGB32Row\n"
+-"ScaleYUVToRGB32Row:\n"
++  ".global ScaleYUVToRGB32Row_SSE\n"
++  ".type ScaleYUVToRGB32Row_SSE, @function\n"
++"ScaleYUVToRGB32Row_SSE:\n"
+   "pusha\n"
+   "mov 0x24(%esp),%edx\n"
+   "mov 0x28(%esp),%edi\n"
+   "mov 0x2c(%esp),%esi\n"
+   "mov 0x30(%esp),%ebp\n"
+   "mov 0x34(%esp),%ecx\n"
+   "xor %ebx,%ebx\n"
+-  "jmp scaleend\n"
+-
+-"scaleloop:"
++  "jmp 1f\n"
++
++"0:"
+   "mov %ebx,%eax\n"
+   "sar $0x11,%eax\n"
+   "movzbl (%edi,%eax,1),%eax\n"
+   "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+   "mov %ebx,%eax\n"
+   "sar $0x11,%eax\n"
+   "movzbl (%esi,%eax,1),%eax\n"
+   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+   "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
+   "paddsw %mm0,%mm1\n"
+   "paddsw %mm0,%mm2\n"
+   "psraw $0x6,%mm1\n"
+   "psraw $0x6,%mm2\n"
+   "packuswb %mm2,%mm1\n"
+   "movntq %mm1,0x0(%ebp)\n"
+   "add $0x8,%ebp\n"
+-"scaleend:"
++"1:"
+   "sub $0x2,%ecx\n"
+-  "jns scaleloop\n"
++  "jns 0b\n"
+
+   "and $0x1,%ecx\n"
+-  "je scaledone\n"
++  "je 2f\n"
+
+   "mov %ebx,%eax\n"
+   "sar $0x11,%eax\n"
+   "movzbl (%edi,%eax,1),%eax\n"
+   "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+   "mov %ebx,%eax\n"
+   "sar $0x11,%eax\n"
+   "movzbl (%esi,%eax,1),%eax\n"
+@@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
+   "sar $0x10,%eax\n"
+   "movzbl (%edx,%eax,1),%eax\n"
+   "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
+   "paddsw %mm0,%mm1\n"
+   "psraw $0x6,%mm1\n"
+   "packuswb %mm1,%mm1\n"
+   "movd %mm1,0x0(%ebp)\n"
+
+-"scaledone:"
++"2:"
+   "popa\n"
+   "ret\n"
++#if !defined(XP_MACOSX)
++  ".previous\n"
++#endif
+ );
+
+-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+-                              const uint8* u_buf,
+-                              const uint8* v_buf,
+-                              uint8* rgb_buf,
+-                              int width,
+-                              int source_dx);
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int source_dx)
++{
++  if (mozilla::supports_sse()) {
++    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
++                           width, source_dx);
++    return;
++  }
++
++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
++                       width, source_dx);
++}
++
++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
++                                  const uint8* u_buf,
++                                  const uint8* v_buf,
++                                  uint8* rgb_buf,
++                                  int width,
++                                  int source_dx);
+ asm(
+   ".text\n"
+-  ".global LinearScaleYUVToRGB32Row\n"
+-"LinearScaleYUVToRGB32Row:\n"
++  ".global LinearScaleYUVToRGB32Row_SSE\n"
++  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
++"LinearScaleYUVToRGB32Row_SSE:\n"
+   "pusha\n"
+   "mov 0x24(%esp),%edx\n"
+   "mov 0x28(%esp),%edi\n"
+   "mov 0x30(%esp),%ebp\n"
+
+   // source_width = width * source_dx + ebx
+   "mov 0x34(%esp), %ecx\n"
+   "imull 0x38(%esp), %ecx\n"
+   "mov %ecx, 0x34(%esp)\n"
+
+   "mov 0x38(%esp), %ecx\n"
+   "xor %ebx,%ebx\n"       // x = 0
+   "cmp $0x20000,%ecx\n"   // if source_dx >= 2.0
+-  "jl .lscaleend\n"
++  "jl 1f\n"
+   "mov $0x8000,%ebx\n"    // x = 0.5 for 1/2 or less
+-  "jmp .lscaleend\n"
+-
+-".lscaleloop:"
+-  "mov %ebx,%eax\n"
+-  "sar $0x11,%eax\n"
++  "jmp 1f\n"
++
++"0:"
++  "mov %ebx,%eax\n"
++  "sar $0x11,%eax\n"
+
+   "movzbl (%edi,%eax,1),%ecx\n"
+   "movzbl 1(%edi,%eax,1),%esi\n"
+   "mov %ebx,%eax\n"
+   "andl $0x1fffe, %eax \n"
+   "imul %eax, %esi \n"
+   "xorl $0x1fffe, %eax \n"
+   "imul %eax, %ecx \n"
+@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
+   "imul %eax, %esi \n"
+   "xorl $0xffff, %eax \n"
+   "imul %eax, %ecx \n"
+   "addl %esi, %ecx \n"
+   "shrl $16, %ecx \n"
+   "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
+
+   "cmp 0x34(%esp), %ebx\n"
+-  "jge .lscalelastpixel\n"
++  "jge 2f\n"
+
+   "mov %ebx,%eax\n"
+   "sar $0x10,%eax\n"
+   "movzbl (%edx,%eax,1),%ecx\n"
+   "movzbl 1(%edx,%eax,1),%esi\n"
+   "mov %ebx,%eax\n"
+   "add 0x38(%esp),%ebx\n"
+   "andl $0xffff, %eax \n"
+@@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
+   "paddsw %mm0,%mm1\n"
+   "paddsw %mm0,%mm2\n"
+   "psraw $0x6,%mm1\n"
+   "psraw $0x6,%mm2\n"
+   "packuswb %mm2,%mm1\n"
+   "movntq %mm1,0x0(%ebp)\n"
+   "add $0x8,%ebp\n"
+
+-".lscaleend:"
++"1:"
+   "cmp 0x34(%esp), %ebx\n"
+-  "jl .lscaleloop\n"
++  "jl 0b\n"
+   "popa\n"
+   "ret\n"
+
+-".lscalelastpixel:"
++"2:"
+   "paddsw %mm0, %mm1\n"
+   "psraw $6, %mm1\n"
+   "packuswb %mm1, %mm1\n"
+   "movd %mm1, (%ebp)\n"
+   "popa\n"
+   "ret\n"
++#if !defined(XP_MACOSX)
++  ".previous\n"
++#endif
+ );
+
+-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
+-
+-extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
+-                                    const uint8* u_buf,
+-                                    const uint8* v_buf,
+-                                    uint8* rgb_buf,
+-                                    int width,
+-                                    int16 *kCoefficientsRgbY);
++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width,
++                              int source_dx)
++{
++  if (mozilla::supports_sse()) {
++    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
++                                 width, source_dx);
++    return;
++  }
++
++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
++                             width, source_dx);
++}
++
++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
++
++void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
++                                 const uint8* u_buf,
++                                 const uint8* v_buf,
++                                 uint8* rgb_buf,
++                                 int width,
++                                 int16 *kCoefficientsRgbY);
++
+ asm(
+   ".text\n"
+-#if defined(OS_MACOSX)
+-"_PICConvertYUVToRGB32Row:\n"
++#if defined(XP_MACOSX)
++"_PICConvertYUVToRGB32Row_SSE:\n"
+ #else
+-"PICConvertYUVToRGB32Row:\n"
++"PICConvertYUVToRGB32Row_SSE:\n"
+ #endif
+   "pusha\n"
+   "mov 0x24(%esp),%edx\n"
+   "mov 0x28(%esp),%edi\n"
+   "mov 0x2c(%esp),%esi\n"
+   "mov 0x30(%esp),%ebp\n"
+   "mov 0x38(%esp),%ecx\n"
+
+-  "jmp .Lconvertend\n"
+-
+-".Lconvertloop:"
++  "jmp 1f\n"
++
++"0:"
+   "movzbl (%edi),%eax\n"
+   "add $0x1,%edi\n"
+   "movzbl (%esi),%ebx\n"
+   "add $0x1,%esi\n"
+   "movq 2048(%ecx,%eax,8),%mm0\n"
+   "movzbl (%edx),%eax\n"
+   "paddsw 4096(%ecx,%ebx,8),%mm0\n"
+   "movzbl 0x1(%edx),%ebx\n"
+@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
+   "movq 0(%ecx,%ebx,8),%mm2\n"
+   "paddsw %mm0,%mm1\n"
+   "paddsw %mm0,%mm2\n"
+   "psraw $0x6,%mm1\n"
+   "psraw $0x6,%mm2\n"
+   "packuswb %mm2,%mm1\n"
+   "movntq %mm1,0x0(%ebp)\n"
+   "add $0x8,%ebp\n"
+-".Lconvertend:"
++"1:"
+   "subl $0x2,0x34(%esp)\n"
+-  "jns .Lconvertloop\n"
++  "jns 0b\n"
+
+   "andl $0x1,0x34(%esp)\n"
+-  "je .Lconvertdone\n"
++  "je 2f\n"
+
+   "movzbl (%edi),%eax\n"
+   "movq 2048(%ecx,%eax,8),%mm0\n"
+   "movzbl (%esi),%eax\n"
+   "paddsw 4096(%ecx,%eax,8),%mm0\n"
+   "movzbl (%edx),%eax\n"
+   "movq 0(%ecx,%eax,8),%mm1\n"
+   "paddsw %mm0,%mm1\n"
+   "psraw $0x6,%mm1\n"
+   "packuswb %mm1,%mm1\n"
+   "movd %mm1,0x0(%ebp)\n"
+-".Lconvertdone:\n"
++"2:"
+   "popa\n"
+   "ret\n"
++#if !defined(XP_MACOSX)
++  ".previous\n"
++#endif
+ );
+
+ void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+-                              int width) {
+-  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+-                          &kCoefficientsRgbY[0][0]);
+-}
+-
+-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
++                              int width)
++{
++  if (mozilla::supports_sse()) {
++    
PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, ++ &kCoefficientsRgbY[0][0]); ++ return; ++ } ++ ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); ++} ++ ++void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx, + int16 *kCoefficientsRgbY); + + asm( + ".text\n" +-#if defined(OS_MACOSX) +-"_PICScaleYUVToRGB32Row:\n" ++#if defined(XP_MACOSX) ++"_PICScaleYUVToRGB32Row_SSE:\n" + #else +-"PICScaleYUVToRGB32Row:\n" ++"PICScaleYUVToRGB32Row_SSE:\n" + #endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x3c(%esp),%ecx\n" + "xor %ebx,%ebx\n" +- "jmp Lscaleend\n" +- +-"Lscaleloop:" ++ "jmp 1f\n" ++ ++"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" +@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const + "movq 0(%ecx,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +-"Lscaleend:" ++"1:" + "subl $0x2,0x34(%esp)\n" +- "jns Lscaleloop\n" ++ "jns 0b\n" + + "andl $0x1,0x34(%esp)\n" +- "je Lscaledone\n" ++ "je 2f\n" + + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" +@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +-"Lscaledone:" ++"2:" + "popa\n" + "ret\n" ++#if !defined(XP_MACOSX) ++ ".previous\n" ++#endif + ); + +- + void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, +- int source_dx) { +- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, +- &kCoefficientsRgbY[0][0]); +-} +- +-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx, +- int16 *kCoefficientsRgbY); ++ int source_dx) ++{ ++ if (mozilla::supports_sse()) { ++ PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, ++ &kCoefficientsRgbY[0][0]); ++ return; ++ } ++ ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++} ++ ++void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx, ++ int16 *kCoefficientsRgbY); ++ + asm( + ".text\n" +-#if defined(OS_MACOSX) +-"_PICLinearScaleYUVToRGB32Row:\n" ++#if defined(XP_MACOSX) ++"_PICLinearScaleYUVToRGB32Row_SSE:\n" + #else +-"PICLinearScaleYUVToRGB32Row:\n" ++"PICLinearScaleYUVToRGB32Row_SSE:\n" + #endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "mov 0x3c(%esp),%edi\n" + "xor %ebx,%ebx\n" + + // source_width = width * source_dx + ebx + "mov 0x34(%esp), %ecx\n" + "imull 0x38(%esp), %ecx\n" + "mov %ecx, 0x34(%esp)\n" + + "mov 0x38(%esp), %ecx\n" + "xor %ebx,%ebx\n" // x = 0 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 +- "jl .lscaleend\n" ++ "jl 1f\n" + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less +- "jmp 
.lscaleend\n" +- +-".lscaleloop:" ++ "jmp 1f\n" ++ ++"0:" + "mov 0x28(%esp),%esi\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%esi,%eax,1),%ecx\n" + "movzbl 1(%esi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" +@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq (%edi,%ecx,8),%mm1\n" + + "cmp 0x34(%esp), %ebx\n" +- "jge .lscalelastpixel\n" ++ "jge 2f\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" +@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" + +-".lscaleend:" ++"1:" + "cmp %ebx, 0x34(%esp)\n" +- "jg .lscaleloop\n" ++ "jg 0b\n" + "popa\n" + "ret\n" + +-".lscalelastpixel:" ++"2:" + "paddsw %mm0, %mm1\n" + "psraw $6, %mm1\n" + "packuswb %mm1, %mm1\n" + "movd %mm1, (%ebp)\n" + "popa\n" + "ret\n" ++#if !defined(XP_MACOSX) ++ ".previous\n" ++#endif + ); + ++ + void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { +- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, +- &kCoefficientsRgbY[0][0]); +-} +- +-#else // USE_MMX +- +-// C reference code that mimic the YUV assembly. +-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ +- (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) +- +-static inline void YuvPixel(uint8 y, +- uint8 u, +- uint8 v, +- uint8* rgb_buf) { +- +- int b = kCoefficientsRgbY[256+u][0]; +- int g = kCoefficientsRgbY[256+u][1]; +- int r = kCoefficientsRgbY[256+u][2]; +- int a = kCoefficientsRgbY[256+u][3]; +- +- b = paddsw(b, kCoefficientsRgbY[512+v][0]); +- g = paddsw(g, kCoefficientsRgbY[512+v][1]); +- r = paddsw(r, kCoefficientsRgbY[512+v][2]); +- a = paddsw(a, kCoefficientsRgbY[512+v][3]); +- +- b = paddsw(b, kCoefficientsRgbY[y][0]); +- g = paddsw(g, kCoefficientsRgbY[y][1]); +- r = paddsw(r, kCoefficientsRgbY[y][2]); +- a = paddsw(a, kCoefficientsRgbY[y][3]); +- +- b >>= 6; +- g >>= 6; +- r >>= 6; +- a >>= 6; +- +- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | +- (packuswb(g) << 8) | +- (packuswb(r) << 16) | +- (packuswb(a) << 24); +-} +- ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) ++{ ++ if (mozilla::supports_sse()) { ++ PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, ++ source_dx, &kCoefficientsRgbY[0][0]); ++ return; ++ } ++ ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++} ++#else + void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { +- for (int x = 0; x < width; x += 2) { +- uint8 u = u_buf[x >> 1]; +- uint8 v = v_buf[x >> 1]; +- uint8 y0 = y_buf[x]; +- YuvPixel(y0, u, v, rgb_buf); +- if ((x + 1) < width) { +- uint8 y1 = y_buf[x + 1]; +- YuvPixel(y1, u, v, rgb_buf + 4); +- } +- rgb_buf += 8; // Advance 2 pixels. +- } +-} +- +-// 16.16 fixed point is used. A shift by 16 isolates the integer. +-// A shift by 17 is used to further subsample the chrominence channels. +-// & 0xffff isolates the fixed point fraction. 
>> 2 to get the upper 2 bits, +-// for 1/65536 pixel accurate interpolation. ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); ++} ++ + void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { +- int x = 0; +- for (int i = 0; i < width; i += 2) { +- int y = y_buf[x >> 16]; +- int u = u_buf[(x >> 17)]; +- int v = v_buf[(x >> 17)]; +- YuvPixel(y, u, v, rgb_buf); +- x += source_dx; +- if ((i + 1) < width) { +- y = y_buf[x >> 16]; +- YuvPixel(y, u, v, rgb_buf+4); +- x += source_dx; +- } +- rgb_buf += 8; +- } +-} ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++} + + void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { +- int x = 0; +- if (source_dx >= 0x20000) { +- x = 32768; +- } +- for (int i = 0; i < width; i += 2) { +- int y0 = y_buf[x >> 16]; +- int y1 = y_buf[(x >> 16) + 1]; +- int u0 = u_buf[(x >> 17)]; +- int u1 = u_buf[(x >> 17) + 1]; +- int v0 = v_buf[(x >> 17)]; +- int v1 = v_buf[(x >> 17) + 1]; +- int y_frac = (x & 65535); +- int uv_frac = ((x >> 1) & 65535); +- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; +- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; +- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; +- YuvPixel(y, u, v, rgb_buf); +- x += source_dx; +- if ((i + 1) < width) { +- y0 = y_buf[x >> 16]; +- y1 = y_buf[(x >> 16) + 1]; +- y_frac = (x & 65535); +- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; +- YuvPixel(y, u, v, rgb_buf+4); +- x += source_dx; +- } +- rgb_buf += 8; +- } +-} +- +-#endif // USE_MMX +-} // extern "C" +- ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++} ++#endif ++ ++} +diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp +--- a/gfx/ycbcr/yuv_row_table.cpp ++++ b/gfx/ycbcr/yuv_row_table.cpp +@@ -1,13 +1,13 @@ + // Copyright (c) 2010 The Chromium Authors. All rights reserved. + // Use of this source code is governed by a BSD-style license that can be + // found in the LICENSE file. + +-#include "media/base/yuv_row.h" ++#include "yuv_row.h" + + extern "C" { + + #define RGBY(i) { \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + 0 \ +diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp +--- a/gfx/ycbcr/yuv_row_win.cpp ++++ b/gfx/ycbcr/yuv_row_win.cpp +@@ -1,26 +1,27 @@ + // Copyright (c) 2010 The Chromium Authors. All rights reserved. + // Use of this source code is governed by a BSD-style license that can be + // found in the LICENSE file. 
+ +-#include "media/base/yuv_row.h" ++#include "yuv_row.h" ++#include "mozilla/SSE.h" + + #define kCoefficientsRgbU kCoefficientsRgbY + 2048 + #define kCoefficientsRgbV kCoefficientsRgbY + 4096 + + extern "C" { + +-#if USE_MMX +-__declspec(naked) +-void FastConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width) { ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) ++__declspec(naked) ++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp convertend +@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint + convertdone : + + popad + ret + } + } + + __declspec(naked) +-void ConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int step) { ++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int step) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + mov ebx, [esp + 32 + 24] // step +@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y + wdone : + + popad + ret + } + } + + __declspec(naked) +-void RotateConvertYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int ystep, +- int uvstep) { ++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int ystep, ++ int uvstep) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend +@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui + wdone : + + popad + ret + } + } + + __declspec(naked) +-void DoubleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width) { ++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend +@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_ + jns wloop1 + wdone : + popad + ret + } + } + + // This version does general purpose scaling by any amount, up or down. +-// The only thing it can not do it rotation by 90 or 270. +-// For performance the chroma is under sampled, reducing cost of a 3x ++// The only thing it cannot do is rotation by 90 or 270. ++// For performance the chroma is under-sampled, reducing cost of a 3x + // 1080p scale from 8.4 ms to 5.4 ms. 
+ __declspec(naked) +-void ScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { ++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + xor ebx, ebx // x +@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b + + scaledone : + popad + ret + } + } + + __declspec(naked) +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, +- const uint8* u_buf, +- const uint8* v_buf, +- uint8* rgb_buf, +- int width, +- int source_dx) { ++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + // [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + imul ecx, [esp + 32 + 24] // source_dx +@@ -438,152 +439,60 @@ lscalelastpixel: + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + popad + ret + }; + } +-#else // USE_MMX +- +-// C reference code that mimic the YUV assembly. +-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ +- (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) +- +-static inline void YuvPixel(uint8 y, +- uint8 u, +- uint8 v, +- uint8* rgb_buf) { +- +- int b = kCoefficientsRgbY[256+u][0]; +- int g = kCoefficientsRgbY[256+u][1]; +- int r = kCoefficientsRgbY[256+u][2]; +- int a = kCoefficientsRgbY[256+u][3]; +- +- b = paddsw(b, kCoefficientsRgbY[512+v][0]); +- g = paddsw(g, kCoefficientsRgbY[512+v][1]); +- r = paddsw(r, kCoefficientsRgbY[512+v][2]); +- a = paddsw(a, kCoefficientsRgbY[512+v][3]); +- +- b = paddsw(b, kCoefficientsRgbY[y][0]); +- g = paddsw(g, kCoefficientsRgbY[y][1]); +- r = paddsw(r, kCoefficientsRgbY[y][2]); +- a = paddsw(a, kCoefficientsRgbY[y][3]); +- +- b >>= 6; +- g >>= 6; +- r >>= 6; +- a >>= 6; +- +- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | +- (packuswb(g) << 8) | +- (packuswb(r) << 16) | +- (packuswb(a) << 24); +-} +- +-#if TEST_MMX_YUV +-static inline void YuvPixel(uint8 y, +- uint8 u, +- uint8 v, +- uint8* rgb_buf) { +- +- __asm { +- movzx eax, u +- movq mm0, [kCoefficientsRgbY+2048 + 8 * eax] +- movzx eax, v +- paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax] +- movzx eax, y +- movq mm1, [kCoefficientsRgbY + 8 * eax] +- paddsw mm1, mm0 +- psraw mm1, 6 +- packuswb mm1, mm1 +- mov eax, rgb_buf +- movd [eax], mm1 +- emms +- } +-} +-#endif ++#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + + void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { +- for (int x = 0; x < width; x += 2) { +- uint8 u = u_buf[x >> 1]; +- uint8 v = v_buf[x >> 1]; +- uint8 y0 = y_buf[x]; +- YuvPixel(y0, u, v, rgb_buf); +- if ((x + 1) < width) { +- uint8 y1 = y_buf[x + 1]; +- YuvPixel(y1, u, v, rgb_buf + 4); +- } +- rgb_buf += 8; // Advance 2 pixels. +- } +-} +- +-// 16.16 fixed point is used. A shift by 16 isolates the integer. +-// A shift by 17 is used to further subsample the chrominence channels. +-// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, +-// for 1/65536 pixel accurate interpolation. 
++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) ++ if (mozilla::supports_sse()) { ++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); ++ return; ++ } ++#endif ++ ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); ++} ++ + void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { +- int x = 0; +- for (int i = 0; i < width; i += 2) { +- int y = y_buf[x >> 16]; +- int u = u_buf[(x >> 17)]; +- int v = v_buf[(x >> 17)]; +- YuvPixel(y, u, v, rgb_buf); +- x += source_dx; +- if ((i + 1) < width) { +- y = y_buf[x >> 16]; +- YuvPixel(y, u, v, rgb_buf+4); +- x += source_dx; +- } +- rgb_buf += 8; +- } +-} ++ ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) ++ if (mozilla::supports_sse()) { ++ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++ return; ++ } ++#endif ++ ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++} + + void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { +- int x = 0; +- if (source_dx >= 0x20000) { +- x = 32768; +- } +- for (int i = 0; i < width; i += 2) { +- int y0 = y_buf[x >> 16]; +- int y1 = y_buf[(x >> 16) + 1]; +- int u0 = u_buf[(x >> 17)]; +- int u1 = u_buf[(x >> 17) + 1]; +- int v0 = v_buf[(x >> 17)]; +- int v1 = v_buf[(x >> 17) + 1]; +- int y_frac = (x & 65535); +- int uv_frac = ((x >> 1) & 65535); +- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; +- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; +- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; +- YuvPixel(y, u, v, rgb_buf); +- x += source_dx; +- if ((i + 1) < width) { +- y0 = y_buf[x >> 16]; +- y1 = y_buf[(x >> 16) + 1]; +- y_frac = (x & 65535); +- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; +- YuvPixel(y, u, v, rgb_buf+4); +- x += source_dx; +- } +- rgb_buf += 8; +- } +-} +- +-#endif // USE_MMX +-} // extern "C" +- ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) ++ if (mozilla::supports_sse()) { ++ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, ++ source_dx); ++ return; ++ } ++#endif ++ ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++} ++ ++} // extern "C" diff --git a/gfx/ycbcr/moz.build b/gfx/ycbcr/moz.build new file mode 100644 index 000000000..04855e2e9 --- /dev/null +++ b/gfx/ycbcr/moz.build @@ -0,0 +1,65 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +EXPORTS += [ + 'YCbCrUtils.h', +] + +UNIFIED_SOURCES += [ + 'scale_yuv_argb.cpp', + 'ycbcr_to_rgb565.cpp', + 'YCbCrUtils.cpp', + 'yuv_convert.cpp', + 'yuv_row_c.cpp', + 'yuv_row_table.cpp', +] + +if CONFIG['INTEL_ARCHITECTURE']: + # These files use MMX and SSE2 intrinsics, so they need special compile flags + # on some compilers. + SOURCES += ['yuv_convert_sse2.cpp'] + SOURCES['yuv_convert_sse2.cpp'].flags += CONFIG['SSE2_FLAGS'] + + # MSVC doesn't support MMX when targeting AMD64. 
+    if CONFIG['_MSC_VER']:
+        if CONFIG['OS_TEST'] != 'x86_64':
+            SOURCES += [
+                'yuv_convert_mmx.cpp',
+            ]
+    else:
+        SOURCES += ['yuv_convert_mmx.cpp']
+        SOURCES['yuv_convert_mmx.cpp'].flags += CONFIG['MMX_FLAGS']
+
+if CONFIG['_MSC_VER']:
+    if CONFIG['OS_TEST'] == 'x86_64':
+        SOURCES += [
+            'yuv_row_win64.cpp',
+        ]
+    else:
+        SOURCES += [
+            'yuv_row_win.cpp',
+        ]
+elif CONFIG['OS_ARCH'] in ('Linux', 'SunOS', 'Darwin', 'DragonFly',
+                           'FreeBSD', 'NetBSD', 'OpenBSD'):
+    SOURCES += [
+        'yuv_row_posix.cpp',
+    ]
+else:
+    SOURCES += [
+        'yuv_row_other.cpp',
+    ]
+
+if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['HAVE_ARM_NEON']:
+    SOURCES += [
+        'yuv_row_arm.s',
+    ]
+    SOURCES += [
+        'yuv_convert_arm.cpp',
+    ]
+
+LOCAL_INCLUDES += ['/media/libyuv/include']
+
+FINAL_LIBRARY = 'xul'
diff --git a/gfx/ycbcr/scale_yuv_argb.cpp b/gfx/ycbcr/scale_yuv_argb.cpp
new file mode 100644
index 000000000..91a96cb9f
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.cpp
@@ -0,0 +1,1126 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ * Copyright 2016 Mozilla Foundation
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// YUV to RGB conversion and scaling functions were implemented by referencing
+// scale_argb.cc.
+//
+// libyuv already has ScaleYUVToARGBBilinearUp(), but its implementation is not
+// complete yet, so the implementations here are based on it. At first,
+// ScaleYUVToARGBBilinearUp() was implemented by modifying libyuv's version.
+// Then all the other functions were implemented similarly.
+//
+// The function relationships between yuv_convert.cpp and scale_argb.cc are
+// as follows:
+// - ScaleYUVToARGBDown2()        <-- ScaleARGBDown2()
+// - ScaleYUVToARGBDownEven()     <-- ScaleARGBDownEven()
+// - ScaleYUVToARGBBilinearDown() <-- ScaleARGBBilinearDown()
+// - ScaleYUVToARGBBilinearUp()   <-- ScaleARGBBilinearUp() and ScaleYUVToARGBBilinearUp() in libyuv
+// - ScaleYUVToARGBSimple()       <-- ScaleARGBSimple()
+// - ScaleYUVToARGB()             <-- ScaleARGB() // Removed some function calls for simplicity.
+// - YUVToARGBScale()             <-- ARGBScale()
+//
+// Calls to and selection of InterpolateRow() and ScaleARGBFilterCols() were
+// kept as close to the originals as possible.
+//
+// The following changes were made to each scaling function:
+//
+// -[1] Allocate a YUV conversion buffer and use it as the source buffer of
+//      the scaling. Its usage is borrowed from libyuv's
+//      ScaleYUVToARGBBilinearUp().
+// -[2] Conversion from YUV to RGB was abstracted as YUVBuferIter, in order
+//      to handle multiple YUV color formats.
+// -[3] Modified the scaling functions to handle the YUV conversion buffer
+//      and use YUVBuferIter.
+// -[4] Color conversion function selections in YUVBuferIter were borrowed from
+//      I444ToARGBMatrix(), I422ToARGBMatrix() and I420ToARGBMatrix()
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+struct YUVBuferIter {
+  int src_width;
+  int src_height;
+  int src_stride_y;
+  int src_stride_u;
+  int src_stride_v;
+  const uint8* src_y;
+  const uint8* src_u;
+  const uint8* src_v;
+
+  uint32 src_fourcc;
+  const struct YuvConstants* yuvconstants;
+  int y_index;
+  const uint8* src_row_y;
+  const uint8* src_row_u;
+  const uint8* src_row_v;
+
+  void (*YUVToARGBRow)(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+  void (*MoveTo)(YUVBuferIter& iter, int y_index);
+  void (*MoveToNextRow)(YUVBuferIter& iter);
+};
+
+void YUVBuferIter_InitI422(YUVBuferIter& iter) {
+  iter.YUVToARGBRow = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(iter.src_width, 16)) {
+      iter.YUVToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(iter.src_width, 4) &&
+      IS_ALIGNED(iter.src_y, 4) && IS_ALIGNED(iter.src_stride_y, 4) &&
+      IS_ALIGNED(iter.src_u, 2) && IS_ALIGNED(iter.src_stride_u, 2) &&
+      IS_ALIGNED(iter.src_v, 2) && IS_ALIGNED(iter.src_stride_v, 2)) {
+    // Always satisfy IS_ALIGNED(argb_cnv_row, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)
+    iter.YUVToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+}
+
+void YUVBuferIter_InitI444(YUVBuferIter& iter) {
+  iter.YUVToARGBRow = I444ToARGBRow_C;
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    iter.YUVToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I444ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    iter.YUVToARGBRow = I444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(iter.src_width, 16)) {
+      iter.YUVToARGBRow = I444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    iter.YUVToARGBRow = I444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I444ToARGBRow_NEON;
+    }
+  }
+#endif
+}
+
+
+static void YUVBuferIter_MoveToForI444(YUVBuferIter& iter, int y_index) {
+  iter.y_index = y_index;
+  iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;
+  iter.src_row_u = iter.src_u + y_index * iter.src_stride_u;
+  iter.src_row_v = iter.src_v + y_index * iter.src_stride_v;
+}
+
+static void YUVBuferIter_MoveToNextRowForI444(YUVBuferIter& iter) {
+  iter.src_row_y += iter.src_stride_y;
+  iter.src_row_u += iter.src_stride_u;
+  iter.src_row_v += iter.src_stride_v;
+  iter.y_index++;
+}
+
+static void YUVBuferIter_MoveToForI422(YUVBuferIter& iter, int y_index) {
+  iter.y_index = y_index;
+  iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;
+  iter.src_row_u = iter.src_u + y_index * iter.src_stride_u;
+  iter.src_row_v = iter.src_v + y_index * iter.src_stride_v;
+}
+
+static void YUVBuferIter_MoveToNextRowForI422(YUVBuferIter& iter) {
+  iter.src_row_y += iter.src_stride_y;
+  iter.src_row_u += iter.src_stride_u;
+  iter.src_row_v += iter.src_stride_v;
+  iter.y_index++;
+}
+
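The struct and helpers above (change [2] in the header comment) form a small row-iterator protocol: set the source fields, call Init, then alternate ConvertToARGBRow and MoveToNextRow. A rough usage sketch at 1:1 scale, ignoring declaration order; the caller, buffer sizes, and BT.709 color space are assumptions for illustration, not part of this file (the real scalers below convert rows on demand instead):

    // Hypothetical caller: walk an I420 frame one converted ARGB row at a
    // time. argb_row must hold width * 4 bytes.
    static void ConvertFrame(const uint8* y, const uint8* u, const uint8* v,
                             int stride_y, int stride_uv,
                             int width, int height, uint8* argb_row) {
      YUVBuferIter iter;
      iter.src_width = width;
      iter.src_height = height;
      iter.src_stride_y = stride_y;
      iter.src_stride_u = stride_uv;
      iter.src_stride_v = stride_uv;
      iter.src_y = y;
      iter.src_u = u;
      iter.src_v = v;
      YUVBuferIter_Init(iter, FOURCC_I420, mozilla::YUVColorSpace::BT709);
      for (int row = 0; row < height; ++row) {
        YUVBuferIter_ConvertToARGBRow(iter, argb_row);  // one ARGB row out
        iter.MoveToNextRow(iter);  // I420 steps chroma every second row
      }
    }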
+static void YUVBuferIter_MoveToForI420(YUVBuferIter& iter, int y_index) { + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int uv_y_index = y_index >> kYShift; + + iter.y_index = y_index; + iter.src_row_y = iter.src_y + y_index * iter.src_stride_y; + iter.src_row_u = iter.src_u + uv_y_index * iter.src_stride_u; + iter.src_row_v = iter.src_v + uv_y_index * iter.src_stride_v; +} + +static void YUVBuferIter_MoveToNextRowForI420(YUVBuferIter& iter) { + iter.src_row_y += iter.src_stride_y; + if (iter.y_index & 1) { + iter.src_row_u += iter.src_stride_u; + iter.src_row_v += iter.src_stride_v; + } + iter.y_index++; +} + +static __inline void YUVBuferIter_ConvertToARGBRow(YUVBuferIter& iter, uint8* argb_row) { + iter.YUVToARGBRow(iter.src_row_y, iter.src_row_u, iter.src_row_v, argb_row, iter.yuvconstants, iter.src_width); +} + +void YUVBuferIter_Init(YUVBuferIter& iter, uint32 src_fourcc, mozilla::YUVColorSpace yuv_color_space) { + iter.src_fourcc = src_fourcc; + iter.y_index = 0; + iter.src_row_y = iter.src_y; + iter.src_row_u = iter.src_u; + iter.src_row_v = iter.src_v; + if (yuv_color_space == mozilla::YUVColorSpace::BT709) { + iter.yuvconstants = &kYuvH709Constants; + } else { + iter.yuvconstants = &kYuvI601Constants; + } + + if (src_fourcc == FOURCC_I444) { + YUVBuferIter_InitI444(iter); + iter.MoveTo = YUVBuferIter_MoveToForI444; + iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI444; + } else if(src_fourcc == FOURCC_I422){ + YUVBuferIter_InitI422(iter); + iter.MoveTo = YUVBuferIter_MoveToForI422; + iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI422; + } else { + assert(src_fourcc == FOURCC_I420); // Should be FOURCC_I420 + YUVBuferIter_InitI422(iter); + iter.MoveTo = YUVBuferIter_MoveToForI420; + iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI420; + } +} + +// ScaleARGB ARGB, 1/2 +// This is an optimized version for scaling down a ARGB to 1/2 of +// its original size. +static void ScaleYUVToARGBDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space) { + int j; + + // Allocate 2 rows of ARGB for source conversion. + const int kRowSize = (src_width * 4 + 15) & ~15; + align_buffer_64(argb_cnv_row, kRowSize * 2); + uint8* argb_cnv_rowptr = argb_cnv_row; + int argb_cnv_rowstride = kRowSize; + + YUVBuferIter iter; + iter.src_width = src_width; + iter.src_height = src_height; + iter.src_stride_y = src_stride_y; + iter.src_stride_u = src_stride_u; + iter.src_stride_v = src_stride_v; + iter.src_y = src_y; + iter.src_u = src_u; + iter.src_v = src_v; + YUVBuferIter_Init(iter, src_fourcc, yuv_color_space); + + void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) = + filtering == kFilterNone ? ScaleARGBRowDown2_C : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : + ScaleARGBRowDown2Box_C); + assert(dx == 65536 * 2); // Test scale factor of 2. + assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. + // Advance to odd row, even column. 
+ int yi = y >> 16; + iter.MoveTo(iter, yi); + ptrdiff_t x_offset; + if (filtering == kFilterBilinear) { + x_offset = (x >> 16) * 4; + } else { + x_offset = ((x >> 16) - 1) * 4; + } +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 : + ScaleARGBRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : + ScaleARGBRowDown2Box_SSE2); + } + } + +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON : + ScaleARGBRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : + ScaleARGBRowDown2Box_NEON); + } + } +#endif + + const int dyi = dy >> 16; + int lastyi = yi; + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + // Prepare next row if necessary + if (filtering != kFilterLinear) { + if ((yi + dyi) < (src_height - 1)) { + iter.MoveTo(iter, yi + dyi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride); + } else { + argb_cnv_rowstride = 0; + } + } + + if (filtering == kFilterLinear) { + argb_cnv_rowstride = 0; + } + const int max_yi = src_height - 1; + const int max_yi_minus_dyi = max_yi - dyi; + for (j = 0; j < dst_height; ++j) { + if (yi != lastyi) { + if (yi > max_yi) { + yi = max_yi; + } + if (yi != lastyi) { + if (filtering == kFilterLinear) { + iter.MoveTo(iter, yi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + lastyi = yi; + } else { + // Prepare current row + if (yi == iter.y_index) { + argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride; + argb_cnv_rowstride = - argb_cnv_rowstride; + } else { + iter.MoveTo(iter, yi); + argb_cnv_rowptr = argb_cnv_row; + argb_cnv_rowstride = kRowSize; + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + } + // Prepare next row if necessary + if (iter.y_index < max_yi) { + int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi; + iter.MoveTo(iter, next_yi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride); + } else { + argb_cnv_rowstride = 0; + } + lastyi = yi; + } + } + } + ScaleARGBRowDown2(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, dst_argb, dst_width); + dst_argb += dst_stride_argb; + yi += dyi; + } + + free_aligned_buffer_64(argb_cnv_row); +} + +// ScaleARGB ARGB Even +// This is an optimized version for scaling down a ARGB to even +// multiple of its original size. +static void ScaleYUVToARGBDownEven(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space) { + int j; + // Allocate 2 rows of ARGB for source conversion. 
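+  // Note: "(src_width * 4 + 15) & ~15" below rounds the row byte width up
+  // to a multiple of 16 so each converted row stays 16-byte aligned for the
+  // SIMD row functions (e.g. src_width 3 -> 12 bytes -> kRowSize 16).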
+ const int kRowSize = (src_width * 4 + 15) & ~15; + align_buffer_64(argb_cnv_row, kRowSize * 2); + uint8* argb_cnv_rowptr = argb_cnv_row; + int argb_cnv_rowstride = kRowSize; + + int col_step = dx >> 16; + void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, + int src_step, uint8* dst_argb, int dst_width) = + filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + int yi = y >> 16; + const ptrdiff_t x_offset = (x >> 16) * 4; + +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : + ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : + ScaleARGBRowDownEven_SSE2; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON : + ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : + ScaleARGBRowDownEven_NEON; + } + } +#endif + + YUVBuferIter iter; + iter.src_width = src_width; + iter.src_height = src_height; + iter.src_stride_y = src_stride_y; + iter.src_stride_u = src_stride_u; + iter.src_stride_v = src_stride_v; + iter.src_y = src_y; + iter.src_u = src_u; + iter.src_v = src_v; + YUVBuferIter_Init(iter, src_fourcc, yuv_color_space); + + const int dyi = dy >> 16; + int lastyi = yi; + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + // Prepare next row if necessary + if (filtering != kFilterLinear) { + if ((yi + dyi) < (src_height - 1)) { + iter.MoveTo(iter, yi + dyi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride); + } else { + argb_cnv_rowstride = 0; + } + } + + if (filtering == kFilterLinear) { + argb_cnv_rowstride = 0; + } + const int max_yi = src_height - 1; + const int max_yi_minus_dyi = max_yi - dyi; + for (j = 0; j < dst_height; ++j) { + if (yi != lastyi) { + if (yi > max_yi) { + yi = max_yi; + } + if (yi != lastyi) { + if (filtering == kFilterLinear) { + iter.MoveTo(iter, yi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + lastyi = yi; + } else { + // Prepare current row + if (yi == iter.y_index) { + argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride; + argb_cnv_rowstride = - argb_cnv_rowstride; + } else { + iter.MoveTo(iter, yi); + argb_cnv_rowptr = argb_cnv_row; + argb_cnv_rowstride = kRowSize; + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + } + // Prepare next row if necessary + if (iter.y_index < max_yi) { + int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi; + iter.MoveTo(iter, next_yi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride); + } else { + argb_cnv_rowstride = 0; + } + lastyi = yi; + } + } + } + ScaleARGBRowDownEven(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, col_step, dst_argb, dst_width); + dst_argb += dst_stride_argb; + yi += dyi; + } + free_aligned_buffer_64(argb_cnv_row); +} + +// Scale YUV to ARGB down with bilinear interpolation. 
+static void ScaleYUVToARGBBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; + int64 xlast = x + (int64)(dst_width - 1) * dx; + int64 xl = (dx >= 0) ? x : xlast; + int64 xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. + const ptrdiff_t xl_offset = xl * 4; + x -= (int)(xl << 16); + + // Allocate 2 row of ARGB for source conversion. + const int kRowSize = (src_width * 4 + 15) & ~15; + align_buffer_64(argb_cnv_row, kRowSize * 2); + uint8* argb_cnv_rowptr = argb_cnv_row; + int argb_cnv_rowstride = kRowSize; + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)) { + InterpolateRow = InterpolateRow_Any_DSPR2; + if (IS_ALIGNED(clip_src_width, 4)) { + InterpolateRow = InterpolateRow_DSPR2; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif + + int yi = y >> 16; + + YUVBuferIter iter; + iter.src_width = src_width; + iter.src_height = src_height; + iter.src_stride_y = src_stride_y; + iter.src_stride_u = src_stride_u; + iter.src_stride_v = src_stride_v; + iter.src_y = src_y; + iter.src_u = src_u; + iter.src_v = src_v; + YUVBuferIter_Init(iter, src_fourcc, yuv_color_space); + iter.MoveTo(iter, yi); + + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. 
+ align_buffer_64(row, clip_src_width * 4); + + int lastyi = yi; + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + // Prepare next row if necessary + if (filtering != kFilterLinear) { + if ((yi + 1) < src_height) { + iter.MoveToNextRow(iter); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride); + } else { + argb_cnv_rowstride = 0; + } + } + + const int max_y = (src_height - 1) << 16; + const int max_yi = src_height - 1; + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lastyi) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + } + if (yi != lastyi) { + if (filtering == kFilterLinear) { + iter.MoveTo(iter, yi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + lastyi = yi; + } else { + // Prepare current row + if (yi == iter.y_index) { + argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride; + argb_cnv_rowstride = - argb_cnv_rowstride; + } else { + iter.MoveTo(iter, yi); + argb_cnv_rowptr = argb_cnv_row; + argb_cnv_rowstride = kRowSize; + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr); + } + // Prepare next row if necessary + if (iter.y_index < max_yi) { + iter.MoveToNextRow(iter); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride); + } else { + argb_cnv_rowstride = 0; + } + lastyi = yi; + } + } + } + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, argb_cnv_rowptr + xl_offset, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, argb_cnv_rowptr + xl_offset, argb_cnv_rowstride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride_argb; + y += dy; + } + free_aligned_buffer_64(row); + free_aligned_buffer_64(argb_cnv_row); +} + +// Scale YUV to ARGB up with bilinear interpolation. +static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_cnv_row, src_width * 4); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_DSPR2; + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? 
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + int yi = y >> 16; + + YUVBuferIter iter; + iter.src_width = src_width; + iter.src_height = src_height; + iter.src_stride_y = src_stride_y; + iter.src_stride_u = src_stride_u; + iter.src_stride_v = src_stride_v; + iter.src_y = src_y; + iter.src_u = src_u; + iter.src_v = src_v; + YUVBuferIter_Init(iter, src_fourcc, yuv_color_space); + iter.MoveTo(iter, yi); + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lastyi = yi; + + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row); + ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx); + + if (filtering == kFilterLinear) { + rowstride = 0; + } + // Prepare next row if necessary + if (filtering != kFilterLinear) { + if ((yi + 1) < src_height) { + iter.MoveToNextRow(iter); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row); + ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx); + }else { + rowstride = 0; + } + } + + const int max_yi = src_height - 1; + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lastyi) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + } + if (yi != lastyi) { + if (filtering == kFilterLinear) { + iter.MoveToNextRow(iter); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row); + ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx); + } else { + // Prepare next row if necessary + if (yi < max_yi) { + iter.MoveToNextRow(iter); + rowptr += rowstride; + rowstride = -rowstride; + // TODO(fbarchard): Convert the clipped region of row. + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row); + ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx); + } else { + rowstride = 0; + } + } + lastyi = yi; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride_argb; + y += dy; + } + free_aligned_buffer_64(row); + free_aligned_buffer_64(argb_cnv_row); +} + +// Scale ARGB to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. 
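+//
+// As a minimal sketch (ignoring the centering offsets that ScaleSlope()
+// applies), scaling an 8 pixel row down to 5 pixels gives
+// dx = (8 << 16) / 5 = 0x19999, a step of roughly 1.6 source pixels, and the
+// nearest-neighbor walk is:
+//
+//   int x = 0;
+//   for (int i = 0; i < dst_width; ++i) {
+//     dst[i] = src[x >> 16];  // upper 16 bits select the source pixel
+//     x += dx;                // lower 16 bits accumulate the fraction
+//   }
+//
+// which samples source pixels 0, 1, 3, 4 and 6.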
+ +static void ScaleYUVToARGBSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space) { + int j; + void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_cnv_row, src_width * 4); + +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBCols = ScaleARGBCols_NEON; + } + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + int yi = y >> 16; + + YUVBuferIter iter; + iter.src_width = src_width; + iter.src_height = src_height; + iter.src_stride_y = src_stride_y; + iter.src_stride_u = src_stride_u; + iter.src_stride_v = src_stride_v; + iter.src_y = src_y; + iter.src_u = src_u; + iter.src_v = src_v; + YUVBuferIter_Init(iter, src_fourcc, yuv_color_space); + iter.MoveTo(iter, yi); + + int lasty = yi; + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row); + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + iter.MoveTo(iter, yi); + YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row); + lasty = yi; + } + ScaleARGBCols(dst_argb, argb_cnv_row, dst_width, x, dx); + dst_argb += dst_stride_argb; + y += dy; + } + free_aligned_buffer_64(argb_cnv_row); +} + +static void YUVToARGBCopy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space) +{ + YUVBuferIter iter; + iter.src_width = src_width; + iter.src_height = src_height; + iter.src_stride_y = src_stride_y; + iter.src_stride_u = src_stride_u; + iter.src_stride_v = src_stride_v; + iter.src_y = src_y; + iter.src_u = src_u; + iter.src_v = src_v; + YUVBuferIter_Init(iter, src_fourcc, yuv_color_space); + + for (int j = 0; j < dst_height; ++j) { + YUVBuferIter_ConvertToARGBRow(iter, dst_argb); + iter.MoveToNextRow(iter); + dst_argb += dst_stride_argb; + } +} + +static void ScaleYUVToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space) +{ + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. 
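+  // Dispatch overview for the branches below: even integer steps take the
+  // optimized Down2 or DownEven paths and a 1:1 step becomes a straight-copy
+  // conversion; a filtered scale with a vertical step under 1.0 (dy < 65536)
+  // uses ScaleYUVToARGBBilinearUp, any other filtered scale uses
+  // ScaleYUVToARGBBilinearDown, and unfiltered scales fall through to the
+  // point-sampled ScaleYUVToARGBSimple.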
+ filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleYUVToARGBDown2(src_width, src_height, + dst_width, dst_height, + src_stride_y, + src_stride_u, + src_stride_v, + dst_stride_argb, + src_y, + src_u, + src_v, + dst_argb, + x, dx, y, dy, + filtering, + src_fourcc, + yuv_color_space); + return; + } + ScaleYUVToARGBDownEven(src_width, src_height, + dst_width, dst_height, + src_stride_y, + src_stride_u, + src_stride_v, + dst_stride_argb, + src_y, + src_u, + src_v, + dst_argb, + x, dx, y, dy, + filtering, + src_fourcc, + yuv_color_space); + return; + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; + if (dx == 0x10000 && dy == 0x10000) { + // Straight conversion and copy. + YUVToARGBCopy(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_argb, dst_stride_argb, + dst_width, dst_height, + src_fourcc, + yuv_color_space); + return; + } + } + } + } + if (filtering && dy < 65536) { + ScaleYUVToARGBBilinearUp(src_width, src_height, + dst_width, dst_height, + src_stride_y, + src_stride_u, + src_stride_v, + dst_stride_argb, + src_y, + src_u, + src_v, + dst_argb, + x, dx, y, dy, + filtering, + src_fourcc, + yuv_color_space); + return; + } + if (filtering) { + ScaleYUVToARGBBilinearDown(src_width, src_height, + dst_width, dst_height, + src_stride_y, + src_stride_u, + src_stride_v, + dst_stride_argb, + src_y, + src_u, + src_v, + dst_argb, + x, dx, y, dy, + filtering, + src_fourcc, + yuv_color_space); + return; + } + ScaleYUVToARGBSimple(src_width, src_height, + dst_width, dst_height, + src_stride_y, + src_stride_u, + src_stride_v, + dst_stride_argb, + src_y, + src_u, + src_v, + dst_argb, + x, dx, y, dy, + src_fourcc, + yuv_color_space); +} + +bool IsConvertSupported(uint32 src_fourcc) +{ + if (src_fourcc == FOURCC_I444 || + src_fourcc == FOURCC_I422 || + src_fourcc == FOURCC_I420) { + return true; + } + return false; +} + +LIBYUV_API +int YUVToARGBScale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering) +{ + if (!src_y || !src_u || !src_v || + src_width == 0 || src_height == 0 || + !dst_argb || dst_width <= 0 || dst_height <= 0) { + return -1; + } + if (!IsConvertSupported(src_fourcc)) { + return -1; + } + ScaleYUVToARGB(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_argb, dst_stride_argb, + dst_width, dst_height, + filtering, + src_fourcc, + yuv_color_space); + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/gfx/ycbcr/scale_yuv_argb.h b/gfx/ycbcr/scale_yuv_argb.h new file mode 100644 index 000000000..d1a42db1b --- /dev/null +++ b/gfx/ycbcr/scale_yuv_argb.h @@ -0,0 +1,39 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#include "ImageTypes.h" // For YUVColorSpace + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +int YUVToARGBScale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + mozilla::YUVColorSpace yuv_color_space, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_ NOLINT diff --git a/gfx/ycbcr/update.sh b/gfx/ycbcr/update.sh new file mode 100644 index 000000000..3a38fe81a --- /dev/null +++ b/gfx/ycbcr/update.sh @@ -0,0 +1,12 @@ +# update.sh <chromium-src-directory> +cp $1/media/base/yuv_convert.h . +cp $1/media/base/yuv_convert.cc yuv_convert.cpp +cp $1/media/base/yuv_row.h . +cp $1/media/base/yuv_row_table.cc yuv_row_table.cpp +cp $1/media/base/yuv_row_posix.cc yuv_row_posix.cpp +cp $1/media/base/yuv_row_win.cc yuv_row_win.cpp +cp $1/media/base/yuv_row_posix.cc yuv_row_c.cpp +patch -p3 <convert.patch +patch -p3 <win64.patch +patch -p3 <TypeFromSize.patch +patch -p3 <QuellGccWarnings.patch diff --git a/gfx/ycbcr/win64.patch b/gfx/ycbcr/win64.patch new file mode 100644 index 000000000..bdccf2784 --- /dev/null +++ b/gfx/ycbcr/win64.patch @@ -0,0 +1,210 @@ +diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp +new file mode 100644 +--- /dev/null ++++ b/gfx/ycbcr/yuv_row_win64.cpp +@@ -0,0 +1,205 @@ ++// Copyright (c) 2010 The Chromium Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style license that can be ++// found in the LICENSE file. ++ ++#include "yuv_row.h" ++ ++extern "C" { ++ ++// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. 
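++//
++// The three lookup tables are laid out back to back: kCoefficientsRgbY holds
++// 256 entries of 8 bytes (four int16 terms each), so the U table starts 2048
++// bytes past it and the V table 4096 bytes past it. Each row function below
++// sums one entry per plane with saturating adds, shifts the result right by
++// 6, and packs it to unsigned bytes.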
++ ++#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048) ++#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096) ++ ++#include <emmintrin.h> ++ ++static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width) { ++ __m128i xmm0, xmmY1, xmmY2; ++ __m128 xmmY; ++ ++ while (width >= 2) { ++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), ++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); ++ ++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); ++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0); ++ ++ xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); ++ xmmY2 = _mm_adds_epi16(xmmY2, xmm0); ++ ++ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), ++ 0x44); ++ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); ++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); ++ ++ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); ++ rgb_buf += 8; ++ width -= 2; ++ } ++ ++ if (width) { ++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), ++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); ++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf)); ++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0); ++ xmmY1 = _mm_srai_epi16(xmmY1, 6); ++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); ++ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); ++ } ++} ++ ++static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { ++ __m128i xmm0, xmmY1, xmmY2; ++ __m128 xmmY; ++ uint8 u, v, y; ++ int x = 0; ++ ++ while (width >= 2) { ++ u = u_buf[x >> 17]; ++ v = v_buf[x >> 17]; ++ y = y_buf[x >> 16]; ++ x += source_dx; ++ ++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), ++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); ++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); ++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0); ++ ++ y = y_buf[x >> 16]; ++ x += source_dx; ++ ++ xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); ++ xmmY2 = _mm_adds_epi16(xmmY2, xmm0); ++ ++ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), ++ 0x44); ++ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); ++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); ++ ++ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); ++ rgb_buf += 8; ++ width -= 2; ++ } ++ ++ if (width) { ++ u = u_buf[x >> 17]; ++ v = v_buf[x >> 17]; ++ y = y_buf[x >> 16]; ++ ++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), ++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); ++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); ++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0); ++ xmmY1 = _mm_srai_epi16(xmmY1, 6); ++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); ++ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); ++ } ++} ++ ++static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, ++ const uint8* u_buf, ++ 
const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { ++ __m128i xmm0, xmmY1, xmmY2; ++ __m128 xmmY; ++ uint8 u0, u1, v0, v1, y0, y1; ++ uint32 uv_frac, y_frac, u, v, y; ++ int x = 0; ++ ++ if (source_dx >= 0x20000) { ++ x = 32768; ++ } ++ ++ while(width >= 2) { ++ u0 = u_buf[x >> 17]; ++ u1 = u_buf[(x >> 17) + 1]; ++ v0 = v_buf[x >> 17]; ++ v1 = v_buf[(x >> 17) + 1]; ++ y0 = y_buf[x >> 16]; ++ y1 = y_buf[(x >> 16) + 1]; ++ uv_frac = (x & 0x1fffe); ++ y_frac = (x & 0xffff); ++ u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; ++ v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; ++ y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; ++ x += source_dx; ++ ++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), ++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); ++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); ++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0); ++ ++ y0 = y_buf[x >> 16]; ++ y1 = y_buf[(x >> 16) + 1]; ++ y_frac = (x & 0xffff); ++ y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; ++ x += source_dx; ++ ++ xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); ++ xmmY2 = _mm_adds_epi16(xmmY2, xmm0); ++ ++ xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), ++ 0x44); ++ xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); ++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); ++ ++ _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); ++ rgb_buf += 8; ++ width -= 2; ++ } ++ ++ if (width) { ++ u = u_buf[x >> 17]; ++ v = v_buf[x >> 17]; ++ y = y_buf[x >> 16]; ++ ++ xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), ++ _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); ++ xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); ++ ++ xmmY1 = _mm_adds_epi16(xmmY1, xmm0); ++ xmmY1 = _mm_srai_epi16(xmmY1, 6); ++ xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); ++ *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); ++ } ++} ++ ++void FastConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width) { ++ FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); ++} ++ ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { ++ ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); ++} ++ ++void LinearScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int source_dx) { ++ LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, ++ source_dx); ++} ++ ++} // extern "C" diff --git a/gfx/ycbcr/ycbcr_to_rgb565.cpp b/gfx/ycbcr/ycbcr_to_rgb565.cpp new file mode 100644 index 000000000..0572e3e09 --- /dev/null +++ b/gfx/ycbcr/ycbcr_to_rgb565.cpp @@ -0,0 +1,672 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include <stdlib.h> +#include <limits.h> +#include "nsDebug.h" +#include "ycbcr_to_rgb565.h" +#include "nsAlgorithm.h" + + + +#ifdef HAVE_YCBCR_TO_RGB565 + +namespace mozilla { + +namespace gfx { + +/*This contains all of the parameters that are needed to convert a row. + Passing them in a struct instead of as individual parameters saves the need + to continually push onto the stack the ones that are fixed for every row.*/ +struct yuv2rgb565_row_scale_bilinear_ctx{ + uint16_t *rgb_row; + const uint8_t *y_row; + const uint8_t *u_row; + const uint8_t *v_row; + int y_yweight; + int y_pitch; + int width; + int source_x0_q16; + int source_dx_q16; + /*Not used for 4:4:4, except with chroma-nearest.*/ + int source_uv_xoffs_q16; + /*Not used for 4:4:4 or chroma-nearest.*/ + int uv_pitch; + /*Not used for 4:2:2, 4:4:4, or chroma-nearest.*/ + int uv_yweight; +}; + + + +/*This contains all of the parameters that are needed to convert a row. + Passing them in a struct instead of as individual parameters saves the need + to continually push onto the stack the ones that are fixed for every row.*/ +struct yuv2rgb565_row_scale_nearest_ctx{ + uint16_t *rgb_row; + const uint8_t *y_row; + const uint8_t *u_row; + const uint8_t *v_row; + int width; + int source_x0_q16; + int source_dx_q16; + /*Not used for 4:4:4.*/ + int source_uv_xoffs_q16; +}; + + + +typedef void (*yuv2rgb565_row_scale_bilinear_func)( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); + +typedef void (*yuv2rgb565_row_scale_nearest_func)( + const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither); + + + +//TODO: fix NEON asm for iOS +# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__) + +extern "C" void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); + +void __attribute((noinline)) yuv42x_to_rgb565_row_neon(uint16 *dst, + const uint8 *y, + const uint8 *u, + const uint8 *v, + int n, + int oddflag); + +#endif + + + +/*Bilinear interpolation of a single value. + This uses the exact same formulas as the asm, even though it adds some extra + shifts that do nothing but reduce accuracy.*/ +static int bislerp(const uint8_t *row, + int pitch, + int source_x, + int xweight, + int yweight) { + int a; + int b; + int c; + int d; + a = row[source_x]; + b = row[source_x+1]; + c = row[source_x+pitch]; + d = row[source_x+pitch+1]; + a = ((a<<8)+(c-a)*yweight+128)>>8; + b = ((b<<8)+(d-b)*yweight+128)>>8; + return ((a<<8)+(b-a)*xweight+128)>>8; +} + +/*Convert a single pixel from Y'CbCr to RGB565. 
+ This uses the exact same formulas as the asm, even though we could make the + constants a lot more accurate with 32-bit wide registers.*/ +static uint16_t yu2rgb565(int y, int u, int v, int dither) { + /*This combines the constant offset that needs to be added during the Y'CbCr + conversion with a rounding offset that depends on the dither parameter.*/ + static const int DITHER_BIAS[4][3]={ + {-14240, 8704, -17696}, + {-14240+128,8704+64, -17696+128}, + {-14240+256,8704+128,-17696+256}, + {-14240+384,8704+192,-17696+384} + }; + int r; + int g; + int b; + r = clamped((74*y+102*v+DITHER_BIAS[dither][0])>>9, 0, 31); + g = clamped((74*y-25*u-52*v+DITHER_BIAS[dither][1])>>8, 0, 63); + b = clamped((74*y+129*u+DITHER_BIAS[dither][2])>>9, 0, 31); + return (uint16_t)(r<<11 | g<<5 | b); +} + +static void ScaleYCbCr420ToRGB565_Bilinear_Row_C( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){ + int x; + int source_x_q16; + source_x_q16 = ctx->source_x0_q16; + for (x = 0; x < ctx->width; x++) { + int source_x; + int xweight; + int y; + int u; + int v; + xweight = ((source_x_q16&0xFFFF)+128)>>8; + source_x = source_x_q16>>16; + y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); + xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9; + source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17; + source_x_q16 += ctx->source_dx_q16; + u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight); + v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight); + ctx->rgb_row[x] = yu2rgb565(y, u, v, dither); + dither ^= 3; + } +} + +static void ScaleYCbCr422ToRGB565_Bilinear_Row_C( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){ + int x; + int source_x_q16; + source_x_q16 = ctx->source_x0_q16; + for (x = 0; x < ctx->width; x++) { + int source_x; + int xweight; + int y; + int u; + int v; + xweight = ((source_x_q16&0xFFFF)+128)>>8; + source_x = source_x_q16>>16; + y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); + xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9; + source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17; + source_x_q16 += ctx->source_dx_q16; + u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight); + v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight); + ctx->rgb_row[x] = yu2rgb565(y, u, v, dither); + dither ^= 3; + } +} + +static void ScaleYCbCr444ToRGB565_Bilinear_Row_C( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){ + int x; + int source_x_q16; + source_x_q16 = ctx->source_x0_q16; + for (x = 0; x < ctx->width; x++) { + int source_x; + int xweight; + int y; + int u; + int v; + xweight = ((source_x_q16&0xFFFF)+128)>>8; + source_x = source_x_q16>>16; + source_x_q16 += ctx->source_dx_q16; + y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); + u = bislerp(ctx->u_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); + v = bislerp(ctx->v_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); + ctx->rgb_row[x] = yu2rgb565(y, u, v, dither); + dither ^= 3; + } +} + +static void ScaleYCbCr42xToRGB565_BilinearY_Row_C( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){ + int x; + int source_x_q16; + source_x_q16 = ctx->source_x0_q16; + for (x = 0; x < ctx->width; x++) { + int source_x; + int xweight; + int y; + int u; + int v; + xweight = ((source_x_q16&0xFFFF)+128)>>8; + source_x = source_x_q16>>16; + y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); + source_x 
= (source_x_q16+ctx->source_uv_xoffs_q16)>>17; + source_x_q16 += ctx->source_dx_q16; + u = ctx->u_row[source_x]; + v = ctx->v_row[source_x]; + ctx->rgb_row[x] = yu2rgb565(y, u, v, dither); + dither ^= 3; + } +} + +static void ScaleYCbCr444ToRGB565_BilinearY_Row_C( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){ + int x; + int source_x_q16; + source_x_q16 = ctx->source_x0_q16; + for (x = 0; x < ctx->width; x++) { + int source_x; + int xweight; + int y; + int u; + int v; + xweight = ((source_x_q16&0xFFFF)+128)>>8; + source_x = source_x_q16>>16; + y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); + source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>16; + source_x_q16 += ctx->source_dx_q16; + u = ctx->u_row[source_x]; + v = ctx->v_row[source_x]; + ctx->rgb_row[x] = yu2rgb565(y, u, v, dither); + dither ^= 3; + } +} + +static void ScaleYCbCr42xToRGB565_Nearest_Row_C( + const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){ + int y; + int u; + int v; + int x; + int source_x_q16; + int source_x; + source_x_q16 = ctx->source_x0_q16; + for (x = 0; x < ctx->width; x++) { + source_x = source_x_q16>>16; + y = ctx->y_row[source_x]; + source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17; + source_x_q16 += ctx->source_dx_q16; + u = ctx->u_row[source_x]; + v = ctx->v_row[source_x]; + ctx->rgb_row[x] = yu2rgb565(y, u, v, dither); + dither ^= 3; + } +} + +static void ScaleYCbCr444ToRGB565_Nearest_Row_C( + const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){ + int y; + int u; + int v; + int x; + int source_x_q16; + int source_x; + source_x_q16 = ctx->source_x0_q16; + for (x = 0; x < ctx->width; x++) { + source_x = source_x_q16>>16; + source_x_q16 += ctx->source_dx_q16; + y = ctx->y_row[source_x]; + u = ctx->u_row[source_x]; + v = ctx->v_row[source_x]; + ctx->rgb_row[x] = yu2rgb565(y, u, v, dither); + dither ^= 3; + } +} + +void ScaleYCbCrToRGB565(const uint8_t *y_buf, + const uint8_t *u_buf, + const uint8_t *v_buf, + uint8_t *rgb_buf, + int source_x0, + int source_y0, + int source_width, + int source_height, + int width, + int height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type, + ScaleFilter filter) { + int source_x0_q16; + int source_y0_q16; + int source_dx_q16; + int source_dy_q16; + int source_uv_xoffs_q16; + int source_uv_yoffs_q16; + int x_shift; + int y_shift; + int ymin; + int ymax; + int uvmin; + int uvmax; + int dither; + /*We don't support negative destination rectangles (just flip the source + instead), and for empty ones there's nothing to do.*/ + if (width <= 0 || height <= 0) + return; + /*These bounds are required to avoid 16.16 fixed-point overflow.*/ + NS_ASSERTION(source_x0 > (INT_MIN>>16) && source_x0 < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source X offset out of bounds."); + NS_ASSERTION(source_x0+source_width > (INT_MIN>>16) + && source_x0+source_width < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source width out of bounds."); + NS_ASSERTION(source_y0 > (INT_MIN>>16) && source_y0 < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source Y offset out of bounds."); + NS_ASSERTION(source_y0+source_height > (INT_MIN>>16) + && source_y0+source_height < (INT_MAX>>16), + "ScaleYCbCrToRGB565 source height out of bounds."); + /*We require the same stride for Y' and Cb and Cr for 4:4:4 content.*/ + NS_ASSERTION(yuv_type != YV24 || y_pitch == uv_pitch, + "ScaleYCbCrToRGB565 luma stride differs from chroma for 4:4:4 content."); + /*We assume we can read outside the bounds of the input, because it makes + the code much simpler (and in practice is 
true: both Theora and VP8 return + padded reference frames). + In practice, we do not even _have_ the actual bounds of the source, as + we are passed a crop rectangle from it, and not the dimensions of the full + image. + This assertion will not guarantee our out-of-bounds reads are safe, but it + should at least catch the simple case of passing in an unpadded buffer.*/ + NS_ASSERTION(abs(y_pitch) >= abs(source_width)+16, + "ScaleYCbCrToRGB565 source image unpadded?"); + /*The NEON code requires the pointers to be aligned to a 16-byte boundary at + the start of each row. + This should be true for all of our sources. + We could try to fix this up if it's not true by adjusting source_x0, but + that would require the mis-alignment to be the same for the U and V + planes.*/ + NS_ASSERTION((y_pitch&15) == 0 && (uv_pitch&15) == 0 && + ((y_buf-(uint8_t *)nullptr)&15) == 0 && + ((u_buf-(uint8_t *)nullptr)&15) == 0 && + ((v_buf-(uint8_t *)nullptr)&15) == 0, + "ScaleYCbCrToRGB565 source image unaligned"); + /*We take an area-based approach to pixel coverage to avoid shifting by small + amounts (or not so small, when up-scaling or down-scaling by a large + factor). + + An illustrative example: scaling 4:2:0 up by 2, using JPEG chroma cositing^. + + + = RGB destination locations + * = Y' source locations + - = Cb, Cr source locations + + + + + + + + + + + * * * * + + + + + + + + + + - - + + + + + + + + + + * * * * + + + + + + + + + + + + + + + + + + + + * * * * + + + + + + + + + + - - + + + + + + + + + + * * * * + + + + + + + + + + + So, the coordinates of the upper-left + (first destination site) should + be (-0.25,-0.25) in the source Y' coordinate system. + Similarly, the coordinates should be (-0.375,-0.375) in the source Cb, Cr + coordinate system. + Note that the origin and scale of these two coordinate systems is not the + same! + + ^JPEG cositing is required for Theora; VP8 doesn't specify cositing rules, + but nearly all software converters in existence (at least those that are + open source, and many that are not) use JPEG cositing instead of MPEG.*/ + source_dx_q16 = (source_width<<16) / width; + source_x0_q16 = (source_x0<<16)+(source_dx_q16>>1)-0x8000; + source_dy_q16 = (source_height<<16) / height; + source_y0_q16 = (source_y0<<16)+(source_dy_q16>>1)-0x8000; + x_shift = (yuv_type != YV24); + y_shift = (yuv_type == YV12); + /*These two variables hold the difference between the origins of the Y' and + the Cb, Cr coordinate systems, using the scale of the Y' coordinate + system.*/ + source_uv_xoffs_q16 = -(x_shift<<15); + source_uv_yoffs_q16 = -(y_shift<<15); + /*Compute the range of source rows we'll actually use. + This doesn't guarantee we won't read outside this range.*/ + ymin = source_height >= 0 ? source_y0 : source_y0+source_height-1; + ymax = source_height >= 0 ? source_y0+source_height-1 : source_y0; + uvmin = ymin>>y_shift; + uvmax = ((ymax+1+y_shift)>>y_shift)-1; + /*Pick a dithering pattern. 
+ The "&3" at the end is just in case RAND_MAX is lying.*/ + dither = (rand()/(RAND_MAX>>2))&3; + /*Nearest-neighbor scaling.*/ + if (filter == FILTER_NONE) { + yuv2rgb565_row_scale_nearest_ctx ctx; + yuv2rgb565_row_scale_nearest_func scale_row; + int y; + /*Add rounding offsets once, in advance.*/ + source_x0_q16 += 0x8000; + source_y0_q16 += 0x8000; + source_uv_xoffs_q16 += (x_shift<<15); + source_uv_yoffs_q16 += (y_shift<<15); + if (yuv_type == YV12) + scale_row = ScaleYCbCr42xToRGB565_Nearest_Row_C; + else + scale_row = ScaleYCbCr444ToRGB565_Nearest_Row_C; + ctx.width = width; + ctx.source_x0_q16 = source_x0_q16; + ctx.source_dx_q16 = source_dx_q16; + ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16; + for (y=0; y<height; y++) { + int source_y; + ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch); + source_y = source_y0_q16>>16; + source_y = clamped(source_y, ymin, ymax); + ctx.y_row = y_buf + source_y*y_pitch; + source_y = (source_y0_q16+source_uv_yoffs_q16)>>(16+y_shift); + source_y = clamped(source_y, uvmin, uvmax); + source_y0_q16 += source_dy_q16; + ctx.u_row = u_buf + source_y*uv_pitch; + ctx.v_row = v_buf + source_y*uv_pitch; + (*scale_row)(&ctx, dither); + dither ^= 2; + } + } + /*Bilinear scaling.*/ + else { + yuv2rgb565_row_scale_bilinear_ctx ctx; + yuv2rgb565_row_scale_bilinear_func scale_row; + int uvxscale_min; + int uvxscale_max; + int uvyscale_min; + int uvyscale_max; + int y; + /*Check how close the chroma scaling is to unity. + If it's close enough, we can get away with nearest-neighbor chroma + sub-sampling, and only doing bilinear on luma. + If a given axis is subsampled, we use bounds on the luma step of + [0.67...2], which is equivalent to scaling chroma by [1...3]. + If it's not subsampled, we use bounds of [0.5...1.33], which is + equivalent to scaling chroma by [0.75...2]. + The lower bound is chosen as a trade-off between speed and how terrible + nearest neighbor looks when upscaling.*/ +# define CHROMA_NEAREST_SUBSAMP_STEP_MIN 0xAAAA +# define CHROMA_NEAREST_NORMAL_STEP_MIN 0x8000 +# define CHROMA_NEAREST_SUBSAMP_STEP_MAX 0x20000 +# define CHROMA_NEAREST_NORMAL_STEP_MAX 0x15555 + uvxscale_min = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvxscale_max = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + uvyscale_min = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvyscale_max = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + if (uvxscale_min <= abs(source_dx_q16) + && abs(source_dx_q16) <= uvxscale_max + && uvyscale_min <= abs(source_dy_q16) + && abs(source_dy_q16) <= uvyscale_max) { + /*Add the rounding offsets now.*/ + source_uv_xoffs_q16 += 1<<(15+x_shift); + source_uv_yoffs_q16 += 1<<(15+y_shift); + if (yuv_type != YV24) { + scale_row = +//TODO: fix NEON asm for iOS +# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__) + supports_neon() ? 
ScaleYCbCr42xToRGB565_BilinearY_Row_NEON : +# endif + ScaleYCbCr42xToRGB565_BilinearY_Row_C; + } + else + scale_row = ScaleYCbCr444ToRGB565_BilinearY_Row_C; + } + else { + if (yuv_type == YV12) + scale_row = ScaleYCbCr420ToRGB565_Bilinear_Row_C; + else if (yuv_type == YV16) + scale_row = ScaleYCbCr422ToRGB565_Bilinear_Row_C; + else + scale_row = ScaleYCbCr444ToRGB565_Bilinear_Row_C; + } + ctx.width = width; + ctx.y_pitch = y_pitch; + ctx.source_x0_q16 = source_x0_q16; + ctx.source_dx_q16 = source_dx_q16; + ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16; + ctx.uv_pitch = uv_pitch; + for (y=0; y<height; y++) { + int source_y; + int yweight; + int uvweight; + ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch); + source_y = (source_y0_q16+128)>>16; + yweight = ((source_y0_q16+128)>>8)&0xFF; + if (source_y < ymin) { + source_y = ymin; + yweight = 0; + } + if (source_y > ymax) { + source_y = ymax; + yweight = 0; + } + ctx.y_row = y_buf + source_y*y_pitch; + source_y = source_y0_q16+source_uv_yoffs_q16+(128<<y_shift); + source_y0_q16 += source_dy_q16; + uvweight = source_y>>(8+y_shift)&0xFF; + source_y >>= 16+y_shift; + if (source_y < uvmin) { + source_y = uvmin; + uvweight = 0; + } + if (source_y > uvmax) { + source_y = uvmax; + uvweight = 0; + } + ctx.u_row = u_buf + source_y*uv_pitch; + ctx.v_row = v_buf + source_y*uv_pitch; + ctx.y_yweight = yweight; + ctx.uv_yweight = uvweight; + (*scale_row)(&ctx, dither); + dither ^= 2; + } + } +} + +bool IsScaleYCbCrToRGB565Fast(int source_x0, + int source_y0, + int source_width, + int source_height, + int width, + int height, + YUVType yuv_type, + ScaleFilter filter) +{ + // Very fast. + if (width <= 0 || height <= 0) + return true; +# if defined(MOZILLA_MAY_SUPPORT_NEON) + if (filter != FILTER_NONE) { + int source_dx_q16; + int source_dy_q16; + int uvxscale_min; + int uvxscale_max; + int uvyscale_min; + int uvyscale_max; + source_dx_q16 = (source_width<<16) / width; + source_dy_q16 = (source_height<<16) / height; + uvxscale_min = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvxscale_max = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + uvyscale_min = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvyscale_max = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + if (uvxscale_min <= abs(source_dx_q16) + && abs(source_dx_q16) <= uvxscale_max + && uvyscale_min <= abs(source_dy_q16) + && abs(source_dy_q16) <= uvyscale_max) { + if (yuv_type != YV24) + return supports_neon(); + } + } +# endif + return false; +} + + + +void yuv_to_rgb565_row_c(uint16 *dst, + const uint8 *y, + const uint8 *u, + const uint8 *v, + int x_shift, + int pic_x, + int pic_width) +{ + int x; + for (x = 0; x < pic_width; x++) + { + dst[x] = yu2rgb565(y[pic_x+x], + u[(pic_x+x)>>x_shift], + v[(pic_x+x)>>x_shift], + 2); // Disable dithering for now. 
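+    // A constant dither index of 2 adds exactly half an output LSB (+256
+    // before the >>9 for red and blue, +128 before the >>8 for green), i.e.
+    // plain round-to-nearest with no spatial pattern.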
+ } +} + +void ConvertYCbCrToRGB565(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type) +{ + int x_shift; + int y_shift; + x_shift = yuv_type != YV24; + y_shift = yuv_type == YV12; +//TODO: fix NEON asm for iOS +# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__) + if (yuv_type != YV24 && supports_neon()) + { + for (int i = 0; i < pic_height; i++) { + int yoffs; + int uvoffs; + yoffs = y_pitch * (pic_y+i) + pic_x; + uvoffs = uv_pitch * ((pic_y+i)>>y_shift) + (pic_x>>x_shift); + yuv42x_to_rgb565_row_neon((uint16*)(rgb_buf + rgb_pitch * i), + y_buf + yoffs, + u_buf + uvoffs, + v_buf + uvoffs, + pic_width, + pic_x&x_shift); + } + } + else +# endif + { + for (int i = 0; i < pic_height; i++) { + int yoffs; + int uvoffs; + yoffs = y_pitch * (pic_y+i); + uvoffs = uv_pitch * ((pic_y+i)>>y_shift); + yuv_to_rgb565_row_c((uint16*)(rgb_buf + rgb_pitch * i), + y_buf + yoffs, + u_buf + uvoffs, + v_buf + uvoffs, + x_shift, + pic_x, + pic_width); + } + } +} + +bool IsConvertYCbCrToRGB565Fast(int pic_x, + int pic_y, + int pic_width, + int pic_height, + YUVType yuv_type) +{ +# if defined(MOZILLA_MAY_SUPPORT_NEON) + return (yuv_type != YV24 && supports_neon()); +# else + return false; +# endif +} + +} // namespace gfx + +} // namespace mozilla + +#endif // HAVE_YCBCR_TO_RGB565 diff --git a/gfx/ycbcr/ycbcr_to_rgb565.h b/gfx/ycbcr/ycbcr_to_rgb565.h new file mode 100644 index 000000000..41272223b --- /dev/null +++ b/gfx/ycbcr/ycbcr_to_rgb565.h @@ -0,0 +1,72 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#ifndef MEDIA_BASE_YCBCR_TO_RGB565_H_ +#define MEDIA_BASE_YCBCR_TO_RGB565_H_ +#include "yuv_convert.h" +#include "mozilla/arm.h" + +// It's currently only worth including this if we have NEON support. +#ifdef MOZILLA_MAY_SUPPORT_NEON +#define HAVE_YCBCR_TO_RGB565 1 +#endif + +namespace mozilla { + +namespace gfx { + +#ifdef HAVE_YCBCR_TO_RGB565 +// Convert a frame of YUV to 16 bit RGB565. +void ConvertYCbCrToRGB565(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +// Used to test if we have an accelerated version. +bool IsConvertYCbCrToRGB565Fast(int pic_x, + int pic_y, + int pic_width, + int pic_height, + YUVType yuv_type); + +// Scale a frame of YUV to 16 bit RGB565. +void ScaleYCbCrToRGB565(const uint8_t *yplane, + const uint8_t *uplane, + const uint8_t *vplane, + uint8_t *rgbframe, + int source_x0, + int source_y0, + int source_width, + int source_height, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + ScaleFilter filter); + +// Used to test if we have an accelerated version. +bool IsScaleYCbCrToRGB565Fast(int source_x0, + int source_y0, + int source_width, + int source_height, + int width, + int height, + YUVType yuv_type, + ScaleFilter filter); +#endif // HAVE_YCBCR_TO_RGB565 + +} // namespace gfx + +} // namespace mozilla + +#endif // MEDIA_BASE_YCBCR_TO_RGB565_H_ diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp new file mode 100644 index 000000000..78fd4ee89 --- /dev/null +++ b/gfx/ycbcr/yuv_convert.cpp @@ -0,0 +1,510 @@ +// Copyright (c) 2010 The Chromium Authors. 
All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This webpage shows the layout of YV12 and other YUV formats
+// http://www.fourcc.org/yuv.php
+// The actual conversion is best described here
+// http://en.wikipedia.org/wiki/YUV
+// An article on optimizing YUV conversion using tables instead of multiplies
+// http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
+//
+// YV12 is a full plane of Y and half height, half width chroma planes
+// YV16 is a full plane of Y and full height, half width chroma planes
+// YV24 is a full plane of Y and full height, full width chroma planes
+//
+// The ARGB pixel format is output, which on little endian is stored as BGRA.
+// The alpha is set to 255, allowing the application to use RGBA or RGB32.
+
+#include "yuv_convert.h"
+
+#include "gfxPrefs.h"
+#include "libyuv.h"
+#include "scale_yuv_argb.h"
+// Header for low level row functions.
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+namespace mozilla {
+
+namespace gfx {
+
+// 16.16 fixed point arithmetic
+const int kFractionBits = 16;
+const int kFractionMax = 1 << kFractionBits;
+const int kFractionMask = ((1 << kFractionBits) - 1);
+
+YUVType TypeFromSize(int ywidth,
+                     int yheight,
+                     int cbcrwidth,
+                     int cbcrheight)
+{
+  if (ywidth == cbcrwidth && yheight == cbcrheight) {
+    return YV24;
+  }
+  else if ((ywidth + 1) / 2 == cbcrwidth && yheight == cbcrheight) {
+    return YV16;
+  }
+  else {
+    return YV12;
+  }
+}
+
+libyuv::FourCC FourCCFromYUVType(YUVType aYUVType)
+{
+  if (aYUVType == YV24) {
+    return libyuv::FOURCC_I444;
+  } else if (aYUVType == YV16) {
+    return libyuv::FOURCC_I422;
+  } else if (aYUVType == YV12) {
+    return libyuv::FOURCC_I420;
+  } else {
+    return libyuv::FOURCC_ANY;
+  }
+}
+
+// Convert a frame of YUV to 32 bit ARGB.
+void ConvertYCbCrToRGB32(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int pic_x,
+                         int pic_y,
+                         int pic_width,
+                         int pic_height,
+                         int y_pitch,
+                         int uv_pitch,
+                         int rgb_pitch,
+                         YUVType yuv_type,
+                         YUVColorSpace yuv_color_space) {
+
+  // The deprecated function's conversion is accurate.
+  // libyuv's conversion trades a little accuracy for performance: it computes
+  // RGB from YUV dynamically so that it can use SIMD. There, each conversion
+  // coefficient is held in a signed byte, but one coefficient needs to be 129,
+  // which libyuv clamps to 127, and only 6 bits are kept for the fractional
+  // part during the dynamic calculation.
+  //
+  // The deprecated function is still fast on some old Intel chips.
+  // See Bug 1256475.
+  bool use_deprecated = gfxPrefs::YCbCrAccurateConversion() ||
+                        (supports_mmx() && supports_sse() && !supports_sse3() &&
+                         yuv_color_space == YUVColorSpace::BT601);
+  // The deprecated function only supports BT601.
+  // See Bug 1210357.
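+  // For reference, both code paths implement (approximately) the BT.601
+  // limited-range transform:
+  //   R = 1.164*(Y - 16)                    + 1.596*(Cr - 128)
+  //   G = 1.164*(Y - 16) - 0.391*(Cb - 128) - 0.813*(Cr - 128)
+  //   B = 1.164*(Y - 16) + 2.018*(Cb - 128)
+  // The accuracy difference described above comes down to how many fractional
+  // bits of these coefficients each implementation keeps.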
+ if (yuv_color_space != YUVColorSpace::BT601) { + use_deprecated = false; + } + if (use_deprecated) { + ConvertYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf, rgb_buf, + pic_x, pic_y, pic_width, pic_height, + y_pitch, uv_pitch, rgb_pitch, yuv_type); + return; + } + + if (yuv_type == YV24) { + const uint8* src_y = y_buf + y_pitch * pic_y + pic_x; + const uint8* src_u = u_buf + uv_pitch * pic_y + pic_x; + const uint8* src_v = v_buf + uv_pitch * pic_y + pic_x; + DebugOnly<int> err = libyuv::I444ToARGB(src_y, y_pitch, + src_u, uv_pitch, + src_v, uv_pitch, + rgb_buf, rgb_pitch, + pic_width, pic_height); + MOZ_ASSERT(!err); + } else if (yuv_type == YV16) { + const uint8* src_y = y_buf + y_pitch * pic_y + pic_x; + const uint8* src_u = u_buf + uv_pitch * pic_y + pic_x / 2; + const uint8* src_v = v_buf + uv_pitch * pic_y + pic_x / 2; + DebugOnly<int> err = libyuv::I422ToARGB(src_y, y_pitch, + src_u, uv_pitch, + src_v, uv_pitch, + rgb_buf, rgb_pitch, + pic_width, pic_height); + MOZ_ASSERT(!err); + } else { + MOZ_ASSERT(yuv_type == YV12); + const uint8* src_y = y_buf + y_pitch * pic_y + pic_x; + const uint8* src_u = u_buf + (uv_pitch * pic_y + pic_x) / 2; + const uint8* src_v = v_buf + (uv_pitch * pic_y + pic_x) / 2; + if (yuv_color_space == YUVColorSpace::BT709) { + DebugOnly<int> err = libyuv::H420ToARGB(src_y, y_pitch, + src_u, uv_pitch, + src_v, uv_pitch, + rgb_buf, rgb_pitch, + pic_width, pic_height); + MOZ_ASSERT(!err); + } else { + MOZ_ASSERT(yuv_color_space == YUVColorSpace::BT601); + DebugOnly<int> err = libyuv::I420ToARGB(src_y, y_pitch, + src_u, uv_pitch, + src_v, uv_pitch, + rgb_buf, rgb_pitch, + pic_width, pic_height); + MOZ_ASSERT(!err); + } + } +} + +// Convert a frame of YUV to 32 bit ARGB. +void ConvertYCbCrToRGB32_deprecated(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type) { + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; + unsigned int x_shift = yuv_type == YV24 ? 0 : 1; + // Test for SSE because the optimized code uses movntq, which is not part of MMX. + bool has_sse = supports_mmx() && supports_sse(); + // There is no optimized YV24 SSE routine so we check for this and + // fall back to the C code. + has_sse &= yuv_type != YV24; + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0; + int x_width = odd_pic_x ? pic_width - 1 : pic_width; + + for (int y = pic_y; y < pic_height + pic_y; ++y) { + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch; + const uint8* y_ptr = y_buf + y * y_pitch + pic_x; + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); + + if (odd_pic_x) { + // Handle the single odd pixel manually and use the + // fast routines for the remaining. + FastConvertYUVToRGB32Row_C(y_ptr++, + u_ptr++, + v_ptr++, + rgb_row, + 1, + x_shift); + rgb_row += 4; + } + + if (has_sse) { + FastConvertYUVToRGB32Row(y_ptr, + u_ptr, + v_ptr, + rgb_row, + x_width); + } + else { + FastConvertYUVToRGB32Row_C(y_ptr, + u_ptr, + v_ptr, + rgb_row, + x_width, + x_shift); + } + } + + // MMX used for FastConvertYUVToRGB32Row requires emms instruction. 
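+  // EMMS clears the MMX state, which aliases the x87 floating point register
+  // stack; omitting it would corrupt later floating point computation.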
+ if (has_sse) + EMMS(); +} + +// C version does 8 at a time to mimic MMX code +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + uint8* end = ybuf + source_width; + do { + ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; + ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; + ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; + ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; + ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8; + ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; + ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; + ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; + } while (ybuf < end); +} + +#ifdef MOZILLA_MAY_SUPPORT_MMX +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); +#endif + +#ifdef MOZILLA_MAY_SUPPORT_SSE2 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); +#endif + +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr, + const uint8* y1_ptr, int source_width, + int source_y_fraction) { +#ifdef MOZILLA_MAY_SUPPORT_SSE2 + if (mozilla::supports_sse2()) { + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + return; + } +#endif + +#ifdef MOZILLA_MAY_SUPPORT_MMX + if (mozilla::supports_mmx()) { + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + return; + } +#endif + + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); +} + + +// Scale a frame of YUV to 32 bit ARGB. +void ScaleYCbCrToRGB32(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int source_width, + int source_height, + int width, + int height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type, + YUVColorSpace yuv_color_space, + ScaleFilter filter) { + + bool use_deprecated = gfxPrefs::YCbCrAccurateConversion() || +#if defined(XP_WIN) && defined(_M_X64) + // libyuv does not support SIMD scaling on win 64bit. See Bug 1295927. + supports_sse3() || +#endif + (supports_mmx() && supports_sse() && !supports_sse3()); + // The deprecated function only support BT601. + // See Bug 1210357. + if (yuv_color_space != YUVColorSpace::BT601) { + use_deprecated = false; + } + if (use_deprecated) { + ScaleYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf, + rgb_buf, + source_width, source_height, + width, height, + y_pitch, uv_pitch, + rgb_pitch, + yuv_type, + ROTATE_0, + filter); + return; + } + + DebugOnly<int> err = + libyuv::YUVToARGBScale(y_buf, y_pitch, + u_buf, uv_pitch, + v_buf, uv_pitch, + FourCCFromYUVType(yuv_type), + yuv_color_space, + source_width, source_height, + rgb_buf, rgb_pitch, + width, height, + libyuv::kFilterBilinear); + MOZ_ASSERT(!err); + return; +} + +// Scale a frame of YUV to 32 bit ARGB. +void ScaleYCbCrToRGB32_deprecated(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int source_width, + int source_height, + int width, + int height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type, + Rotate view_rotate, + ScaleFilter filter) { + bool has_mmx = supports_mmx(); + + // 4096 allows 3 buffers to fit in 12k. + // Helps performance on CPU with 16K L1 cache. 
+ // Large enough for 3830x2160 and 30" displays which are 2560x1600. + const int kFilterBufferSize = 4096; + // Disable filtering if the screen is too big (to avoid buffer overflows). + // This should never happen to regular users: they don't have monitors + // wider than 4096 pixels. + // TODO(fbarchard): Allow rotated videos to filter. + if (source_width > kFilterBufferSize || view_rotate) + filter = FILTER_NONE; + + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; + // Diagram showing origin and direction of source sampling. + // ->0 4<- + // 7 3 + // + // 6 5 + // ->1 2<- + // Rotations that start at right side of image. + if ((view_rotate == ROTATE_180) || + (view_rotate == ROTATE_270) || + (view_rotate == MIRROR_ROTATE_0) || + (view_rotate == MIRROR_ROTATE_90)) { + y_buf += source_width - 1; + u_buf += source_width / 2 - 1; + v_buf += source_width / 2 - 1; + source_width = -source_width; + } + // Rotations that start at bottom of image. + if ((view_rotate == ROTATE_90) || + (view_rotate == ROTATE_180) || + (view_rotate == MIRROR_ROTATE_90) || + (view_rotate == MIRROR_ROTATE_180)) { + y_buf += (source_height - 1) * y_pitch; + u_buf += ((source_height >> y_shift) - 1) * uv_pitch; + v_buf += ((source_height >> y_shift) - 1) * uv_pitch; + source_height = -source_height; + } + + // Handle zero sized destination. + if (width == 0 || height == 0) + return; + int source_dx = source_width * kFractionMax / width; + int source_dy = source_height * kFractionMax / height; + int source_dx_uv = source_dx; + + if ((view_rotate == ROTATE_90) || + (view_rotate == ROTATE_270)) { + int tmp = height; + height = width; + width = tmp; + tmp = source_height; + source_height = source_width; + source_width = tmp; + int original_dx = source_dx; + int original_dy = source_dy; + source_dx = ((original_dy >> kFractionBits) * y_pitch) << kFractionBits; + source_dx_uv = ((original_dy >> kFractionBits) * uv_pitch) << kFractionBits; + source_dy = original_dx; + if (view_rotate == ROTATE_90) { + y_pitch = -1; + uv_pitch = -1; + source_height = -source_height; + } else { + y_pitch = 1; + uv_pitch = 1; + } + } + + // Need padding because FilterRows() will write 1 to 16 extra pixels + // after the end for SSE2 version. + uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16]; + uint8* ybuf = + reinterpret_cast<uint8*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15); + uint8* ubuf = ybuf + kFilterBufferSize; + uint8* vbuf = ubuf + kFilterBufferSize; + // TODO(fbarchard): Fixed point math is off by 1 on negatives. + int yscale_fixed = (source_height << kFractionBits) / height; + + // TODO(fbarchard): Split this into separate function for better efficiency. + for (int y = 0; y < height; ++y) { + uint8* dest_pixel = rgb_buf + y * rgb_pitch; + int source_y_subpixel = (y * yscale_fixed); + if (yscale_fixed >= (kFractionMax * 2)) { + source_y_subpixel += kFractionMax / 2; // For 1/2 or less, center filter. 
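+      // kFractionMax / 2 is 0.5 in 16.16 fixed point, so for downscales to
+      // half size or smaller the sample point is nudged toward the middle of
+      // the source span the destination pixel covers.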
+ } + int source_y = source_y_subpixel >> kFractionBits; + + const uint8* y0_ptr = y_buf + source_y * y_pitch; + const uint8* y1_ptr = y0_ptr + y_pitch; + + const uint8* u0_ptr = u_buf + (source_y >> y_shift) * uv_pitch; + const uint8* u1_ptr = u0_ptr + uv_pitch; + const uint8* v0_ptr = v_buf + (source_y >> y_shift) * uv_pitch; + const uint8* v1_ptr = v0_ptr + uv_pitch; + + // vertical scaler uses 16.8 fixed point + int source_y_fraction = (source_y_subpixel & kFractionMask) >> 8; + int source_uv_fraction = + ((source_y_subpixel >> y_shift) & kFractionMask) >> 8; + + const uint8* y_ptr = y0_ptr; + const uint8* u_ptr = u0_ptr; + const uint8* v_ptr = v0_ptr; + // Apply vertical filtering if necessary. + // TODO(fbarchard): Remove memcpy when not necessary. + if (filter & mozilla::gfx::FILTER_BILINEAR_V) { + if (yscale_fixed != kFractionMax && + source_y_fraction && ((source_y + 1) < source_height)) { + FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + } else { + memcpy(ybuf, y0_ptr, source_width); + } + y_ptr = ybuf; + ybuf[source_width] = ybuf[source_width-1]; + int uv_source_width = (source_width + 1) / 2; + if (yscale_fixed != kFractionMax && + source_uv_fraction && + (((source_y >> y_shift) + 1) < (source_height >> y_shift))) { + FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction); + FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction); + } else { + memcpy(ubuf, u0_ptr, uv_source_width); + memcpy(vbuf, v0_ptr, uv_source_width); + } + u_ptr = ubuf; + v_ptr = vbuf; + ubuf[uv_source_width] = ubuf[uv_source_width - 1]; + vbuf[uv_source_width] = vbuf[uv_source_width - 1]; + } + if (source_dx == kFractionMax) { // Not scaled + FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width); + } else if (filter & FILTER_BILINEAR_H) { + LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { +// Specialized scalers and rotation. +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) + if(mozilla::supports_sse()) { + if (width == (source_width * 2)) { + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, + dest_pixel, width); + } else if ((source_dx & kFractionMask) == 0) { + // Scaling by integer scale factor. ie half. + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, + dest_pixel, width, + source_dx >> kFractionBits); + } else if (source_dx_uv == source_dx) { // Not rotated. + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } else { + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, + dest_pixel, width, + source_dx >> kFractionBits, + source_dx_uv >> kFractionBits); + } + } + else { + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); + } +#else + (void)source_dx_uv; + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, width, source_dx); +#endif + } + } + // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. + if (has_mmx) + EMMS(); +} + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h new file mode 100644 index 000000000..266a23d45 --- /dev/null +++ b/gfx/ycbcr/yuv_convert.h @@ -0,0 +1,110 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef MEDIA_BASE_YUV_CONVERT_H_ +#define MEDIA_BASE_YUV_CONVERT_H_ + +#include "chromium_types.h" +#include "ImageTypes.h" + +namespace mozilla { + +namespace gfx { + +// Type of YUV surface. 
+// The value of these enums matter as they are used to shift vertical indices. +enum YUVType { + YV12 = 0, // YV12 is half width and half height chroma channels. + YV16 = 1, // YV16 is half width and full height chroma channels. + YV24 = 2 // YV24 is full width and full height chroma channels. +}; + +// Mirror means flip the image horizontally, as in looking in a mirror. +// Rotate happens after mirroring. +enum Rotate { + ROTATE_0, // Rotation off. + ROTATE_90, // Rotate clockwise. + ROTATE_180, // Rotate upside down. + ROTATE_270, // Rotate counter clockwise. + MIRROR_ROTATE_0, // Mirror horizontally. + MIRROR_ROTATE_90, // Mirror then Rotate clockwise. + MIRROR_ROTATE_180, // Mirror vertically. + MIRROR_ROTATE_270 // Transpose. +}; + +// Filter affects how scaling looks. +enum ScaleFilter { + FILTER_NONE = 0, // No filter (point sampled). + FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. + FILTER_BILINEAR_V = 2, // Bilinear vertical filter. + FILTER_BILINEAR = 3 // Bilinear filter. +}; + +YUVType TypeFromSize(int ywidth, int yheight, int cbcrwidth, int cbcrheight); + +// Convert a frame of YUV to 32 bit ARGB. +// Pass in YV16/YV12 depending on source format +void ConvertYCbCrToRGB32(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + YUVColorSpace yuv_color_space); + +void ConvertYCbCrToRGB32_deprecated(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +// Scale a frame of YUV to 32 bit ARGB. +// Supports rotation and mirroring. +void ScaleYCbCrToRGB32(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int source_width, + int source_height, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + YUVColorSpace yuv_color_space, + ScaleFilter filter); + +void ScaleYCbCrToRGB32_deprecated(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int source_width, + int source_height, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + Rotate view_rotate, + ScaleFilter filter); + +} // namespace gfx +} // namespace mozilla + +#endif // MEDIA_BASE_YUV_CONVERT_H_ diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp new file mode 100644 index 000000000..081343b0b --- /dev/null +++ b/gfx/ycbcr/yuv_convert_arm.cpp @@ -0,0 +1,232 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
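Before moving on to the ARM implementation, a usage sketch for the yuv_convert.h API above (not from the original source; the dimensions and strides are illustrative and assume tightly packed planes): scaling a 320x240 YV12 frame down to 160x120 ARGB with bilinear filtering and no rotation.

#include "yuv_convert.h"

void ScaleExample(const uint8* y, const uint8* u, const uint8* v,
                  uint8* rgb_out) {
  mozilla::gfx::ScaleYCbCrToRGB32_deprecated(
      y, u, v, rgb_out,
      320, 240,   // source_width, source_height
      160, 120,   // width, height of the destination
      320,        // ystride: one byte per luma sample
      160,        // uvstride: YV12 chroma planes are half width
      160 * 4,    // rgbstride: four bytes per ARGB pixel
      mozilla::gfx::YV12,
      mozilla::gfx::ROTATE_0,
      mozilla::gfx::FILTER_BILINEAR);
}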
+ +// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com> + +#include "yuv_convert.h" +#include "ycbcr_to_rgb565.h" + + + +#ifdef HAVE_YCBCR_TO_RGB565 + +namespace mozilla { + +namespace gfx { + +# if defined(MOZILLA_MAY_SUPPORT_NEON) +# if defined(__clang__) +void __attribute((noinline)) +# else +void __attribute((noinline,optimize("-fomit-frame-pointer"))) +# endif + yuv42x_to_rgb565_row_neon(uint16 *dst, + const uint8 *y, + const uint8 *u, + const uint8 *v, + int n, + int oddflag) +{ + static __attribute__((aligned(16))) uint16 acc_r[8] = { + 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840, + }; + static __attribute__((aligned(16))) uint16 acc_g[8] = { + 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312, + }; + static __attribute__((aligned(16))) uint16 acc_b[8] = { + 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832, + }; + /* + * Registers: + * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data + * q2 : d4, d5 - are used for storing converted RGB data + * q3 : d6, d7 - are used for temporary storage + * + * q4-q7 - reserved + * + * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data + * q10 : d20, d21 + * q11 : d22, d23 + * q12 : d24, d25 + * q13 : d26, d27 + * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154) + */ + asm volatile ( +".fpu neon\n" +/* Allow to build on targets not supporting neon, and force the object file + * target to avoid bumping the final binary target */ +".arch armv7-a\n" +".object_arch armv4t\n" +".macro convert_macroblock size\n" +/* load up to 16 source pixels */ + ".if \\size == 16\n" + "pld [%[y], #64]\n" + "pld [%[u], #64]\n" + "pld [%[v], #64]\n" + "vld1.8 {d1}, [%[y]]!\n" + "vld1.8 {d3}, [%[y]]!\n" + "vld1.8 {d0}, [%[u]]!\n" + "vld1.8 {d2}, [%[v]]!\n" + ".elseif \\size == 8\n" + "vld1.8 {d1}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d0[1]}, [%[u]]!\n" + "vld1.8 {d0[2]}, [%[u]]!\n" + "vld1.8 {d0[3]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" + "vld1.8 {d2[1]}, [%[v]]!\n" + "vld1.8 {d2[2]}, [%[v]]!\n" + "vld1.8 {d2[3]}, [%[v]]!\n" + ".elseif \\size == 4\n" + "vld1.8 {d1[0]}, [%[y]]!\n" + "vld1.8 {d1[1]}, [%[y]]!\n" + "vld1.8 {d1[2]}, [%[y]]!\n" + "vld1.8 {d1[3]}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d0[1]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" + "vld1.8 {d2[1]}, [%[v]]!\n" + ".elseif \\size == 2\n" + "vld1.8 {d1[0]}, [%[y]]!\n" + "vld1.8 {d1[1]}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" + ".elseif \\size == 1\n" + "vld1.8 {d1[0]}, [%[y]]!\n" + "vld1.8 {d0[0]}, [%[u]]!\n" + "vld1.8 {d2[0]}, [%[v]]!\n" + ".else\n" + ".error \"unsupported macroblock size\"\n" + ".endif\n" + + /* d1 - Y data (first 8 bytes) */ + /* d3 - Y data (next 8 bytes) */ + /* d0 - U data, d2 - V data */ + + /* split even and odd Y color components */ + "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */ + /* clip upper and lower boundaries */ + "vqadd.u8 q0, q0, q4\n" + "vqadd.u8 q1, q1, q4\n" + "vqsub.u8 q0, q0, q5\n" + "vqsub.u8 q1, q1, q5\n" + + "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */ + + "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */ + "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */ + + "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */ + "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */ + "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */ + "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */ + "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */ + "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 
*/ + "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */ + "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */ + "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */ + + "vhsub.s16 q3, q8, q10\n" /* calculate even red components */ + "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */ + "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */ + "vqshrun.s16 d3, q10, #6\n" /* right shift, narrow and saturate odd red components */ + + "vhadd.s16 q3, q8, q11\n" /* calculate even green components */ + "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */ + "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */ + "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */ + + "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */ + "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */ + "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */ + "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */ + + "vzip.8 d0, d3\n" /* join even and odd red components */ + "vzip.8 d1, d4\n" /* join even and odd green components */ + "vzip.8 d2, d5\n" /* join even and odd blue components */ + + "vshll.u8 q3, d0, #8\n\t" + "vshll.u8 q8, d1, #8\n\t" + "vshll.u8 q9, d2, #8\n\t" + "vsri.u16 q3, q8, #5\t\n" + "vsri.u16 q3, q9, #11\t\n" + /* store pixel data to memory */ + ".if \\size == 16\n" + " vst1.16 {d6, d7}, [%[dst]]!\n" + " vshll.u8 q3, d3, #8\n\t" + " vshll.u8 q8, d4, #8\n\t" + " vshll.u8 q9, d5, #8\n\t" + " vsri.u16 q3, q8, #5\t\n" + " vsri.u16 q3, q9, #11\t\n" + " vst1.16 {d6, d7}, [%[dst]]!\n" + ".elseif \\size == 8\n" + " vst1.16 {d6, d7}, [%[dst]]!\n" + ".elseif \\size == 4\n" + " vst1.16 {d6}, [%[dst]]!\n" + ".elseif \\size == 2\n" + " vst1.16 {d6[0]}, [%[dst]]!\n" + " vst1.16 {d6[1]}, [%[dst]]!\n" + ".elseif \\size == 1\n" + " vst1.16 {d6[0]}, [%[dst]]!\n" + ".endif\n" + ".endm\n" + + "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */ + "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */ + "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */ + "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */ + + "vmov.u8 d26, #16\n" + "vmov.u8 d27, #149\n" + "vmov.u8 d28, #204\n" + "vmov.u8 d29, #50\n" + "vmov.u8 d30, #104\n" + "vmov.u8 d31, #154\n" + + "cmp %[oddflag], #0\n" + "beq 1f\n" + "convert_macroblock 1\n" + "sub %[n], %[n], #1\n" + "1:\n" + "subs %[n], %[n], #16\n" + "blt 2f\n" + "1:\n" + "convert_macroblock 16\n" + "subs %[n], %[n], #16\n" + "bge 1b\n" + "2:\n" + "tst %[n], #8\n" + "beq 3f\n" + "convert_macroblock 8\n" + "3:\n" + "tst %[n], #4\n" + "beq 4f\n" + "convert_macroblock 4\n" + "4:\n" + "tst %[n], #2\n" + "beq 5f\n" + "convert_macroblock 2\n" + "5:\n" + "tst %[n], #1\n" + "beq 6f\n" + "convert_macroblock 1\n" + "6:\n" + ".purgem convert_macroblock\n" + : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n) + : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]), + [oddflag] "r" (oddflag) + : "cc", "memory", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */ + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" + ); +} +# endif // MOZILLA_MAY_SUPPORT_NEON + +} // namespace gfx + +} // namespace mozilla + +#endif // HAVE_YCBCR_TO_RGB565 diff 
--git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp new file mode 100644 index 000000000..b5353e500 --- /dev/null +++ b/gfx/ycbcr/yuv_convert_mmx.cpp @@ -0,0 +1,45 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <mmintrin.h> +#include "yuv_row.h" + +namespace mozilla { +namespace gfx { + +// FilterRows combines two rows of the image using linear interpolation. +// MMX version does 8 pixels at a time. +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + __m64 zero = _mm_setzero_si64(); + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); + + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); + __m64* dest64 = reinterpret_cast<__m64*>(ybuf); + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); + + do { + __m64 y0 = *y0_ptr64++; + __m64 y1 = *y1_ptr64++; + __m64 y2 = _mm_unpackhi_pi8(y0, zero); + __m64 y3 = _mm_unpackhi_pi8(y1, zero); + y0 = _mm_unpacklo_pi8(y0, zero); + y1 = _mm_unpacklo_pi8(y1, zero); + y0 = _mm_mullo_pi16(y0, y0_fraction); + y1 = _mm_mullo_pi16(y1, y1_fraction); + y2 = _mm_mullo_pi16(y2, y0_fraction); + y3 = _mm_mullo_pi16(y3, y1_fraction); + y0 = _mm_add_pi16(y0, y1); + y2 = _mm_add_pi16(y2, y3); + y0 = _mm_srli_pi16(y0, 8); + y2 = _mm_srli_pi16(y2, 8); + y0 = _mm_packs_pu16(y0, y2); + *dest64++ = y0; + } while (dest64 < end64); +} + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp new file mode 100644 index 000000000..25fe20639 --- /dev/null +++ b/gfx/ycbcr/yuv_convert_sse2.cpp @@ -0,0 +1,47 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <emmintrin.h> +#include "yuv_row.h" + +namespace mozilla { +namespace gfx { + +// FilterRows combines two rows of the image using linear interpolation. +// SSE2 version does 16 pixels at a time. 
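Both the MMX version above and the SSE2 version that follows compute the same per-byte blend, (y0 * (256 - frac) + y1 * frac) >> 8 with frac in [0, 256). A scalar sketch of that blend (not from the original source; plain C++ types are used instead of chromium_types.h):

#include <cstdint>

// Scalar equivalent of FilterRows_MMX / FilterRows_SSE2: linear-blend two
// rows of bytes with an 8-bit fraction.
void FilterRowsReference(uint8_t* ybuf, const uint8_t* y0_ptr,
                         const uint8_t* y1_ptr, int source_width,
                         int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - source_y_fraction;
  for (int x = 0; x < source_width; ++x)
    ybuf[x] = static_cast<uint8_t>(
        (y0_ptr[x] * y0_fraction + y1_ptr[x] * y1_fraction) >> 8);
}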
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + __m128i zero = _mm_setzero_si128(); + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); + + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); + + do { + __m128i y0 = _mm_loadu_si128(y0_ptr128); + __m128i y1 = _mm_loadu_si128(y1_ptr128); + __m128i y2 = _mm_unpackhi_epi8(y0, zero); + __m128i y3 = _mm_unpackhi_epi8(y1, zero); + y0 = _mm_unpacklo_epi8(y0, zero); + y1 = _mm_unpacklo_epi8(y1, zero); + y0 = _mm_mullo_epi16(y0, y0_fraction); + y1 = _mm_mullo_epi16(y1, y1_fraction); + y2 = _mm_mullo_epi16(y2, y0_fraction); + y3 = _mm_mullo_epi16(y3, y1_fraction); + y0 = _mm_add_epi16(y0, y1); + y2 = _mm_add_epi16(y2, y3); + y0 = _mm_srli_epi16(y0, 8); + y2 = _mm_srli_epi16(y2, 8); + y0 = _mm_packus_epi16(y0, y2); + *dest128++ = y0; + ++y0_ptr128; + ++y1_ptr128; + } while (dest128 < end128); +} + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h new file mode 100644 index 000000000..c89f54b8f --- /dev/null +++ b/gfx/ycbcr/yuv_row.h @@ -0,0 +1,142 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// yuv_row internal functions to handle YUV conversion and scaling to RGB. +// These functions are used from both yuv_convert.cc and yuv_scale.cc. + +// TODO(fbarchard): Write function that can handle rotation and scaling. + +#ifndef MEDIA_BASE_YUV_ROW_H_ +#define MEDIA_BASE_YUV_ROW_H_ + +#include "chromium_types.h" + +extern "C" { +// Can only do 1x. +// This is the second fastest of the scalers. +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + unsigned int x_shift); + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +// Can do 1x, half size or any scale down by an integer amount. +// Step can be negative (mirroring, rotate 180). +// This is the third fastest of the scalers. +// Only defined on Windows x86-32. +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int step); + +// Rotate is like Convert, but applies different step to Y versus U and V. +// This allows rotation by 90 or 270, by stepping by stride. +// This is the forth fastest of the scalers. +// Only defined on Windows x86-32. +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int ystep, + int uvstep); + +// Doubler does 4 pixels at a time. Each pixel is replicated. +// This is the fastest of the scalers. +// Only defined on Windows x86-32. +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +// Handles arbitrary scaling up or down. +// Mirroring is supported, but not 90 or 270 degree rotation. +// Chroma is under sampled every 2 pixels for performance. 
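// (Added note, not in the original source: source_dx is the 16.16
// fixed-point horizontal step per output pixel, so 0x10000 advances one
// source pixel and 0x20000 advances two; ScaleYUVToRGB32Row_C in
// yuv_row_c.cpp spells the convention out.)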
+void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +// Handles arbitrary scaling up or down with bilinear filtering. +// Mirroring is supported, but not 90 or 270 degree rotation. +// Chroma is under sampled every 2 pixels for performance. +// This is the slowest of the scalers. +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#else +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif +extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); + +// x64 uses MMX2 (SSE) so emms is not required. +// Warning C4799: function has no EMMS instruction. +// EMMS() is slow and should be called by the calling function once per image. +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64) +#if defined(_MSC_VER) +#define EMMS() __asm emms +#pragma warning(disable: 4799) +#else +#define EMMS() asm("emms") +#endif +#else +#define EMMS() ((void)0) +#endif + +} // extern "C" + +#endif // MEDIA_BASE_YUV_ROW_H_ diff --git a/gfx/ycbcr/yuv_row_arm.s b/gfx/ycbcr/yuv_row_arm.s new file mode 100644 index 000000000..6a6c81bee --- /dev/null +++ b/gfx/ycbcr/yuv_row_arm.s @@ -0,0 +1,304 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + + .arch armv7-a + .fpu neon +/* Allow to build on targets not supporting neon, and force the object file + * target to avoid bumping the final binary target */ + .object_arch armv4t + .text + .align + + .balign 64 +YCbCr42xToRGB565_DITHER03_CONSTS_NEON: + .short -14240 + .short -14240+384 + .short 8672 + .short 8672+192 + .short -17696 + .short -17696+384 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER12_CONSTS_NEON: + .short -14240+128 + .short -14240+256 + .short 8672+64 + .short 8672+128 + .short -17696+128 + .short -17696+256 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER21_CONSTS_NEON: + .short -14240+256 + .short -14240+128 + .short 8672+128 + .short 8672+64 + .short -17696+256 + .short -17696+128 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER30_CONSTS_NEON: + .short -14240+384 + .short -14240 + .short 8672+192 + .short 8672 + .short -17696+384 + .short -17696 + .byte 102 + .byte 25 + .byte 52 + .byte 129 + +@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON( +@ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); +@ +@ ctx = { +@ uint16_t *rgb_row; /*r0*/ +@ const uint8_t *y_row; /*r1*/ +@ const uint8_t *u_row; /*r2*/ +@ const uint8_t *v_row; /*r3*/ +@ int y_yweight; /*r4*/ +@ int y_pitch; /*r5*/ +@ int width; /*r6*/ +@ int source_x0_q16; /*r7*/ +@ int source_dx_q16; /*r8*/ +@ int source_uv_xoffs_q16; /*r9*/ +@ }; + .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON + .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function + .balign 64 + .fnstart +ScaleYCbCr42xToRGB565_BilinearY_Row_NEON: + STMFD r13!,{r4-r9,r14} @ 8 words. + ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON + VPUSH {Q4-Q7} @ 16 words. + ADD r14,r14,r1, LSL #4 @ Select the dither table to use + LDMIA r0, {r0-r9} + @ Set up image index registers. + ADD r12,r8, r8 + VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16 + VDUP.32 D17,r12 + ADD r12,r12,r12 + VTRN.32 D16,D17 @ Q2 = < 2| 0| 2| 0>*source_dx_q16 + VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16 + ADD r12,r12,r12 + VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16 + VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16 + CMP r8, #0 @ If source_dx_q16 is negative... + VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16 + ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block + VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16 + SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use) + VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16 + VDUP.I32 Q9, r8 @ Q8 = < 1| 1| 1| 1>*source_dx_q16 + VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16 + VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16 + VLD1.64 {D30,D31},[r14,:128] @ Load some constants + VMOV.I8 D28,#52 + VMOV.I8 D29,#129 + @ The basic idea here is to do aligned loads of a block of data and then + @ index into it using VTBL to extract the data from the source X + @ coordinate corresponding to each destination pixel. + @ This is significantly less code and significantly fewer cycles than doing + @ a series of single-lane loads, but it means that the X step between + @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee + @ that we could read 8 pixels from a single aligned 32-byte block of data. + @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel, + @ separated into even pixels and odd pixels to make extracting offsets and + @ weights easier. 
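@ (Added worked example for the byte extraction described next, not in the
@ original source: with source_x0_q16 = 0x18000 and source_dx_q16 = 0x10000,
@ the first lane of Q0 holds 0x18000; the rounding narrow in the loop keeps
@ its two middle bytes, 0x01 and 0x80, so the VTBL index selects source
@ pixel 1 and the blend weight 0x80 mixes pixels 1 and 2 evenly.)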
+ @ We then pull out two bytes from the middle of each coordinate: the top + @ byte corresponds to the integer part of the X coordinate, and the bottom + @ byte corresponds to the weight to use for bilinear blending. + @ These are separated out into different registers with VTRN. + @ Then by subtracting the integer X coordinate of the first pixel in the + @ data block we loaded, we produce an index register suitable for use by + @ VTBL. +s42xbily_neon_loop: + @ Load the Y' data. + MOV r12,r7, ASR #16 + VRSHRN.S32 D16,Q0, #8 + AND r12,r12,#~15 @ Read 16-byte aligned blocks + VDUP.I8 D20,r12 + ADD r12,r1, r12 @ r12 = y_row+(source_x&~7) + VRSHRN.S32 D17,Q1, #8 + PLD [r12,#64] + VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row + ADD r14,r7, r8, LSL #3 + VRSHRN.S32 D18,Q2, #8 + MOV r14,r14,ASR #16 + VRSHRN.S32 D19,Q3, #8 + AND r14,r14,#~15 @ Read 16-byte aligned blocks + VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row + PLD [r12,#64] + VDUP.I8 D21,r14 + ADD r14,r1, r14 @ r14 = y_row+(source_x&~7) + VMOV.I8 Q13,#1 + PLD [r14,#64] + VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0> + @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0> + VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded. + @ First 8 Y' pixels + VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x + VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x + VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x + VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1 + VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1 + @ Next 8 Y' pixels + VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row + VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row + PLD [r14,#64] + VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x + VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x + VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1 + VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1 + @ Blend Y'. + VDUP.I16 Q9, r4 @ Load the y weights. + VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a + VSUBL.U8 Q5, D25,D21 + VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b + VSUBL.U8 Q7, D27,D23 + VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight + VMUL.S16 Q5, Q5, Q9 + VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight + VMUL.S16 Q7, Q7, Q9 + VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits. + VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW. + VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8 + VRSHRN.S16 D9, Q5, #8 + VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8 + VRSHRN.S16 D13,Q7, #8 + VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8) + VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8) + VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a + VSUBL.U8 Q5, D23,D21 + VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight + VMUL.S16 Q5, Q5, Q13 + VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8 + ADD r12,r7, r9 + VRSHRN.S16 D9, Q5, #8 + MOV r12,r12,ASR #17 + VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8) + @ Start extracting the chroma x coordinates, and load Cb and Cr. 
+ AND r12,r12,#~15 @ Read 16-byte aligned blocks + VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4 + ADD r14,r2, r12 + VADD.I32 Q10,Q0, Q9 + VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb + PLD [r14,#64] + VADD.I32 Q11,Q1, Q9 + ADD r14,r3, r12 + VADD.I32 Q12,Q2, Q9 + VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr + PLD [r14,#64] + VADD.I32 Q13,Q3, Q9 + VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0> + VRSHRN.S32 D21,Q11,#9 + VDUP.I8 Q9, r12 + VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1> + VRSHRN.S32 D23,Q13,#9 + @ We don't actually need the x weights, but we get them for free. + @ Free ALU slot + VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0> + @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0> + VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded. + VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x + VMOV.I8 D24,#74 + VTBL.8 D19,{D8, D9, D10,D11},D23 + VMOV.I8 D26,#102 + VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x + VMOV.I8 D27,#25 + VTBL.8 D21,{D12,D13,D14,D15},D23 + @ We now have Y' in Q8, Cb in Q9, and Cr in Q10 + @ We use VDUP to expand constants, because it's a permute instruction, so + @ it can dual issue on the A8. + SUBS r6, r6, #16 @ width -= 16 + VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74 + VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G + VMULL.U8 Q5, D17,D24 + VDUP.32 Q7, D30[1] + VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G + VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R + VMLSL.U8 Q7, D19,D27 + VDUP.32 Q12,D30[0] + VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R + VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B + VMLAL.U8 Q12,D21,D26 + VDUP.32 Q13,D31[0] + VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B + VMLAL.U8 Q13,D19,D29 + VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G + VMLSL.U8 Q7, D21,D28 + VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R + VADD.S16 Q12,Q5, Q12 + VQADD.S16 Q8, Q4, Q8 @ Q13:Q8 = 74*Y'+129*Cr+bias_B + VQADD.S16 Q13,Q5, Q13 + VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G + VADD.S16 Q7, Q5, Q7 + @ Push each value to the top of its word and saturate it. + VQSHLU.S16 Q11,Q11,#2 + VQSHLU.S16 Q12,Q12,#2 + VQSHLU.S16 Q6, Q6, #2 + VQSHLU.S16 Q7, Q7, #2 + VQSHLU.S16 Q8, Q8, #2 + VQSHLU.S16 Q13,Q13,#2 + @ Merge G and B into R. + VSRI.U16 Q11,Q6, #5 + VSRI.U16 Q12,Q7, #5 + VSRI.U16 Q11,Q8, #11 + MOV r14,r8, LSL #4 + VSRI.U16 Q12,Q13,#11 + BLT s42xbily_neon_tail + VDUP.I32 Q13,r14 + @ Store the result. + VST1.16 {D22,D23,D24,D25},[r0]! + BEQ s42xbily_neon_done + @ Advance the x coordinates. + VADD.I32 Q0, Q0, Q13 + VADD.I32 Q1, Q1, Q13 + ADD r7, r14 + VADD.I32 Q2, Q2, Q13 + VADD.I32 Q3, Q3, Q13 + B s42xbily_neon_loop +s42xbily_neon_tail: + @ We have between 1 and 15 pixels left to write. + @ -r6 == the number of pixels we need to skip writing. + @ Adjust r0 to point to the last one we need to write, because we're going + @ to write them in reverse order. + ADD r0, r0, r6, LSL #1 + MOV r14,#-2 + ADD r0, r0, #30 + @ Skip past the ones we don't need to write. 
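@ (Added note, not in the original source: the SUB below is a computed
@ branch. In ARM state PC reads as the current instruction's address plus 8,
@ each VST1 slot is 4 bytes, and r6 is negative here, so subtracting
@ r6, LSL #2 jumps forward over -r6 of the store slots; the ORR that follows
@ is a 4-byte no-op pad that makes the offsets line up.)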
+ SUB PC, PC, r6, LSL #2 + ORR r0, r0, r0 + VST1.16 {D25[3]},[r0,:16],r14 + VST1.16 {D25[2]},[r0,:16],r14 + VST1.16 {D25[1]},[r0,:16],r14 + VST1.16 {D25[0]},[r0,:16],r14 + VST1.16 {D24[3]},[r0,:16],r14 + VST1.16 {D24[2]},[r0,:16],r14 + VST1.16 {D24[1]},[r0,:16],r14 + VST1.16 {D24[0]},[r0,:16],r14 + VST1.16 {D23[3]},[r0,:16],r14 + VST1.16 {D23[2]},[r0,:16],r14 + VST1.16 {D23[1]},[r0,:16],r14 + VST1.16 {D23[0]},[r0,:16],r14 + VST1.16 {D22[3]},[r0,:16],r14 + VST1.16 {D22[2]},[r0,:16],r14 + VST1.16 {D22[1]},[r0,:16],r14 + VST1.16 {D22[0]},[r0,:16] +s42xbily_neon_done: + VPOP {Q4-Q7} @ 16 words. + LDMFD r13!,{r4-r9,PC} @ 8 words. + .fnend + .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON + +#if defined(__ELF__)&&defined(__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp new file mode 100644 index 000000000..d327f854e --- /dev/null +++ b/gfx/ycbcr/yuv_row_c.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" + +#define DCHECK(a) + +extern "C" { + +// C reference code that mimic the YUV assembly. +#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + +static inline void YuvPixel(uint8 y, + uint8 u, + uint8 v, + uint8* rgb_buf) { + + int b = kCoefficientsRgbY[256+u][0]; + int g = kCoefficientsRgbY[256+u][1]; + int r = kCoefficientsRgbY[256+u][2]; + int a = kCoefficientsRgbY[256+u][3]; + + b = paddsw(b, kCoefficientsRgbY[512+v][0]); + g = paddsw(g, kCoefficientsRgbY[512+v][1]); + r = paddsw(r, kCoefficientsRgbY[512+v][2]); + a = paddsw(a, kCoefficientsRgbY[512+v][3]); + + b = paddsw(b, kCoefficientsRgbY[y][0]); + g = paddsw(g, kCoefficientsRgbY[y][1]); + r = paddsw(r, kCoefficientsRgbY[y][2]); + a = paddsw(a, kCoefficientsRgbY[y][3]); + + b >>= 6; + g >>= 6; + r >>= 6; + a >>= 6; + + *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | + (packuswb(g) << 8) | + (packuswb(r) << 16) | + (packuswb(a) << 24); +} + +void FastConvertYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + unsigned int x_shift) { + for (int x = 0; x < width; x += 2) { + uint8 u = u_buf[x >> x_shift]; + uint8 v = v_buf[x >> x_shift]; + uint8 y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; + if (x_shift == 0) { + u = u_buf[x + 1]; + v = v_buf[x + 1]; + } + YuvPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } +} + +// 16.16 fixed point is used. A shift by 16 isolates the integer. +// A shift by 17 is used to further subsample the chrominence channels. +// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, +// for 1/65536 pixel accurate interpolation. 
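// (Added worked example, not in the original source: with
// source_dx = 0x34000, i.e. 3.25 source pixels per output pixel, the second
// output pixel has x = 0x34000, so its luma comes from y_buf[x >> 16], index
// 3, while its chroma comes from u_buf[x >> 17] and v_buf[x >> 17], index 1;
// the extra shift is what provides the 2x chroma subsampling for free.)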
+void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + int x = 0; + for (int i = 0; i < width; i += 2) { + int y = y_buf[x >> 16]; + int u = u_buf[(x >> 17)]; + int v = v_buf[(x >> 17)]; + YuvPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y = y_buf[x >> 16]; + YuvPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + int x = 0; + if (source_dx >= 0x20000) { + x = 32768; + } + for (int i = 0; i < width; i += 2) { + int y0 = y_buf[x >> 16]; + int y1 = y_buf[(x >> 16) + 1]; + int u0 = u_buf[(x >> 17)]; + int u1 = u_buf[(x >> 17) + 1]; + int v0 = v_buf[(x >> 17)]; + int v1 = v_buf[(x >> 17) + 1]; + int y_frac = (x & 65535); + int uv_frac = ((x >> 1) & 65535); + int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; + int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; + YuvPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + y_frac = (x & 65535); + y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + YuvPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +} // extern "C" + diff --git a/gfx/ycbcr/yuv_row_other.cpp b/gfx/ycbcr/yuv_row_other.cpp new file mode 100644 index 000000000..c351139f9 --- /dev/null +++ b/gfx/ycbcr/yuv_row_other.cpp @@ -0,0 +1,34 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" + +extern "C" { +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +} diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp new file mode 100644 index 000000000..a84792d96 --- /dev/null +++ b/gfx/ycbcr/yuv_row_posix.cpp @@ -0,0 +1,917 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" +#include "mozilla/SSE.h" + +#define DCHECK(a) + +extern "C" { + +#if defined(ARCH_CPU_X86_64) + +// We don't need CPUID guards here, since x86-64 implies SSE2. + +// AMD64 ABI uses register paremters. 
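// (Added layout note, not in the original source: kCoefficientsRgbY is 768
// rows of four int16 values, 8 bytes per row; rows 0-255 hold the Y terms,
// rows 256-511 the U terms and rows 512-767 the V terms, so the literal
// offsets 2048 and 4096 in the asm below are 256 * 8 and 512 * 8.)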
+void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm( + "jmp 1f\n" +"0:" + "movzb (%1),%%r10\n" + "add $0x1,%1\n" + "movzb (%2),%%r11\n" + "add $0x1,%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "movzb 0x1(%0),%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%r10,8),%%xmm2\n" + "add $0x2,%0\n" + "movq (%5,%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%3)\n" + "add $0x8,%3\n" +"1:" + "sub $0x2,%4\n" + "jns 0b\n" + +"2:" + "add $0x1,%4\n" + "js 3f\n" + + "movzb (%1),%%r10\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%2),%%r10\n" + "movq 4096(%5,%%r10,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "movzb (%0),%%r10\n" + "movq (%5,%%r10,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" +"3:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + +void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width, // r8 + int source_dx) { // r9 + asm( + "xor %%r11,%%r11\n" + "sub $0x2,%4\n" + "js 1f\n" + +"0:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + "movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "lea (%%r11,%6),%%r10\n" + "sar $0x10,%%r11\n" + "movzb (%0,%%r11,1),%%rax\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%rax,8),%%xmm1\n" + "lea (%%r10,%6),%%r11\n" + "sar $0x10,%%r10\n" + "movzb (%0,%%r10,1),%%rax\n" + "movq (%5,%%rax,8),%%xmm2\n" + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%3)\n" + "add $0x8,%3\n" + "sub $0x2,%4\n" + "jns 0b\n" + +"1:" + "add $0x1,%4\n" + "js 2f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + "movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "sar $0x10,%%r11\n" + "movzb (%0,%%r11,1),%%rax\n" + "movq (%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" + +"2:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY), // %5 + "r"(static_cast<long>(source_dx)) // %6 + : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" +); +} + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + asm( + "xor %%r11,%%r11\n" // x = 0 + "sub $0x2,%4\n" + "js 2f\n" + "cmp $0x20000,%6\n" // if source_dx >= 2.0 + "jl 0f\n" + "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less +"0:" + +"1:" + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%1, %%r10, 1), %%r13 \n" + "movzb 1(%1, %%r10, 1), %%r14 \n" + "mov %%r11, %%rax \n" + "and $0x1fffe, %%rax \n" + "imul %%rax, %%r14 \n" + "xor $0x1fffe, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $17, %%r13 \n" + "movq 2048(%5,%%r13,8), %%xmm0\n" + + "movzb (%2, %%r10, 1), %%r13 \n" + 
"movzb 1(%2, %%r10, 1), %%r14 \n" + "mov %%r11, %%rax \n" + "and $0x1fffe, %%rax \n" + "imul %%rax, %%r14 \n" + "xor $0x1fffe, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $17, %%r13 \n" + "movq 4096(%5,%%r13,8), %%xmm1\n" + + "mov %%r11, %%rax \n" + "lea (%%r11,%6),%%r10\n" + "sar $0x10,%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + + "movzb (%0, %%r11, 1), %%r13 \n" + "movzb 1(%0, %%r11, 1), %%r14 \n" + "and $0xffff, %%rax \n" + "imul %%rax, %%r14 \n" + "xor $0xffff, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $16, %%r13 \n" + "movq (%5,%%r13,8),%%xmm1\n" + + "mov %%r10, %%rax \n" + "lea (%%r10,%6),%%r11\n" + "sar $0x10,%%r10\n" + + "movzb (%0,%%r10,1), %%r13 \n" + "movzb 1(%0,%%r10,1), %%r14 \n" + "and $0xffff, %%rax \n" + "imul %%rax, %%r14 \n" + "xor $0xffff, %%rax \n" + "imul %%rax, %%r13 \n" + "add %%r14, %%r13 \n" + "shr $16, %%r13 \n" + "movq (%5,%%r13,8),%%xmm2\n" + + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%3)\n" + "add $0x8,%3\n" + "sub $0x2,%4\n" + "jns 1b\n" + +"2:" + "add $0x1,%4\n" + "js 3f\n" + + "mov %%r11,%%r10\n" + "sar $0x11,%%r10\n" + + "movzb (%1,%%r10,1), %%r13 \n" + "movq 2048(%5,%%r13,8),%%xmm0\n" + + "movzb (%2,%%r10,1), %%r13 \n" + "movq 4096(%5,%%r13,8),%%xmm1\n" + + "paddsw %%xmm1,%%xmm0\n" + "sar $0x10,%%r11\n" + + "movzb (%0,%%r11,1), %%r13 \n" + "movq (%5,%%r13,8),%%xmm1\n" + + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" + +"3:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY), // %5 + "r"(static_cast<long>(source_dx)) // %6 + : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" +); +} + +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) + +// PIC version is slower because less registers are available, so +// non-PIC is used on platforms where it is possible. 
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" + ".global FastConvertYUVToRGB32Row_SSE\n" + ".type FastConvertYUVToRGB32Row_SSE, @function\n" +"FastConvertYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "jmp 1f\n" + +"0:" + "movzbl (%edi),%eax\n" + "add $0x1,%edi\n" + "movzbl (%esi),%ebx\n" + "add $0x1,%esi\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "add $0x2,%edx\n" + "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "sub $0x2,%ecx\n" + "jns 0b\n" + + "and $0x1,%ecx\n" + "je 2f\n" + + "movzbl (%edi),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%esi),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" +"2:" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) +{ + if (mozilla::supports_sse()) { + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); + return; + } + + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + + +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + asm( + ".text\n" + ".global ScaleYUVToRGB32Row_SSE\n" + ".type ScaleYUVToRGB32Row_SSE, @function\n" +"ScaleYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "xor %ebx,%ebx\n" + "jmp 1f\n" + +"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "sub $0x2,%ecx\n" + "jns 0b\n" + + "and $0x1,%ecx\n" + "je 2f\n" + + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +"2:" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void ScaleYUVToRGB32Row(const uint8* y_buf, + 
const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); + return; + } + + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); +} + +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + asm( + ".text\n" + ".global LinearScaleYUVToRGB32Row_SSE\n" + ".type LinearScaleYUVToRGB32Row_SSE, @function\n" +"LinearScaleYUVToRGB32Row_SSE:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x30(%esp),%ebp\n" + + // source_width = width * source_dx + ebx + "mov 0x34(%esp), %ecx\n" + "imull 0x38(%esp), %ecx\n" + "mov %ecx, 0x34(%esp)\n" + + "mov 0x38(%esp), %ecx\n" + "xor %ebx,%ebx\n" // x = 0 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 + "jl 1f\n" + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less + "jmp 1f\n" + +"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%edi,%eax,1),%ecx\n" + "movzbl 1(%edi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" + + "mov 0x2c(%esp),%esi\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%esi,%eax,1),%ecx\n" + "movzbl 1(%esi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" + + "cmp 0x34(%esp), %ebx\n" + "jge 2f\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" + + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" + +"1:" + "cmp 0x34(%esp), %ebx\n" + "jl 0b\n" + "popa\n" + "ret\n" + +"2:" + "paddsw %mm0, %mm1\n" + "psraw $6, %mm1\n" + "packuswb %mm1, %mm1\n" + "movd %mm1, (%ebp)\n" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); + return; + } + + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, + width, source_dx); +} + +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) + +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + const int16 *kCoefficientsRgbY); + + asm( + ".text\n" +#if defined(XP_MACOSX) +"_PICConvertYUVToRGB32Row_SSE:\n" +#else +"PICConvertYUVToRGB32Row_SSE:\n" +#endif + 
"pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x38(%esp),%ecx\n" + + "jmp 1f\n" + +"0:" + "movzbl (%edi),%eax\n" + "add $0x1,%edi\n" + "movzbl (%esi),%ebx\n" + "add $0x1,%esi\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw 4096(%ecx,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "add $0x2,%edx\n" + "movq 0(%ecx,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "subl $0x2,0x34(%esp)\n" + "jns 0b\n" + + "andl $0x1,0x34(%esp)\n" + "je 2f\n" + + "movzbl (%edi),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "movzbl (%esi),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" +"2:" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) +{ + if (mozilla::supports_sse()) { + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, + &kCoefficientsRgbY[0][0]); + return; + } + + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx, + const int16 *kCoefficientsRgbY); + + asm( + ".text\n" +#if defined(XP_MACOSX) +"_PICScaleYUVToRGB32Row_SSE:\n" +#else +"PICScaleYUVToRGB32Row_SSE:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x3c(%esp),%ecx\n" + "xor %ebx,%ebx\n" + "jmp 1f\n" + +"0:" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"1:" + "subl $0x2,0x34(%esp)\n" + "jns 0b\n" + + "andl $0x1,0x34(%esp)\n" + "je 2f\n" + + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +"2:" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, + &kCoefficientsRgbY[0][0]); + return; + } + + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* 
y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx, + const int16 *kCoefficientsRgbY); + + asm( + ".text\n" +#if defined(XP_MACOSX) +"_PICLinearScaleYUVToRGB32Row_SSE:\n" +#else +"PICLinearScaleYUVToRGB32Row_SSE:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "mov 0x3c(%esp),%edi\n" + "xor %ebx,%ebx\n" + + // source_width = width * source_dx + ebx + "mov 0x34(%esp), %ecx\n" + "imull 0x38(%esp), %ecx\n" + "mov %ecx, 0x34(%esp)\n" + + "mov 0x38(%esp), %ecx\n" + "xor %ebx,%ebx\n" // x = 0 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 + "jl 1f\n" + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less + "jmp 1f\n" + +"0:" + "mov 0x28(%esp),%esi\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%esi,%eax,1),%ecx\n" + "movzbl 1(%esi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "movq 2048(%edi,%ecx,8),%mm0\n" + + "mov 0x2c(%esp),%esi\n" + "mov %ebx,%eax\n" + "sar $0x11,%eax\n" + + "movzbl (%esi,%eax,1),%ecx\n" + "movzbl 1(%esi,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "andl $0x1fffe, %eax \n" + "imul %eax, %esi \n" + "xorl $0x1fffe, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $17, %ecx \n" + "paddsw 4096(%edi,%ecx,8),%mm0\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq (%edi,%ecx,8),%mm1\n" + + "cmp 0x34(%esp), %ebx\n" + "jge 2f\n" + + "mov %ebx,%eax\n" + "sar $0x10,%eax\n" + "movzbl (%edx,%eax,1),%ecx\n" + "movzbl 1(%edx,%eax,1),%esi\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "andl $0xffff, %eax \n" + "imul %eax, %esi \n" + "xorl $0xffff, %eax \n" + "imul %eax, %ecx \n" + "addl %esi, %ecx \n" + "shrl $16, %ecx \n" + "movq (%edi,%ecx,8),%mm2\n" + + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" + +"1:" + "cmp %ebx, 0x34(%esp)\n" + "jg 0b\n" + "popa\n" + "ret\n" + +"2:" + "paddsw %mm0, %mm1\n" + "psraw $6, %mm1\n" + "packuswb %mm1, %mm1\n" + "movd %mm1, (%ebp)\n" + "popa\n" + "ret\n" +#if !defined(XP_MACOSX) + ".previous\n" +#endif +); + + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) +{ + if (mozilla::supports_sse()) { + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, + source_dx, &kCoefficientsRgbY[0][0]); + return; + } + + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} +#else +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} +#endif + 
+} diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp new file mode 100644 index 000000000..c531b60c2 --- /dev/null +++ b/gfx/ycbcr/yuv_row_table.cpp @@ -0,0 +1,233 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" + +extern "C" { + +#define RGBY(i) { \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + 0 \ +} + +#define RGBU(i) { \ + static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \ + static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ + 0, \ + static_cast<int16>(256 * 64 - 1) \ +} + +#define RGBV(i) { \ + 0, \ + static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \ + static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \ + 0 \ +} + +SIMD_ALIGNED(const int16 kCoefficientsRgbY[256 * 3][4]) = { + RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), + RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), + RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), + RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), + RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), + RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), + RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), + RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), + RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), + RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), + RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), + RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), + RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), + RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), + RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), + RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), + RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), + RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), + RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), + RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), + RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), + RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), + RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), + RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), + RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), + RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), + RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), + RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), + RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), + RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), + RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), + RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), + RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), + RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), + RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), + RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), + RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), + RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), + RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), + RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), + RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), + RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), + RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB), + RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), + RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), + RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), + RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), + RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), + RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), + RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), + RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), + RGBY(0xCC), 
RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), + RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), + RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), + RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), + RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), + RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), + RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), + RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), + RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), + RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), + RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), + RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), + RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), + + // Chroma U table. + RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), + RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07), + RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), + RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), + RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), + RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), + RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), + RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), + RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), + RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), + RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), + RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), + RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), + RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), + RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), + RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), + RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), + RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), + RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), + RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), + RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), + RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), + RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), + RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), + RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), + RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), + RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), + RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), + RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), + RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), + RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), + RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), + RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), + RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), + RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), + RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), + RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), + RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), + RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), + RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), + RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), + RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), + RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), + RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), + RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), + RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), + RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), + RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), + RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), + RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), + RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), + RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), + RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), + RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), + RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), + RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), + RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), + RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), 
RGBU(0xE7), + RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), + RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), + RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), + RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), + RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), + RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), + + // Chroma V table. + RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), + RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), + RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), + RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), + RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), + RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17), + RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), + RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), + RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), + RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), + RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), + RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), + RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), + RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), + RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), + RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), + RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), + RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), + RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), + RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), + RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), + RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), + RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), + RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), + RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), + RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), + RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), + RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), + RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), + RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), + RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), + RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), + RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), + RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), + RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), + RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), + RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), + RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), + RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), + RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), + RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), + RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), + RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), + RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), + RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), + RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), + RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), + RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), + RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), + RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), + RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), + RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), + RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), + RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), + RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), + RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), + RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), + RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), + RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), + RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), + RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), + RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), + RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), + RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), +}; + +#undef RGBY 
+#undef RGBU +#undef RGBV + +} // extern "C" diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp new file mode 100644 index 000000000..5cd931139 --- /dev/null +++ b/gfx/ycbcr/yuv_row_win.cpp @@ -0,0 +1,498 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" +#include "mozilla/SSE.h" + +#define kCoefficientsRgbU kCoefficientsRgbY + 2048 +#define kCoefficientsRgbV kCoefficientsRgbY + 4096 + +extern "C" { + +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) +__declspec(naked) +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp convertend + + convertloop : + movzx eax, byte ptr [edi] + add edi, 1 + movzx ebx, byte ptr [esi] + add esi, 1 + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [kCoefficientsRgbV + 8 * ebx] + movzx ebx, byte ptr [edx + 1] + movq mm1, [kCoefficientsRgbY + 8 * eax] + add edx, 2 + movq mm2, [kCoefficientsRgbY + 8 * ebx] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + convertend : + sub ecx, 2 + jns convertloop + + and ecx, 1 // odd number of pixels? + jz convertdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + convertdone : + + popad + ret + } +} + +__declspec(naked) +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int step) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + mov ebx, [esp + 32 + 24] // step + jmp wend + + wloop : + movzx eax, byte ptr [edi] + add edi, ebx + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + add esi, ebx + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm1, [kCoefficientsRgbY + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + wend : + sub ecx, 2 + jns wloop + + and ecx, 1 // odd number of pixels? 
+ jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + wdone : + + popad + ret + } +} + +__declspec(naked) +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int ystep, + int uvstep) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend + + wloop : + movzx eax, byte ptr [edi] + mov ebx, [esp + 32 + 28] // uvstep + add edi, ebx + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + add esi, ebx + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + mov ebx, [esp + 32 + 24] // ystep + add edx, ebx + movq mm1, [kCoefficientsRgbY + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + wend : + sub ecx, 2 + jns wloop + + and ecx, 1 // odd number of pixels? + jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + wdone : + + popad + ret + } +} + +__declspec(naked) +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend + + wloop : + movzx eax, byte ptr [edi] + add edi, 1 + movzx ebx, byte ptr [esi] + add esi, 1 + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [kCoefficientsRgbV + 8 * ebx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + punpckldq mm1, mm1 + movntq [ebp], mm1 + + movzx ebx, byte ptr [edx + 1] + add edx, 2 + paddsw mm0, [kCoefficientsRgbY + 8 * ebx] + psraw mm0, 6 + packuswb mm0, mm0 + punpckldq mm0, mm0 + movntq [ebp+8], mm0 + add ebp, 16 + wend : + sub ecx, 4 + jns wloop + + add ecx, 4 + jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + jmp wend1 + + wloop1 : + movd [ebp], mm1 + add ebp, 4 + wend1 : + sub ecx, 1 + jns wloop1 + wdone : + popad + ret + } +} + +// This version does general purpose scaling by any amount, up or down. +// The only thing it cannot do is rotation by 90 or 270. +// For performance the chroma is under-sampled, reducing cost of a 3x +// 1080p scale from 8.4 ms to 5.4 ms. 
+__declspec(naked) +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + xor ebx, ebx // x + jmp scaleend + + scaleloop : + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [edi + eax] + movq mm0, [kCoefficientsRgbU + 8 * eax] + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [esi + eax] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + mov eax, ebx + add ebx, [esp + 32 + 24] // x += source_dx + sar eax, 16 + movzx eax, byte ptr [edx + eax] + movq mm1, [kCoefficientsRgbY + 8 * eax] + mov eax, ebx + add ebx, [esp + 32 + 24] // x += source_dx + sar eax, 16 + movzx eax, byte ptr [edx + eax] + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + scaleend : + sub ecx, 2 + jns scaleloop + + and ecx, 1 // odd number of pixels? + jz scaledone + + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [edi + eax] + movq mm0, [kCoefficientsRgbU + 8 * eax] + mov eax, ebx + sar eax, 17 + movzx eax, byte ptr [esi + eax] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + mov eax, ebx + sar eax, 16 + movzx eax, byte ptr [edx + eax] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + + scaledone : + popad + ret + } +} + +__declspec(naked) +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + // [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + imul ecx, [esp + 32 + 24] // source_dx + mov [esp + 32 + 20], ecx // source_width = width * source_dx + mov ecx, [esp + 32 + 24] // source_dx + xor ebx, ebx // x = 0 + cmp ecx, 0x20000 + jl lscaleend + mov ebx, 0x8000 // x = 0.5 for 1/2 or less + jmp lscaleend +lscaleloop: + mov eax, ebx + sar eax, 0x11 + + movzx ecx, byte ptr [edi + eax] + movzx esi, byte ptr [edi + eax + 1] + mov eax, ebx + and eax, 0x1fffe + imul esi, eax + xor eax, 0x1fffe + imul ecx, eax + add ecx, esi + shr ecx, 17 + movq mm0, [kCoefficientsRgbU + 8 * ecx] + + mov esi, [esp + 32 + 12] + mov eax, ebx + sar eax, 0x11 + + movzx ecx, byte ptr [esi + eax] + movzx esi, byte ptr [esi + eax + 1] + mov eax, ebx + and eax, 0x1fffe + imul esi, eax + xor eax, 0x1fffe + imul ecx, eax + add ecx, esi + shr ecx, 17 + paddsw mm0, [kCoefficientsRgbV + 8 * ecx] + + mov eax, ebx + sar eax, 0x10 + movzx ecx, byte ptr [edx + eax] + movzx esi, byte ptr [1 + edx + eax] + mov eax, ebx + add ebx, [esp + 32 + 24] + and eax, 0xffff + imul esi, eax + xor eax, 0xffff + imul ecx, eax + add ecx, esi + shr ecx, 16 + movq mm1, [kCoefficientsRgbY + 8 * ecx] + + cmp ebx, [esp + 32 + 20] + jge lscalelastpixel + + mov eax, ebx + sar eax, 0x10 + movzx ecx, byte ptr [edx + eax] + movzx esi, byte ptr [edx + eax + 1] + mov eax, ebx + add ebx, [esp + 32 + 24] + and eax, 0xffff + imul esi, eax + xor eax, 0xffff + imul ecx, eax + add ecx, esi + shr ecx, 16 + movq mm2, [kCoefficientsRgbY + 8 * ecx] + + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 0x6 + psraw mm2, 0x6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 0x8 + +lscaleend: + cmp ebx, [esp + 32 + 20] + jl lscaleloop + popad + ret + 
+lscalelastpixel: + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + popad + ret + }; +} +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + if (mozilla::supports_sse()) { + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); + return; + } +#endif + + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); +} + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + if (mozilla::supports_sse()) { + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); + return; + } +#endif + + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) + if (mozilla::supports_sse()) { + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, + source_dx); + return; + } +#endif + + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +} // extern "C" diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp new file mode 100644 index 000000000..6a34f840a --- /dev/null +++ b/gfx/ycbcr/yuv_row_win64.cpp @@ -0,0 +1,205 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" + +extern "C" { + +// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. 
+ +#define kCoefficientsRgbU (reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 2048) +#define kCoefficientsRgbV (reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 4096) + +#include <emmintrin.h> + +static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + + while (width >= 2) { + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); + + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * *y_buf)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + uint8 u, v, y; + int x = 0; + + while (width >= 2) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + x += source_dx; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + y = y_buf[x >> 16]; + x += source_dx; + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +static void LinearScaleYUVToRGB32Row_SSE2(const uint8* 
y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + uint8 u0, u1, v0, v1, y0, y1; + uint32 uv_frac, y_frac, u, v, y; + int x = 0; + + if (source_dx >= 0x20000) { + x = 32768; + } + + while(width >= 2) { + u0 = u_buf[x >> 17]; + u1 = u_buf[(x >> 17) + 1]; + v0 = v_buf[x >> 17]; + v1 = v_buf[(x >> 17) + 1]; + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + uv_frac = (x & 0x1fffe); + y_frac = (x & 0xffff); + u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; + v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; + x += source_dx; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + y_frac = (x & 0xffff); + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; + x += source_dx; + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 8 * y)); + + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); +} + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, + source_dx); +} + +} // extern "C" |
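
The conversion in these files is table-driven. The RGBY/RGBU/RGBV macros in yuv_row_table.cpp bake each 8-bit Y, U, or V value's contribution to the output channels into four int16 lanes ordered {B, G, R, A}, in 6-bit fixed point, and the three 256-entry tables are packed back to back in kCoefficientsRgbY; that is why kCoefficientsRgbU and kCoefficientsRgbV are defined above as byte offsets of 2048 and 4096 (256 entries x 8 bytes) into the same array. The following is a scalar sketch of that math, not part of the source: the helper names are illustrative, and plain int arithmetic stands in for the saturating paddsw/psraw/packuswb sequence, which yields the same output bytes for 8-bit inputs.

#include <cstdint>

// 6-bit fixed-point channel contributions, mirroring the RGBY/RGBU/RGBV
// macros: each lane is coefficient * 64 * (value - bias), rounded.
struct Contribution { int b, g, r, a; };

static Contribution LumaEntry(int y) {     // RGBY(y)
  int l = static_cast<int>(1.164 * 64 * (y - 16) + 0.5);
  return {l, l, l, 0};
}

static Contribution ChromaUEntry(int u) {  // RGBU(u); alpha lane carries 255 << 6
  return {static_cast<int>(2.018 * 64 * (u - 128) + 0.5),
          static_cast<int>(-0.391 * 64 * (u - 128) + 0.5),
          0,
          256 * 64 - 1};
}

static Contribution ChromaVEntry(int v) {  // RGBV(v)
  return {0,
          static_cast<int>(-0.813 * 64 * (v - 128) + 0.5),
          static_cast<int>(1.596 * 64 * (v - 128) + 0.5),
          0};
}

static uint8_t ClampToByte(int x) {
  return x < 0 ? 0 : (x > 255 ? 255 : static_cast<uint8_t>(x));
}

// One pixel: sum the three table entries, drop the 6 fractional bits,
// clamp each lane to a byte. Output order is B, G, R, A.
void YuvPixelToRgb32(uint8_t y, uint8_t u, uint8_t v, uint8_t* out) {
  Contribution cy = LumaEntry(y);
  Contribution cu = ChromaUEntry(u);
  Contribution cv = ChromaVEntry(v);
  out[0] = ClampToByte((cy.b + cu.b + cv.b) >> 6);
  out[1] = ClampToByte((cy.g + cu.g + cv.g) >> 6);
  out[2] = ClampToByte((cy.r + cu.r + cv.r) >> 6);
  out[3] = ClampToByte((cy.a + cu.a + cv.a) >> 6);  // (256*64-1) >> 6 == 255
}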
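
The scaling rows step across the source in 16.16 fixed point: source_dx is the horizontal step per destination pixel, the luma index is x >> 16, and the chroma index is x >> 17 because one U/V sample spans two luma samples. This also appears to be the chroma under-sampling mentioned in the comment on ScaleYUVToRGB32Row_SSE: the two-pixel loop reuses a single chroma lookup per pixel pair. Below is a minimal scalar equivalent, reusing the illustrative YuvPixelToRgb32 from the sketch above and taking the simpler one-chroma-lookup-per-pixel form.

#include <cstdint>

void YuvPixelToRgb32(uint8_t y, uint8_t u, uint8_t v, uint8_t* out);

void ScaleYUVToRGB32Row_Scalar(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* rgb_buf,
                               int width,
                               int source_dx) {
  int x = 0;  // source position, 16.16 fixed point
  for (int i = 0; i < width; ++i) {
    YuvPixelToRgb32(y_buf[x >> 16], u_buf[x >> 17], v_buf[x >> 17],
                    rgb_buf + 4 * i);
    x += source_dx;  // e.g. (source_width << 16) / width for a full-row scale
  }
}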
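
Each exported row function converts one destination scanline, so a frame-level converter just walks rows and advances the chroma planes at half the luma rate for 4:2:0 data. Below is a minimal sketch of that driver, assuming I420 plane pointers and strides; the driver function itself is hypothetical, and the shipped higher-level converters elsewhere in this directory handle more picture formats, rotation, and filtering.

#include <cstdint>

// Declared in yuv_row.h with the project's uint8 typedef; uint8_t is
// layout-compatible for the purposes of this sketch.
extern "C" void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
                                         const uint8_t* u_buf,
                                         const uint8_t* v_buf,
                                         uint8_t* rgb_buf,
                                         int width);

// Hypothetical frame driver, not part of these files.
void ConvertI420ToRGB32(const uint8_t* y_plane, int y_stride,
                        const uint8_t* u_plane,
                        const uint8_t* v_plane, int uv_stride,
                        uint8_t* rgb, int rgb_stride,
                        int width, int height) {
  for (int row = 0; row < height; ++row) {
    // 4:2:0: one chroma row covers two luma rows.
    FastConvertYUVToRGB32Row(y_plane + row * y_stride,
                             u_plane + (row / 2) * uv_stride,
                             v_plane + (row / 2) * uv_stride,
                             rgb + row * rgb_stride,
                             width);
  }
}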