author    | wolfbeast <mcwerewolf@gmail.com> | 2018-07-18 08:24:24 +0200
committer | wolfbeast <mcwerewolf@gmail.com> | 2018-07-18 08:24:24 +0200
commit    | fc61780b35af913801d72086456f493f63197da6 (patch)
tree      | f85891288a7bd988da9f0f15ae64e5c63f00d493 /media/libwebp/dsp
parent    | 69f7f9e5f1475891ce11cc4f431692f965b0cd30 (diff)
parent    | 50d3e596bbe89c95615f96eb71f6bc5be737a1db (diff)
Merge commit '50d3e596bbe89c95615f96eb71f6bc5be737a1db' into Basilisk-releasev2018.07.18
# Conflicts:
# browser/app/profile/firefox.js
# browser/components/preferences/jar.mn
Diffstat (limited to 'media/libwebp/dsp')
32 files changed, 3164 insertions, 1775 deletions
diff --git a/media/libwebp/dsp/alpha_processing.c b/media/libwebp/dsp/alpha_processing.c
index 4b60e092b..6ff1352ae 100644
--- a/media/libwebp/dsp/alpha_processing.c
+++ b/media/libwebp/dsp/alpha_processing.c
@@ -12,10 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
-#include "./dsp.h"
+#include "../dsp/dsp.h"
 
 // Tables can be faster on some platform but incur some extra binary size (~2k).
-// #define USE_TABLES_FOR_ALPHA_MULT
+#if !defined(USE_TABLES_FOR_ALPHA_MULT)
+#define USE_TABLES_FOR_ALPHA_MULT 0   // ALTERNATE_CODE
+#endif
+
 
 // -----------------------------------------------------------------------------
 
@@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
   return v;
 }
 
-#ifdef USE_TABLES_FOR_ALPHA_MULT
+#if (USE_TABLES_FOR_ALPHA_MULT == 1)
 
 static const uint32_t kMultTables[2][256] = {
   {    // (255u << MFIX) / alpha
@@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
   return inverse ? (255u << MFIX) / a : a * KINV_255;
 }
 
-#endif   // USE_TABLES_FOR_ALPHA_MULT
+#endif  // USE_TABLES_FOR_ALPHA_MULT
 
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
     const uint32_t argb = ptr[x];
@@ -154,8 +157,8 @@
   }
 }
 
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
-                  int width, int inverse) {
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+                   int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
     const uint32_t a = alpha[x];
@@ -217,8 +220,9 @@
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif
 
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
-                               int w, int h, int stride) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
+                                 int w, int h, int stride) {
   while (h-- > 0) {
     uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
     const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
     rgba += stride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef MULTIPLIER
 #undef PREMULTIPLY
 
@@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
   return (x * m) >> 16;
 }
 
-static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
-                                               int w, int h, int stride,
-                                               int rg_byte_pos /* 0 or 1 */) {
+static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
+                                                 int w, int h, int stride,
+                                                 int rg_byte_pos /* 0 or 1 */) {
   while (h-- > 0) {
     int i;
     for (i = 0; i < w; ++i) {
@@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
 }
 #undef MULTIPLIER
 
-static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
-                                   int w, int h, int stride) {
-#ifdef WEBP_SWAP_16BIT_CSP
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
+                                     int w, int h, int stride) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
 #else
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
 #endif
 }
 
+#if !WEBP_NEON_OMIT_C_CODE
 static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
                            int width, int height,
                            uint8_t* dst, int dst_stride) {
@@ -338,6 +344,46 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
   int i;
   for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+
+static int HasAlpha8b_C(const uint8_t* src, int length) {
+  while (length-- > 0) if (*src++ != 0xff) return 1;
+  return 0;
+}
+
+static int HasAlpha32b_C(const uint8_t* src, int length) {
+  int x;
+  for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Simple channel manipulations.
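Note on the two detectors just added above: HasAlpha8b_C scans a plain alpha plane byte by byte, while HasAlpha32b_C steps four bytes at a time because only one byte of each 32-bit pixel carries alpha. A minimal, self-contained usage sketch (the buffer and names here are illustrative, not part of this patch):

    /* Checks two 4-byte pixels whose alpha is the first byte of each
       quadruplet; prints 1 because the second pixel is not opaque. */
    #include <stdint.h>
    #include <stdio.h>

    static int HasAlpha32b(const uint8_t* src, int length) {
      int x;
      for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
      return 0;
    }

    int main(void) {
      const uint8_t pixels[8] = { 0xff, 1, 2, 3, 0x7f, 4, 5, 6 };
      printf("%d\n", HasAlpha32b(pixels, 2));
      return 0;
    }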
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+#ifdef WORDS_BIGENDIAN
+static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                       const uint8_t* b, int len, uint32_t* out) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+  }
+}
+#endif
+
+static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                      int len, int step, uint32_t* out) {
+  int i, offset = 0;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
+    offset += step;
+  }
+}
 
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@@ -345,6 +391,15 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+#ifdef WORDS_BIGENDIAN
+void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int, uint32_t*);
+#endif
+void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out);
+
+int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+int (*WebPHasAlpha32b)(const uint8_t* src, int length);
 
 //------------------------------------------------------------------------------
 // Init function
 
@@ -354,21 +409,25 @@ extern void WebPInitAlphaProcessingSSE2(void);
 extern void WebPInitAlphaProcessingSSE41(void);
 extern void WebPInitAlphaProcessingNEON(void);
 
-static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
-    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
-  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  WebPMultARGBRow = WebPMultARGBRowC;
-  WebPMultRow = WebPMultRowC;
-  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
-  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
+  WebPMultARGBRow = WebPMultARGBRow_C;
+  WebPMultRow = WebPMultRow_C;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
+#ifdef WORDS_BIGENDIAN
+  WebPPackARGB = PackARGB_C;
+#endif
+  WebPPackRGB = PackRGB_C;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
   WebPDispatchAlpha = DispatchAlpha_C;
   WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
   WebPExtractAlpha = ExtractAlpha_C;
   WebPExtractGreen = ExtractGreen_C;
+#endif
+
+  WebPHasAlpha8b = HasAlpha8b_C;
+  WebPHasAlpha32b = HasAlpha32b_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
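The init function above illustrates the dispatch scheme this patch moves to: portable C implementations are installed first, and the CPU-feature checks below overwrite individual pointers with SIMD versions; WEBP_DSP_INIT_FUNC additionally guarantees the body runs only once. A stripped-down sketch of the same pattern, with hypothetical names rather than libwebp's:

    /* Run-once function-pointer dispatch (simplified, no thread safety). */
    #include <stdint.h>

    typedef int (*has_alpha_fn)(const uint8_t*, int);
    static has_alpha_fn g_has_alpha = 0;

    static int HasAlphaGeneric(const uint8_t* src, int length) {
      while (length-- > 0) if (*src++ != 0xff) return 1;
      return 0;
    }

    static int CpuHasSSE2(void) { return 0; }  /* stub for the sketch */

    void InitDispatch(void) {
      if (g_has_alpha != 0) return;    /* already initialized */
      g_has_alpha = HasAlphaGeneric;   /* portable default first */
      if (CpuHasSSE2()) { /* overwrite with a faster version here */ }
    }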
   if (VP8GetCPUInfo != NULL) {
@@ -382,16 +441,32 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitAlphaProcessingNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitAlphaProcessingMIPSdspR2();
     }
 #endif
   }
-  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitAlphaProcessingNEON();
+  }
+#endif
+
+  assert(WebPMultARGBRow != NULL);
+  assert(WebPMultRow != NULL);
+  assert(WebPApplyAlphaMultiply != NULL);
+  assert(WebPApplyAlphaMultiply4444 != NULL);
+  assert(WebPDispatchAlpha != NULL);
+  assert(WebPDispatchAlphaToGreen != NULL);
+  assert(WebPExtractAlpha != NULL);
+  assert(WebPExtractGreen != NULL);
+#ifdef WORDS_BIGENDIAN
+  assert(WebPPackARGB != NULL);
+#endif
+  assert(WebPPackRGB != NULL);
+  assert(WebPHasAlpha8b != NULL);
+  assert(WebPHasAlpha32b != NULL);
 }
diff --git a/media/libwebp/dsp/alpha_processing_sse2.c b/media/libwebp/dsp/alpha_processing_sse2.c
index 83dc559fa..9a3bc4485 100644
--- a/media/libwebp/dsp/alpha_processing_sse2.c
+++ b/media/libwebp/dsp/alpha_processing_sse2.c
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "../dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
+                              int width, int height,
+                              uint8_t* dst, int dst_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
   return (alpha_and != 0xff);
 }
 
-static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
-                                 int width, int height,
-                                 uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
+                                      int width, int height,
+                                      uint32_t* dst, int dst_stride) {
   int i, j;
   const __m128i zero = _mm_setzero_si128();
   const int limit = width & ~15;
@@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+                             int width, int height,
+                             uint8_t* alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
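The 'alpha_and' comment above describes a branch-free opacity check used by all the ExtractAlpha/DispatchAlpha variants: every alpha byte is ANDed into an accumulator while being copied, so a single final compare reveals whether any pixel was transparent. A scalar sketch of the same idea (illustrative names):

    #include <stdint.h>

    /* Copy an alpha plane and report whether it contains transparency. */
    static int CopyAlphaAndDetect(const uint8_t* alpha, int n, uint8_t* dst) {
      uint32_t alpha_and = 0xff;
      int i;
      for (i = 0; i < n; ++i) {
        dst[i] = alpha[i];
        alpha_and &= alpha[i];
      }
      return (alpha_and != 0xff);  /* not 0xff iff some alpha[i] != 0xff */
    }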
   uint32_t alpha_and = 0xff;
@@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
 #undef MULTIPLIER
 #undef PREMULTIPLY
 
+//------------------------------------------------------------------------------
+// Alpha detection
+
+static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
+  const __m128i all_0xff = _mm_set1_epi8(0xff);
+  int i = 0;
+  for (; i + 16 <= length; i += 16) {
+    const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  for (; i < length; ++i) if (src[i] != 0xff) return 1;
+  return 0;
+}
+
+static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
+  const __m128i alpha_mask = _mm_set1_epi32(0xff);
+  const __m128i all_0xff = _mm_set1_epi8(0xff);
+  int i = 0;
+  // We don't know if we can access the last 3 bytes after the last alpha
+  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
+  // or the last byte of the quadruplet). Hence the '-3' protection below.
+  length = length * 4 - 3;   // size in bytes
+  for (; i + 64 <= length; i += 64) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
+    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
+    const __m128i c0 = _mm_packs_epi32(b0, b1);
+    const __m128i c1 = _mm_packs_epi32(b2, b3);
+    const __m128i d  = _mm_packus_epi16(c0, c1);
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  for (; i + 32 <= length; i += 32) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i c  = _mm_packs_epi32(b0, b1);
+    const __m128i d  = _mm_packus_epi16(c, c);
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
+  return 0;
+}
+
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows
 
@@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
     }
   }
   width -= x;
-  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
 }
 
 static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
     }
   }
   width -= x;
-  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
 }
 
 //------------------------------------------------------------------------------
 
@@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
   WebPMultARGBRow = MultARGBRow_SSE2;
   WebPMultRow = MultRow_SSE2;
   WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
-  WebPExtractAlpha = ExtractAlpha;
+  WebPDispatchAlpha = DispatchAlpha_SSE2;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
+  WebPExtractAlpha = ExtractAlpha_SSE2;
+
+  WebPHasAlpha8b = HasAlpha8b_SSE2;
+  WebPHasAlpha32b = HasAlpha32b_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
 
diff --git a/media/libwebp/dsp/alpha_processing_sse41.c b/media/libwebp/dsp/alpha_processing_sse41.c
index 986fde94e..e33c1aba4 100644
--- a/media/libwebp/dsp/alpha_processing_sse41.c
+++ b/media/libwebp/dsp/alpha_processing_sse41.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "../dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 
@@ -19,9 +19,9 @@
 
 //------------------------------------------------------------------------------
 
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
+                              int width, int height,
+                              uint8_t* alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
 extern void WebPInitAlphaProcessingSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
-  WebPExtractAlpha = ExtractAlpha;
+  WebPExtractAlpha = ExtractAlpha_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
 
diff --git a/media/libwebp/dsp/common_sse2.h b/media/libwebp/dsp/common_sse2.h
index 995d7cf4e..e9f1ebff4 100644
--- a/media/libwebp/dsp/common_sse2.h
+++ b/media/libwebp/dsp/common_sse2.h
@@ -128,9 +128,9 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b(
 // Pack the planar buffers
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
-                                       __m128i* const in2, __m128i* const in3,
-                                       __m128i* const in4, __m128i* const in5) {
+static WEBP_INLINE void VP8PlanarTo24b_SSE2(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
@@ -159,10 +159,10 @@ static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
 
 // Convert four packed four-channel buffers like argbargbargbargb... into the
 // split channels aaaaa ... rrrr ... gggg .... bbbbb ......
-static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0,
-                                        __m128i* const in1,
-                                        __m128i* const in2,
-                                        __m128i* const in3) {
+static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0,
+                                             __m128i* const in1,
+                                             __m128i* const in2,
+                                             __m128i* const in3) {
   // Column-wise transpose.
   const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
   const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);
diff --git a/media/libwebp/dsp/common_sse41.h b/media/libwebp/dsp/common_sse41.h
new file mode 100644
index 000000000..2f173c024
--- /dev/null
+++ b/media/libwebp/dsp/common_sse41.h
@@ -0,0 +1,132 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 code common to several files.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_COMMON_SSE41_H_
+#define WEBP_DSP_COMMON_SSE41_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+
+//------------------------------------------------------------------------------
+// Channel mixing.
+// Shuffles the input buffer as A0 0 0 A1 0 0 A2 ...
+#define WEBP_SSE41_SHUFF(OUT, IN0, IN1)    \
+  OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \
+  OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \
+  OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \
+  OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \
+  OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \
+  OUT##5 = _mm_shuffle_epi8(*IN1, shuff2);
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void VP8PlanarTo24b_SSE41(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5) {
+  __m128i R0, R1, R2, R3, R4, R5;
+  __m128i G0, G1, G2, G3, G4, G5;
+  __m128i B0, B1, B2, B3, B4, B5;
+
+  // Process R.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        -1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1);
+    WEBP_SSE41_SHUFF(R, in0, in1)
+  }
+
+  // Process G.
+  {
+    // Same as before, just shifted to the left by one and including the right
+    // padding.
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
+    const __m128i shuff1 = _mm_set_epi8(
+        10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5);
+    const __m128i shuff2 = _mm_set_epi8(
+        -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1);
+    WEBP_SSE41_SHUFF(G, in2, in3)
+  }
+
+  // Process B.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10);
+    WEBP_SSE41_SHUFF(B, in4, in5)
+  }
+
+  // OR the different channels.
+  {
+    const __m128i RG0 = _mm_or_si128(R0, G0);
+    const __m128i RG1 = _mm_or_si128(R1, G1);
+    const __m128i RG2 = _mm_or_si128(R2, G2);
+    const __m128i RG3 = _mm_or_si128(R3, G3);
+    const __m128i RG4 = _mm_or_si128(R4, G4);
+    const __m128i RG5 = _mm_or_si128(R5, G5);
+    *in0 = _mm_or_si128(RG0, B0);
+    *in1 = _mm_or_si128(RG1, B1);
+    *in2 = _mm_or_si128(RG2, B2);
+    *in3 = _mm_or_si128(RG3, B3);
+    *in4 = _mm_or_si128(RG4, B4);
+    *in5 = _mm_or_si128(RG5, B5);
+  }
+}
+
+#undef WEBP_SSE41_SHUFF
+
+// Convert four packed four-channel buffers like argbargbargbargb... into the
+// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
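VP8PlanarTo24b_SSE41 above and the VP8L32bToPlanar_SSE41 helper that follows perform, with pshufb-style shuffles, the data movement this scalar reference spells out: interleaving three planar channels into packed RGB triplets (the second helper is the inverse split). Illustrative code, not part of the patch:

    #include <stdint.h>

    /* rrr... ggg... bbb...  ->  rgbrgbrgb... */
    static void PlanarTo24b(const uint8_t* r, const uint8_t* g,
                            const uint8_t* b, int len, uint8_t* rgb) {
      int i;
      for (i = 0; i < len; ++i) {
        rgb[3 * i + 0] = r[i];
        rgb[3 * i + 1] = g[i];
        rgb[3 * i + 2] = b[i];
      }
    }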
+static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0,
+                                              __m128i* const in1,
+                                              __m128i* const in2,
+                                              __m128i* const in3) {
+  // aaaarrrrggggbbbb
+  const __m128i shuff0 =
+      _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+  const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0);
+  const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0);
+  const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0);
+  const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0);
+  // A0A1R0R1
+  // G0G1B0B1
+  // A2A3R2R3
+  // G0G1B0B1
+  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
+  const __m128i B1 = _mm_unpackhi_epi32(A0, A1);
+  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
+  const __m128i B3 = _mm_unpackhi_epi32(A2, A3);
+  *in3 = _mm_unpacklo_epi64(B0, B2);
+  *in2 = _mm_unpackhi_epi64(B0, B2);
+  *in1 = _mm_unpacklo_epi64(B1, B3);
+  *in0 = _mm_unpackhi_epi64(B1, B3);
+}
+
+#endif  // WEBP_USE_SSE41
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_COMMON_SSE41_H_
diff --git a/media/libwebp/dsp/dec.c b/media/libwebp/dsp/dec.c
index 007e985d8..a599d26bc 100644
--- a/media/libwebp/dsp/dec.c
+++ b/media/libwebp/dsp/dec.c
@@ -11,7 +11,9 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include <assert.h>
+
+#include "../dsp/dsp.h"
 #include "../dec/vp8i_dec.h"
 #include "../utils/utils.h"
 
@@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 // Transforms (Paragraph 14.4)
 
 #define STORE(x, y, v) \
-  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+  dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
 
 #define STORE2(y, dc, d, c) do {    \
   const int DC = (dc);              \
@@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define MUL1(a) ((((a) * 20091) >> 16) + (a))
 #define MUL2(a) (((a) * 35468) >> 16)
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformOne_C(const int16_t* in, uint8_t* dst) {
   int C[4 * 4], *tmp;
   int i;
   tmp = C;
@@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 }
 
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
   const int a = in[0] + 4;
   const int c4 = MUL2(in[4]);
   const int d4 = MUL1(in[4]);
@@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 #undef MUL2
 #undef STORE2
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne_C(in, dst);
   if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOne_C(in + 16, dst + 4);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void TransformUV(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* in, uint8_t* dst) {
   VP8Transform(in + 0 * 16, dst, 1);
   VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformDC_C(const int16_t* in, uint8_t* dst) {
   const int DC = in[0] + 4;
   int i, j;
   for (j = 0; j < 4; ++j) {
@@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
     }
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
   if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
   if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
   if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
 
 //------------------------------------------------------------------------------
 // Paragraph 14.3
 
-static void TransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformWHT_C(const int16_t* in, int16_t* out) {
   int tmp[16];
   int i;
   for (i = 0; i < 4; ++i) {
@@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
     out += 64;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
@@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
 #define DST(x, y) dst[(x) + (y) * BPS]
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   const uint8_t* top = dst - BPS;
   const uint8_t* const clip0 = VP8kclip1 - top[-1];
@@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
     dst += BPS;
   }
 }
-static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
+static void TM4_C(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16_C(uint8_t* dst) { TrueMotion(dst, 16); }
 
 //------------------------------------------------------------------------------
 // 16x16
 
-static void VE16(uint8_t* dst) {     // vertical
+static void VE16_C(uint8_t* dst) {     // vertical
   int j;
   for (j = 0; j < 16; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 16);
   }
 }
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_C(uint8_t* dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
     memset(dst, dst[-1], 16);
@@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
   }
 }
 
-static void DC16(uint8_t* dst) {    // DC
+static void DC16_C(uint8_t* dst) {    // DC
   int DC = 16;
   int j;
   for (j = 0; j < 16; ++j) {
@@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) {    // DC
   Put16(DC >> 5, dst);
 }
 
-static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop_C(uint8_t* dst) {   // DC with top samples not available
   int DC = 8;
   int j;
   for (j = 0; j < 16; ++j) {
@@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
   Put16(DC >> 4, dst);
 }
 
-static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft_C(uint8_t* dst) {  // DC with left samples not available
   int DC = 8;
   int i;
   for (i = 0; i < 16; ++i) {
@@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
   Put16(DC >> 4, dst);
 }
 
-static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+static void DC16NoTopLeft_C(uint8_t* dst) {  // DC with no top and left samples
   Put16(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 
@@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
-static void VE4(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE4_C(uint8_t* dst) {    // vertical
   const uint8_t* top = dst - BPS;
   const uint8_t vals[4] = {
     AVG3(top[-1], top[0], top[1]),
@@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) {    // vertical
     memcpy(dst + i * BPS, vals, sizeof(vals));
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HE4(uint8_t* dst) {    // horizontal
+static void HE4_C(uint8_t* dst) {    // horizontal
   const int A = dst[-1 - BPS];
   const int B = dst[-1];
   const int C = dst[-1 + BPS];
@@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) {    // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }
 
-static void DC4(uint8_t* dst) {   // DC
+#if !WEBP_NEON_OMIT_C_CODE
+static void DC4_C(uint8_t* dst) {   // DC
   uint32_t dc = 4;
   int i;
   for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) {   // DC
   for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }
 
-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_C(uint8_t* dst) {   // Down-right
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) {   // Down-right
   DST(3, 0) = AVG3(D, C, B);
 }
 
-static void LD4(uint8_t* dst) {   // Down-Left
+static void LD4_C(uint8_t* dst) {   // Down-Left
   const int A = dst[0 - BPS];
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
@@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) {   // Down-Left
   DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
   DST(3, 3) = AVG3(G, H, H);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void VR4(uint8_t* dst) {   // Vertical-Right
+static void VR4_C(uint8_t* dst) {   // Vertical-Right
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) {   // Vertical-Right
   DST(3, 1) = AVG3(B, C, D);
 }
 
-static void VL4(uint8_t* dst) {   // Vertical-Left
+static void VL4_C(uint8_t* dst) {   // Vertical-Left
   const int A = dst[0 - BPS];
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
@@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) {   // Vertical-Left
   DST(3, 3) = AVG3(F, G, H);
 }
 
-static void HU4(uint8_t* dst) {   // Horizontal-Up
+static void HU4_C(uint8_t* dst) {   // Horizontal-Up
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) {   // Horizontal-Up
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 
-static void HD4(uint8_t* dst) {  // Horizontal-Down
+static void HD4_C(uint8_t* dst) {  // Horizontal-Down
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
 
 //------------------------------------------------------------------------------
 // Chroma
 
-static void VE8uv(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE8uv_C(uint8_t* dst) {    // vertical
   int j;
   for (j = 0; j < 8; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 8);
   }
 }
 
-static void HE8uv(uint8_t* dst) {    // horizontal
+static void HE8uv_C(uint8_t* dst) {    // horizontal
   int j;
   for (j = 0; j < 8; ++j) {
     memset(dst, dst[-1], 8);
@@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
   }
 }
 
-static void DC8uv(uint8_t* dst) {     // DC
+static void DC8uv_C(uint8_t* dst) {     // DC
   int dc0 = 8;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) {     // DC
   Put8x8uv(dc0 >> 4, dst);
 }
 
-static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft_C(uint8_t* dst) {   // DC with no left samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
   Put8x8uv(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop_C(uint8_t* dst) {  // DC with no top samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
   Put8x8uv(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+static void DC8uvNoTopLeft_C(uint8_t* dst) {    // DC with nothing
   Put8x8uv(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
 
 //------------------------------------------------------------------------------
 // Edge filtering functions
 
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 // 4 pixels in, 2 pixels out
-static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
   const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
@@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
 }
 
 // 4 pixels in, 4 pixels out
-static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0);
   const int a1 = VP8ksclip2[(a + 4) >> 3];
@@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
 }
 
 // 6 pixels in, 6 pixels out
-static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
   const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   const int q0 = p[0], q1 = p[step], q2 = p[2*step];
   const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
   p[ 2*step] = VP8kclip1[q2 - a3];
 }
 
-static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static WEBP_INLINE int needs_filter2(const uint8_t* p,
-                                     int step, int t, int it) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
+                                      int step, int t, int it) {
   const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
   const int p0 = p[-step], q0 = p[0];
   const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
          VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
          VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
   int i;
   const int thresh2 = 2 * thresh + 1;
   for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh2)) {
-      do_filter2(p + i, stride);
+    if (NeedsFilter_C(p + i, stride, thresh2)) {
+      DoFilter2_C(p + i, stride);
     }
   }
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
   int i;
   const int thresh2 = 2 * thresh + 1;
   for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh2)) {
-      do_filter2(p + i * stride, 1);
+    if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
+      DoFilter2_C(p + i * stride, 1);
    }
  }
 }
 
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_C(p, stride, thresh);
   }
 }
 
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_C(p, stride, thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-static WEBP_INLINE void FilterLoop26(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
   const int thresh2 = 2 * thresh + 1;
   while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
       } else {
-        do_filter6(p, hstride);
+        DoFilter6_C(p, hstride);
       }
     }
     p += vstride;
  }
 }
 
-static WEBP_INLINE void FilterLoop24(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
   const int thresh2 = 2 * thresh + 1;
   while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
      } else {
-        do_filter4(p, hstride);
+        DoFilter4_C(p, hstride);
      }
    }
    p += vstride;
  }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
+#if !WEBP_NEON_OMIT_C_CODE
 // on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+static void VFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
 }
 
-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+static void HFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
 }
 
 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
+#if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE
+static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 
-static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
-                             int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
+                               int dst_stride) {
   int i, j;
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i) {
@@ -701,62 +741,69 @@ extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 extern void VP8DspInitMSA(void);
 
-static volatile VP8CPUInfo dec_last_cpuinfo_used =
-    (VP8CPUInfo)&dec_last_cpuinfo_used;
+WEBP_DSP_INIT_FUNC(VP8DspInit) {
+  VP8InitClipTables();
 
-WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
-  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8TransformWHT = TransformWHT_C;
+  VP8Transform = TransformTwo_C;
+  VP8TransformDC = TransformDC_C;
+  VP8TransformAC3 = TransformAC3_C;
+#endif
+  VP8TransformUV = TransformUV_C;
+  VP8TransformDCUV = TransformDCUV_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8VFilter16 = VFilter16_C;
+  VP8VFilter16i = VFilter16i_C;
+  VP8HFilter16 = HFilter16_C;
+  VP8VFilter8 = VFilter8_C;
+  VP8VFilter8i = VFilter8i_C;
+  VP8SimpleVFilter16 = SimpleVFilter16_C;
+  VP8SimpleHFilter16 = SimpleHFilter16_C;
+  VP8SimpleVFilter16i = SimpleVFilter16i_C;
+  VP8SimpleHFilter16i = SimpleHFilter16i_C;
+#endif
 
-  VP8InitClipTables();
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8HFilter16i = HFilter16i_C;
+  VP8HFilter8 = HFilter8_C;
+  VP8HFilter8i = HFilter8i_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma4[0] = DC4_C;
+  VP8PredLuma4[1] = TM4_C;
+  VP8PredLuma4[2] = VE4_C;
+  VP8PredLuma4[4] = RD4_C;
+  VP8PredLuma4[6] = LD4_C;
+#endif
+
+  VP8PredLuma4[3] = HE4_C;
+  VP8PredLuma4[5] = VR4_C;
+  VP8PredLuma4[7] = VL4_C;
+  VP8PredLuma4[8] = HD4_C;
+  VP8PredLuma4[9] = HU4_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma16[0] = DC16_C;
+  VP8PredLuma16[1] = TM16_C;
+  VP8PredLuma16[2] = VE16_C;
+  VP8PredLuma16[3] = HE16_C;
+  VP8PredLuma16[4] = DC16NoTop_C;
+  VP8PredLuma16[5] = DC16NoLeft_C;
+  VP8PredLuma16[6] = DC16NoTopLeft_C;
+
+  VP8PredChroma8[0] = DC8uv_C;
+  VP8PredChroma8[1] = TM8uv_C;
+  VP8PredChroma8[2] = VE8uv_C;
+  VP8PredChroma8[3] = HE8uv_C;
+  VP8PredChroma8[4] = DC8uvNoTop_C;
+  VP8PredChroma8[5] = DC8uvNoLeft_C;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_C;
+#endif
 
-  VP8TransformWHT = TransformWHT;
-  VP8Transform = TransformTwo;
-  VP8TransformUV = TransformUV;
-  VP8TransformDC = TransformDC;
-  VP8TransformDCUV = TransformDCUV;
-  VP8TransformAC3 = TransformAC3;
-
-  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
-
-  VP8PredLuma4[0] = DC4;
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[3] = HE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
-  VP8PredLuma4[6] = LD4;
-  VP8PredLuma4[7] = VL4;
-  VP8PredLuma4[8] = HD4;
-  VP8PredLuma4[9] = HU4;
-
-  VP8PredLuma16[0] = DC16;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
-
-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
-
-  VP8DitherCombine8x8 = DitherCombine8x8;
+  VP8DitherCombine8x8 = DitherCombine8x8_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
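The loop-filter pointers installed above all funnel into the DoFilter2/4/6 kernels renamed earlier in this diff. Their core step computes a clamped delta a = 3*(q0-p0) + clip(p1-q1) and nudges the two pixels nearest the edge toward each other. A worked scalar version using inline clamps instead of libwebp's lookup tables (VP8ksclip1/2, VP8kclip1); illustrative only:

    #include <stdint.h>

    static int ClampS8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }
    static int ClampU8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    /* 4 pixels in, 2 pixels out: the 'simple' VP8 edge filter step. */
    static void DoFilter2(uint8_t* p, int step) {
      const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
      const int a  = 3 * (q0 - p0) + ClampS8(p1 - q1);
      const int a1 = ClampS8((a + 4) >> 3);
      const int a2 = ClampS8((a + 3) >> 3);
      p[-step] = (uint8_t)ClampU8(p0 + a2);  /* move p0 toward q0 */
      p[0]     = (uint8_t)ClampU8(q0 - a1);  /* move q0 toward p0 */
    }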
   if (VP8GetCPUInfo != NULL) {
@@ -770,11 +817,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8DspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8DspInitMIPS32();
@@ -791,5 +833,55 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
     }
 #endif
   }
-  dec_last_cpuinfo_used = VP8GetCPUInfo;
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8DspInitNEON();
+  }
+#endif
+
+  assert(VP8TransformWHT != NULL);
+  assert(VP8Transform != NULL);
+  assert(VP8TransformDC != NULL);
+  assert(VP8TransformAC3 != NULL);
+  assert(VP8TransformUV != NULL);
+  assert(VP8TransformDCUV != NULL);
+  assert(VP8VFilter16 != NULL);
+  assert(VP8HFilter16 != NULL);
+  assert(VP8VFilter8 != NULL);
+  assert(VP8HFilter8 != NULL);
+  assert(VP8VFilter16i != NULL);
+  assert(VP8HFilter16i != NULL);
+  assert(VP8VFilter8i != NULL);
+  assert(VP8HFilter8i != NULL);
+  assert(VP8SimpleVFilter16 != NULL);
+  assert(VP8SimpleHFilter16 != NULL);
+  assert(VP8SimpleVFilter16i != NULL);
+  assert(VP8SimpleHFilter16i != NULL);
+  assert(VP8PredLuma4[0] != NULL);
+  assert(VP8PredLuma4[1] != NULL);
+  assert(VP8PredLuma4[2] != NULL);
+  assert(VP8PredLuma4[3] != NULL);
+  assert(VP8PredLuma4[4] != NULL);
+  assert(VP8PredLuma4[5] != NULL);
+  assert(VP8PredLuma4[6] != NULL);
+  assert(VP8PredLuma4[7] != NULL);
+  assert(VP8PredLuma4[8] != NULL);
+  assert(VP8PredLuma4[9] != NULL);
+  assert(VP8PredLuma16[0] != NULL);
+  assert(VP8PredLuma16[1] != NULL);
+  assert(VP8PredLuma16[2] != NULL);
+  assert(VP8PredLuma16[3] != NULL);
+  assert(VP8PredLuma16[4] != NULL);
+  assert(VP8PredLuma16[5] != NULL);
+  assert(VP8PredLuma16[6] != NULL);
+  assert(VP8PredChroma8[0] != NULL);
+  assert(VP8PredChroma8[1] != NULL);
+  assert(VP8PredChroma8[2] != NULL);
+  assert(VP8PredChroma8[3] != NULL);
+  assert(VP8PredChroma8[4] != NULL);
+  assert(VP8PredChroma8[5] != NULL);
+  assert(VP8PredChroma8[6] != NULL);
+  assert(VP8DitherCombine8x8 != NULL);
 }
diff --git a/media/libwebp/dsp/dec_clip_tables.c b/media/libwebp/dsp/dec_clip_tables.c
index 74ba34c0b..9c86011d2 100644
--- a/media/libwebp/dsp/dec_clip_tables.c
+++ b/media/libwebp/dsp/dec_clip_tables.c
@@ -11,11 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "../dsp/dsp.h"
 
-#define USE_STATIC_TABLES   // undefine to have run-time table initialization
+// define to 0 to have run-time table initialization
+#if !defined(USE_STATIC_TABLES)
+#define USE_STATIC_TABLES 1   // ALTERNATE_CODE
+#endif
 
-#ifdef USE_STATIC_TABLES
+#if (USE_STATIC_TABLES == 1)
 
 static const uint8_t abs0[255 + 255 + 1] = {
   0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;
 
-#endif
+#endif    // USE_STATIC_TABLES
 
 const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
 const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
-#if !defined(USE_STATIC_TABLES)
+#if (USE_STATIC_TABLES == 0)
   int i;
   if (!tables_ok) {
     for (i = -255; i <= 255; ++i) {
diff --git a/media/libwebp/dsp/dec_neon.c b/media/libwebp/dsp/dec_neon.c
index 34796cf4a..e8341327e 100644
--- a/media/libwebp/dsp/dec_neon.c
+++ b/media/libwebp/dsp/dec_neon.c
@@ -12,43 +12,23 @@
 // Authors: Somnath Banerjee (somnath@google.com)
 //          Johann Koenig (johannkoenig@google.com)
 
-#include "./dsp.h"
+#include "../dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
-#include "./neon.h"
+#include "../dsp/neon.h"
 #include "../dec/vp8i_dec.h"
 
 //------------------------------------------------------------------------------
 // NxM Loading functions
 
-// Load/Store vertical edge
-#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                  \
-  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n"  \
-  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n"  \
-  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n"  \
-  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n"  \
-  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n"  \
-  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n"  \
-  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n"  \
-  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
-
-#define STORE8x2(c1, c2, p, stride)                                              \
-  "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                        \
-  "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                        \
-  "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                        \
-  "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                        \
-  "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                        \
-  "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                        \
-  "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                        \
-  "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
-
 #if !defined(WORK_AROUND_GCC)
 
 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
 // (register alloc, probably). The variants somewhat mitigate the problem, but
 // not quite. HFilter16i() remains problematic.
-static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
+static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
+                                            int stride) {
   const uint8x8_t zero = vdup_n_u8(0);
   uint8x8x4_t out;
   INIT_VECTOR4(out, zero, zero, zero, zero);
@@ -63,13 +43,15 @@ static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
   return out;
 }
 
-static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
-  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
-  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
+  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
+  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
@@ -83,9 +65,11 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
     src += stride;                                                             \
   } while (0)
 
-static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   const uint32x4_t zero = vdupq_n_u32(0);
   uint32x4x4_t in;
   INIT_VECTOR4(in, zero, zero, zero, zero);
@@ -126,40 +110,40 @@ static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
 
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p3, uint8x16_t* const p2,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1,
-                                 uint8x16_t* const q2, uint8x16_t* const q3) {
-  Load4x16(src - 2, stride, p3, p2, p1, p0);
-  Load4x16(src + 2, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load8x16_NEON(
+    const uint8_t* const src, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
+  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
+  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
 }
 
-static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   *p1 = vld1q_u8(src - 2 * stride);
   *p0 = vld1q_u8(src - 1 * stride);
   *q0 = vld1q_u8(src + 0 * stride);
   *q1 = vld1q_u8(src + 1 * stride);
 }
 
-static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p3, uint8x16_t* const p2,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1,
-                                 uint8x16_t* const q2, uint8x16_t* const q3) {
-  Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
-  Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load16x8_NEON(
+    const uint8_t* const src, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
+  Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
+  Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
 }
 
-static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
-                                  const uint8_t* const v,
-                                  int stride,
-                                  uint8x16_t* const p3, uint8x16_t* const p2,
-                                  uint8x16_t* const p1, uint8x16_t* const p0,
-                                  uint8x16_t* const q0, uint8x16_t* const q1,
-                                  uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2_NEON(
+    const uint8_t* const u, const uint8_t* const v, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
   // and the v-samples on the higher half.
   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
@@ -177,13 +161,11 @@ static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
 
 #define LOAD_UV_8(ROW) \
   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
 
-static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
-                                   const uint8_t* const v,
-                                   int stride,
-                                   uint8x16_t* const p3, uint8x16_t* const p2,
-                                   uint8x16_t* const p1, uint8x16_t* const p0,
-                                   uint8x16_t* const q0, uint8x16_t* const q1,
-                                   uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2T_NEON(
+    const uint8_t* const u, const uint8_t* const v, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
   // and the v-samples on the higher half.
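As the comment above notes, the 8-wide u and v rows are packed into the low and high halves of one 16-lane register, so a single filter pass covers both chroma planes at once. The packing step in isolation (ARM NEON intrinsics; illustrative, not part of the patch):

    #include <arm_neon.h>

    /* low half = 8 u-samples, high half = 8 v-samples */
    static uint8x16_t PackUV(const uint8_t* u, const uint8_t* v) {
      return vcombine_u8(vld1_u8(u), vld1_u8(v));
    }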
   const uint8x16_t row0 = LOAD_UV_8(0);
@@ -238,8 +220,8 @@ static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
 
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
-                                 uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
+                                      uint8_t* const dst, int stride) {
   vst2_lane_u8(dst + 0 * stride, v, 0);
   vst2_lane_u8(dst + 1 * stride, v, 1);
   vst2_lane_u8(dst + 2 * stride, v, 2);
@@ -250,20 +232,20 @@ static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
   vst2_lane_u8(dst + 7 * stride, v, 7);
 }
 
-static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
+                                       uint8_t* const dst, int stride) {
   uint8x8x2_t lo, hi;
   lo.val[0] = vget_low_u8(p0);
   lo.val[1] = vget_low_u8(q0);
   hi.val[0] = vget_high_u8(p0);
   hi.val[1] = vget_high_u8(q0);
-  Store2x8(lo, dst - 1 + 0 * stride, stride);
-  Store2x8(hi, dst - 1 + 8 * stride, stride);
+  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
+  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
 }
 
 #if !defined(WORK_AROUND_GCC)
-static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
-                                 uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
+                                      uint8_t* const dst, int stride) {
   vst4_lane_u8(dst + 0 * stride, v, 0);
   vst4_lane_u8(dst + 1 * stride, v, 1);
   vst4_lane_u8(dst + 2 * stride, v, 2);
@@ -274,9 +256,9 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
   vst4_lane_u8(dst + 7 * stride, v, 7);
 }
 
-static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
-                                  const uint8x16_t q0, const uint8x16_t q1,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                       const uint8x16_t q0, const uint8x16_t q1,
+                                       uint8_t* const dst, int stride) {
   uint8x8x4_t lo, hi;
   INIT_VECTOR4(lo,
                vget_low_u8(p1), vget_low_u8(p0),
@@ -284,27 +266,28 @@ static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
   INIT_VECTOR4(hi,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
-  Store4x8(lo, dst - 2 + 0 * stride, stride);
-  Store4x8(hi, dst - 2 + 8 * stride, stride);
+  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
+  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
 }
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
+                                       uint8_t* const dst, int stride) {
   vst1q_u8(dst - stride, p0);
   vst1q_u8(dst, q0);
 }
 
-static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
-                                  const uint8x16_t q0, const uint8x16_t q1,
-                                  uint8_t* const dst, int stride) {
-  Store16x2(p1, p0, dst - stride, stride);
-  Store16x2(q0, q1, dst + stride, stride);
+static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                       const uint8x16_t q0, const uint8x16_t q1,
+                                       uint8_t* const dst, int stride) {
+  Store16x2_NEON(p1, p0, dst - stride, stride);
+  Store16x2_NEON(q0, q1, dst + stride, stride);
 }
 
-static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
-                                   uint8_t* const u, uint8_t* const v,
-                                   int stride) {
+static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
+                                        const uint8x16_t q0,
+                                        uint8_t* const u, uint8_t* const v,
+                                        int stride) {
   // p0 and q0 contain the u+v samples packed in low/high halves.
vst1_u8(u - stride, vget_low_u8(p0)); vst1_u8(u, vget_low_u8(q0)); @@ -312,13 +295,15 @@ static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0, vst1_u8(v, vget_high_u8(q0)); } -static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - uint8_t* const u, uint8_t* const v, - int stride) { +static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1, + const uint8x16_t p0, + const uint8x16_t q0, + const uint8x16_t q1, + uint8_t* const u, uint8_t* const v, + int stride) { // The p1...q1 registers contain the u+v samples packed in low/high halves. - Store8x2x2(p1, p0, u - stride, v - stride, stride); - Store8x2x2(q0, q1, u + stride, v + stride, stride); + Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride); + Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride); } #if !defined(WORK_AROUND_GCC) @@ -329,11 +314,10 @@ static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0, (DST) += stride; \ } while (0) -static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1, - const uint8x16_t p0, const uint8x16_t q0, - const uint8x16_t q1, const uint8x16_t q2, - uint8_t* u, uint8_t* v, - int stride) { +static WEBP_INLINE void Store6x8x2_NEON( + const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, + uint8_t* u, uint8_t* v, int stride) { uint8x8x3_t u0, u1, v0, v1; INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)); INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)); @@ -358,10 +342,12 @@ static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1, } #undef STORE6_LANE -static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - uint8_t* const u, uint8_t* const v, - int stride) { +static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1, + const uint8x16_t p0, + const uint8x16_t q0, + const uint8x16_t q1, + uint8_t* const u, uint8_t* const v, + int stride) { uint8x8x4_t u0, v0; INIT_VECTOR4(u0, vget_low_u8(p1), vget_low_u8(p0), @@ -390,15 +376,15 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, #endif // !WORK_AROUND_GCC // Zero extend 'v' to an int16x8_t. -static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) { +static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) { return vreinterpretq_s16_u16(vmovl_u8(v)); } // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result // to the corresponding rows of 'dst'. -static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, - const int16x8_t dst01, - const int16x8_t dst23) { +static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst, + const int16x8_t dst01, + const int16x8_t dst23) { // Unsigned saturate to 8b. const uint8x8_t dst01_u8 = vqmovun_s16(dst01); const uint8x8_t dst23_u8 = vqmovun_s16(dst23); @@ -410,8 +396,9 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); } -static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, - uint8_t* const dst) { +static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01, + const int16x8_t row23, + uint8_t* const dst) { uint32x2_t dst01 = vdup_n_u32(0); uint32x2_t dst23 = vdup_n_u32(0); @@ -423,23 +410,23 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, { // Convert to 16b. 
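SaturateAndStore4x4 and Add4x4 together implement the usual reconstruction step: widen the predicted pixels to 16 bits, add the residual descaled by a rounding >> 3, and saturate back to the 0..255 range. Per pixel, that is the following (a scalar sketch of what vrsraq_n_s16 and vqmovun_s16 do lane by lane):

#include <stdint.h>

static uint8_t clip_8b(int v) {  /* vqmovun_s16: unsigned 8-bit saturation */
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

static void add_residual(uint8_t* dst, int16_t res) {
  /* vrsraq_n_s16(dst16, row, 3): rounded shift right by 3, then accumulate. */
  *dst = clip_8b(*dst + ((res + 4) >> 3));
}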
- const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01)); - const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23)); + const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01)); + const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23)); // Descale with rounding. const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); // Add the inverse transform. - SaturateAndStore4x4(dst, out01, out23); + SaturateAndStore4x4_NEON(dst, out01, out23); } } //----------------------------------------------------------------------------- // Simple In-loop filtering (Paragraph 15.2) -static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - int thresh) { +static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + int thresh) { const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh); const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0) const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1) @@ -450,18 +437,18 @@ static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, return mask; } -static int8x16_t FlipSign(const uint8x16_t v) { +static int8x16_t FlipSign_NEON(const uint8x16_t v) { const uint8x16_t sign_bit = vdupq_n_u8(0x80); return vreinterpretq_s8_u8(veorq_u8(v, sign_bit)); } -static uint8x16_t FlipSignBack(const int8x16_t v) { +static uint8x16_t FlipSignBack_NEON(const int8x16_t v) { const int8x16_t sign_bit = vdupq_n_s8(0x80); return vreinterpretq_u8_s8(veorq_s8(v, sign_bit)); } -static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, - const int8x16_t q0, const int8x16_t q1) { +static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0, + const int8x16_t q0, const int8x16_t q1) { const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1) const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0) @@ -470,7 +457,7 @@ static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, return s3; } -static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { +static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) { const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) @@ -479,9 +466,10 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { //------------------------------------------------------------------------------ -static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s, - const int8x16_t delta, - int8x16_t* const op0, int8x16_t* const oq0) { +static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s, + const int8x16_t delta, + int8x16_t* const op0, + int8x16_t* const oq0) { const int8x16_t kCst3 = vdupq_n_s8(0x03); const int8x16_t kCst4 = vdupq_n_s8(0x04); const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); @@ -494,9 +482,9 @@ static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s, #if defined(WEBP_USE_INTRINSICS) -static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, - const int8x16_t delta, - uint8x16_t* const op0, uint8x16_t* const oq0) { +static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s, + const int8x16_t delta, + uint8x16_t* const op0, uint8x16_t* const oq0) { const 
int8x16_t kCst3 = vdupq_n_s8(0x03); const int8x16_t kCst4 = vdupq_n_s8(0x04); const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); @@ -505,45 +493,66 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); const int8x16_t sp0 = vqaddq_s8(p0s, delta3); const int8x16_t sq0 = vqsubq_s8(q0s, delta4); - *op0 = FlipSignBack(sp0); - *oq0 = FlipSignBack(sq0); -} - -static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - const uint8x16_t mask, - uint8x16_t* const op0, uint8x16_t* const oq0) { - const int8x16_t p1s = FlipSign(p1); - const int8x16_t p0s = FlipSign(p0); - const int8x16_t q0s = FlipSign(q0); - const int8x16_t q1s = FlipSign(q1); - const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); + *op0 = FlipSignBack_NEON(sp0); + *oq0 = FlipSignBack_NEON(sq0); +} + +static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t mask, + uint8x16_t* const op0, uint8x16_t* const oq0) { + const int8x16_t p1s = FlipSign_NEON(p1); + const int8x16_t p0s = FlipSign_NEON(p0); + const int8x16_t q0s = FlipSign_NEON(q0); + const int8x16_t q1s = FlipSign_NEON(q1); + const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); - ApplyFilter2(p0s, q0s, delta1, op0, oq0); + ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0); } -static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) { uint8x16_t p1, p0, q0, q1, op0, oq0; - Load16x4(p, stride, &p1, &p0, &q0, &q1); + Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1); { - const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); - DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); + const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh); + DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0); } - Store16x2(op0, oq0, p, stride); + Store16x2_NEON(op0, oq0, p, stride); } -static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) { uint8x16_t p1, p0, q0, q1, oq0, op0; - Load4x16(p, stride, &p1, &p0, &q0, &q1); + Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1); { - const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); - DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); + const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh); + DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0); } - Store2x16(op0, oq0, p, stride); + Store2x16_NEON(op0, oq0, p, stride); } #else +// Load/Store vertical edge +#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ + "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ + "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ + "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \ + "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n" + +#define STORE8x2(c1, c2, p, stride) \ + "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[2], " 
#c2 "[2]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n" + #define QRegs "q0", "q1", "q2", "q3", \ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -592,7 +601,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ FLIP_SIGN_BIT2(p0, q0, q10) -static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) { __asm__ volatile ( "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride @@ -613,7 +622,7 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { ); } -static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) { __asm__ volatile ( "sub r4, %[p], #2 \n" // base1 = p - 2 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride @@ -639,30 +648,33 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { ); } +#undef LOAD8x4 +#undef STORE8x2 + #endif // WEBP_USE_INTRINSICS -static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) { uint32_t k; for (k = 3; k != 0; --k) { p += 4 * stride; - SimpleVFilter16(p, stride, thresh); + SimpleVFilter16_NEON(p, stride, thresh); } } -static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) { uint32_t k; for (k = 3; k != 0; --k) { p += 4; - SimpleHFilter16(p, stride, thresh); + SimpleHFilter16_NEON(p, stride, thresh); } } //------------------------------------------------------------------------------ // Complex In-loop filtering (Paragraph 15.3) -static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - int hev_thresh) { +static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + int hev_thresh) { const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) @@ -671,11 +683,11 @@ static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, return mask; } -static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, - const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - const uint8x16_t q2, const uint8x16_t q3, - int ithresh, int thresh) { +static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2, + const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t q2, const uint8x16_t q3, + int ithresh, int thresh) { const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1) @@ -689,14 +701,14 @@ static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t max12 = vmaxq_u8(max1, max2); const uint8x16_t max123 = vmaxq_u8(max12, max3); const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123); - const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh); + const uint8x16_t mask1 = 
NeedsFilter_NEON(p1, p0, q0, q1, thresh); const uint8x16_t mask = vandq_u8(mask1, mask2); return mask; } // 4-points filter -static void ApplyFilter4( +static void ApplyFilter4_NEON( const int8x16_t p1, const int8x16_t p0, const int8x16_t q0, const int8x16_t q1, const int8x16_t delta0, @@ -709,47 +721,47 @@ static void ApplyFilter4( const int8x16_t a1 = vshrq_n_s8(delta1, 3); const int8x16_t a2 = vshrq_n_s8(delta2, 3); const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1 - *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2) - *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) - *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3) - *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3) + *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2) + *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1) + *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3) + *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3) } -static void DoFilter4( +static void DoFilter4_NEON( const uint8x16_t p1, const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t mask, const uint8x16_t hev_mask, uint8x16_t* const op1, uint8x16_t* const op0, uint8x16_t* const oq0, uint8x16_t* const oq1) { // This is a fused version of DoFilter2() calling ApplyFilter2 directly - const int8x16_t p1s = FlipSign(p1); - int8x16_t p0s = FlipSign(p0); - int8x16_t q0s = FlipSign(q0); - const int8x16_t q1s = FlipSign(q1); + const int8x16_t p1s = FlipSign_NEON(p1); + int8x16_t p0s = FlipSign_NEON(p0); + int8x16_t q0s = FlipSign_NEON(q0); + const int8x16_t q1s = FlipSign_NEON(q1); const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); // do_filter2 part (simple loopfilter on pixels with hev) { - const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); + const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); const int8x16_t simple_lf_delta = vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); - ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); + ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s); } // do_filter4 part (complex loopfilter on pixels without hev) { - const int8x16_t delta0 = GetBaseDelta0(p0s, q0s); + const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s); // we use: (mask & hev_mask) ^ mask = mask & !hev_mask const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); const int8x16_t complex_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); - ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); + ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); } } // 6-points filter -static void ApplyFilter6( +static void ApplyFilter6_NEON( const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, const int8x16_t delta, @@ -778,35 +790,35 @@ static void ApplyFilter6( const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); - *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) - *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) - *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) + *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1) + *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1) + *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2) +
*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2) + *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3) + *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3) } -static void DoFilter6( +static void DoFilter6_NEON( const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, const uint8x16_t mask, const uint8x16_t hev_mask, uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { // This is a fused version of DoFilter2() calling ApplyFilter2 directly - const int8x16_t p2s = FlipSign(p2); - const int8x16_t p1s = FlipSign(p1); - int8x16_t p0s = FlipSign(p0); - int8x16_t q0s = FlipSign(q0); - const int8x16_t q1s = FlipSign(q1); - const int8x16_t q2s = FlipSign(q2); + const int8x16_t p2s = FlipSign_NEON(p2); + const int8x16_t p1s = FlipSign_NEON(p1); + int8x16_t p0s = FlipSign_NEON(p0); + int8x16_t q0s = FlipSign_NEON(q0); + const int8x16_t q1s = FlipSign_NEON(q1); + const int8x16_t q2s = FlipSign_NEON(q2); const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); - const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); + const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); // do_filter2 part (simple loopfilter on pixels with hev) { const int8x16_t simple_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); - ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); + ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s); } // do_filter6 part (complex loopfilter on pixels without hev) @@ -815,65 +827,65 @@ static void DoFilter6( const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); const int8x16_t complex_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); - ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, - op2, op1, op0, oq0, oq1, oq2); + ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, + op2, op1, op0, oq0, oq1, oq2); } } // on macroblock edges -static void VFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store16x2(op2, op1, p - 2 * stride, stride); - Store16x2(op0, oq0, p + 0 * stride, stride); - Store16x2(oq1, oq2, p + 2 * stride, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store16x2_NEON(op2, op1, p - 2 * stride, stride); + Store16x2_NEON(op0, oq0, p + 0 * stride, stride); + Store16x2_NEON(oq1, oq2, p + 2 * stride, stride); } } -static void HFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, 
&q2, &q3); + Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store2x16(op2, op1, p - 2, stride); - Store2x16(op0, oq0, p + 0, stride); - Store2x16(oq1, oq2, p + 2, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store2x16_NEON(op2, op1, p - 2, stride); + Store2x16_NEON(op0, oq0, p + 0, stride); + Store2x16_NEON(oq1, oq2, p + 2, stride); } } // on three inner edges -static void VFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16i_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint32_t k; uint8x16_t p3, p2, p1, p0; - Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0); + Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0); for (k = 3; k != 0; --k) { uint8x16_t q0, q1, q2, q3; p += 4 * stride; - Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3); + Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3); { const uint8x16_t mask = - NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. 
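That remark is the heart of the inner-edge loops: consecutive 4-row spans share half of their inputs, so each iteration loads only the four new rows and lets the previous span's q2/q3 become the next span's p1/p0. A compact control-flow sketch (load4 and filter_and_store are hypothetical stand-ins for the NEON helpers, not library functions):

#include <stdint.h>

typedef struct { uint8_t lane[16]; } Vec;  /* stand-in for uint8x16_t */
typedef void (*Load4Fn)(const uint8_t*, int, Vec*, Vec*, Vec*, Vec*);
typedef void (*Filter4Fn)(uint8_t*, int, Vec*, Vec*, Vec*, Vec*);

static void inner_edges_sketch(uint8_t* p, int stride,
                               Load4Fn load4, Filter4Fn filter_and_store) {
  Vec p3, p2, p1, p0, q0, q1, q2, q3;
  uint32_t k;
  load4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    p += 4 * stride;
    load4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    filter_and_store(p, stride, &p1, &p0, &q0, &q1);
    p1 = q2;  /* the rows just processed on the q side lie above */
    p0 = q3;  /* the next edge, so they are re-used, not re-loaded. */
  }
}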
- DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); - Store16x4(p1, p0, p3, p2, p, stride); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); + Store16x4_NEON(p1, p0, p3, p2, p, stride); p1 = q2; p0 = q3; } @@ -881,21 +893,21 @@ static void VFilter16i(uint8_t* p, int stride, } #if !defined(WORK_AROUND_GCC) -static void HFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16i_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint32_t k; uint8x16_t p3, p2, p1, p0; - Load4x16(p + 2, stride, &p3, &p2, &p1, &p0); + Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0); for (k = 3; k != 0; --k) { uint8x16_t q0, q1, q2, q3; p += 4; - Load4x16(p + 2, stride, &q0, &q1, &q2, &q3); + Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3); { const uint8x16_t mask = - NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); - DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); - Store4x16(p1, p0, p3, p2, p, stride); + NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); + Store4x16_NEON(p1, p0, p3, p2, p, stride); p1 = q2; p0 = q3; } @@ -904,67 +916,67 @@ static void HFilter16i(uint8_t* p, int stride, #endif // !WORK_AROUND_GCC // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride); - Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride); - Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride); + Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride); + Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); } } -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4 * stride; v += 4 * stride; - Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, 
thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op1, op0, oq0, oq1; - DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); - Store8x4x2(op1, op0, oq0, oq1, u, v, stride); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); + Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride); } } #if !defined(WORK_AROUND_GCC) -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride); } } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4; v += 4; - Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op1, op0, oq0, oq1; - DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); - Store4x8x2(op1, op0, oq0, oq1, u, v, stride); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); + Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride); } } #endif // !WORK_AROUND_GCC @@ -992,8 +1004,9 @@ static const int16_t kC1 = 20091; static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. #if defined(WEBP_USE_INTRINSICS) -static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, - int16x8x2_t* const out) { +static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0, + const int16x8_t in1, + int16x8x2_t* const out) { // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 
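The constants kC1 = 20091 and kC2 = 17734 declared above are fixed-point encodings of the VP8 transform multipliers; the odd-looking values fall out of int16_t range limits. The arithmetic, spelled out as checks (a sketch of the reasoning; compare the K1/K2 comment in dec_sse2.c below):

#include <assert.h>
#include <math.h>

static void check_idct_constants(void) {
  /* K1 = sqrt(2)*cos(pi/8), K2 = sqrt(2)*sin(pi/8), scaled by 2^16: */
  assert((int)(sqrt(2.0) * cos(M_PI / 8) * 65536.0 + 0.5) == 85627);
  assert((int)(sqrt(2.0) * sin(M_PI / 8) * 65536.0 + 0.5) == 35468);
  /* 85627 overflows int16_t, so the 1.0*x part is re-added separately: */
  assert(85627 - 65536 == 20091);  /* kC1 */
  /* 35468 has its top bit set as an int16_t; it is stored halved, and the
   * doubling built into vqdmulh restores the factor ("half of kC2"): */
  assert(35468 / 2 == 17734);      /* kC2 */
}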
@@ -1001,7 +1014,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); } -static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { +static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { // {rows} = in0 | in4 // in8 | in12 // B1 = in4 | in12 @@ -1024,20 +1037,20 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); - Transpose8x2(E0, E1, rows); + Transpose8x2_NEON(E0, E1, rows); } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); - TransformPass(&rows); - TransformPass(&rows); - Add4x4(rows.val[0], rows.val[1], dst); + TransformPass_NEON(&rows); + TransformPass_NEON(&rows); + Add4x4_NEON(rows.val[0], rows.val[1], dst); } #else -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { const int kBPS = BPS; // kC1, kC2. Padded because vld1.16 loads 8 bytes const int16_t constants[4] = { kC1, kC2, 0, 0 }; @@ -1170,16 +1183,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { #endif // WEBP_USE_INTRINSICS -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { - TransformOne(in, dst); +static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne_NEON(in, dst); if (do_two) { - TransformOne(in + 16, dst + 4); + TransformOne_NEON(in + 16, dst + 4); } } -static void TransformDC(const int16_t* in, uint8_t* dst) { +static void TransformDC_NEON(const int16_t* in, uint8_t* dst) { const int16x8_t DC = vdupq_n_s16(in[0]); - Add4x4(DC, DC, dst); + Add4x4_NEON(DC, DC, dst); } //------------------------------------------------------------------------------ @@ -1191,7 +1204,7 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ } while (0) -static void TransformWHT(const int16_t* in, int16_t* out) { +static void TransformWHT_NEON(const int16_t* in, int16_t* out) { int32x4x4_t tmp; { @@ -1209,7 +1222,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) { tmp.val[2] = vsubq_s32(a0, a1); tmp.val[3] = vsubq_s32(a3, a2); // Arrange the temporary results column-wise. 
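TransformWHT above inverts the 4x4 Walsh-Hadamard transform that VP8 applies to the sixteen DC levels of a macroblock. Per column, the a0..a3 butterfly visible in the vector code looks like this in scalar form (a sketch mirroring the plain-C reference implementation):

#include <stdint.h>

static void wht_column(const int16_t in[16], int32_t tmp[16], int i) {
  const int32_t a0 = in[0 + i] + in[12 + i];
  const int32_t a1 = in[4 + i] + in[8 + i];
  const int32_t a2 = in[4 + i] - in[8 + i];
  const int32_t a3 = in[0 + i] - in[12 + i];
  tmp[0 + i]  = a0 + a1;
  tmp[8 + i]  = a0 - a1;
  tmp[4 + i]  = a3 + a2;
  tmp[12 + i] = a3 - a2;
}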
- tmp = Transpose4x4(tmp); + tmp = Transpose4x4_NEON(tmp); } { @@ -1243,7 +1256,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) { //------------------------------------------------------------------------------ #define MUL(a, b) (((a) * (b)) >> 16) -static void TransformAC3(const int16_t* in, uint8_t* dst) { +static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) { static const int kC1_full = 20091 + (1 << 16); static const int kC2_full = 35468; const int16x4_t A = vld1_dup_s16(in); @@ -1259,14 +1272,14 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const int16x4_t B = vqadd_s16(A, CD); const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); - Add4x4(m0_m1, m2_m3, dst); + Add4x4_NEON(m0_m1, m2_m3, dst); } #undef MUL //------------------------------------------------------------------------------ // 4x4 -static void DC4(uint8_t* dst) { // DC +static void DC4_NEON(uint8_t* dst) { // DC const uint8x8_t A = vld1_u8(dst - BPS); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); @@ -1287,17 +1300,17 @@ static void DC4(uint8_t* dst) { // DC } // TrueMotion (4x4 + 8x8) -static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { +static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) { const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]' const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1] int y; for (y = 0; y < size; y += 4) { // left edge - const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); - const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); - const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); - const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); + const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1)); + const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1)); + const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1)); + const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1)); const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1] const int16x8_t r1 = vaddq_s16(L1, d); const int16x8_t r2 = vaddq_s16(L2, d); @@ -1322,9 +1335,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { } } -static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } +static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); } -static void VE4(uint8_t* dst) { // vertical +static void VE4_NEON(uint8_t* dst) { // vertical // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS. const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row const uint64x1_t A1 = vshr_n_u64(A0, 8); @@ -1340,7 +1353,7 @@ static void VE4(uint8_t* dst) { // vertical } } -static void RD4(uint8_t* dst) { // Down-right +static void RD4_NEON(uint8_t* dst) { // Down-right const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1); const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); @@ -1368,7 +1381,7 @@ static void RD4(uint8_t* dst) { // Down-right vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); } -static void LD4(uint8_t* dst) { // Down-left +static void LD4_NEON(uint8_t* dst) { // Down-left // Note using the same shift trick as VE4() is slower here. 
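The "shift trick" mentioned above refers to VE4 materializing the left, center and right neighbors of the top row by byte-shifting a single 64-bit load; the smoothing itself is a rounded 3-tap average, which the NEON code reaches with halving-add intrinsics. Per output pixel (scalar sketch):

#include <stdint.h>

static uint8_t avg3(uint8_t a, uint8_t b, uint8_t c) {
  /* rounded (a + 2b + c) / 4, the classic VP8 3-tap smoothing */
  return (uint8_t)((a + 2 * b + c + 2) >> 2);
}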
const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0); const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1); @@ -1390,7 +1403,7 @@ static void LD4(uint8_t* dst) { // Down-left //------------------------------------------------------------------------------ // Chroma -static void VE8uv(uint8_t* dst) { // vertical +static void VE8uv_NEON(uint8_t* dst) { // vertical const uint8x8_t top = vld1_u8(dst - BPS); int j; for (j = 0; j < 8; ++j) { @@ -1398,7 +1411,7 @@ static void VE8uv(uint8_t* dst) { // vertical } } -static void HE8uv(uint8_t* dst) { // horizontal +static void HE8uv_NEON(uint8_t* dst) { // horizontal int j; for (j = 0; j < 8; ++j) { const uint8x8_t left = vld1_dup_u8(dst - 1); @@ -1407,7 +1420,7 @@ static void HE8uv(uint8_t* dst) { // horizontal } } -static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) { +static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -1458,17 +1471,17 @@ static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) { } } -static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); } -static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); } -static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); } -static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); } +static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); } +static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); } +static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); } +static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); } -static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } +static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); } //------------------------------------------------------------------------------ // 16x16 -static void VE16(uint8_t* dst) { // vertical +static void VE16_NEON(uint8_t* dst) { // vertical const uint8x16_t top = vld1q_u8(dst - BPS); int j; for (j = 0; j < 16; ++j) { @@ -1476,7 +1489,7 @@ static void VE16(uint8_t* dst) { // vertical } } -static void HE16(uint8_t* dst) { // horizontal +static void HE16_NEON(uint8_t* dst) { // horizontal int j; for (j = 0; j < 16; ++j) { const uint8x16_t left = vld1q_dup_u8(dst - 1); @@ -1485,7 +1498,7 @@ static void HE16(uint8_t* dst) { // horizontal } } -static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) { +static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -1542,12 +1555,12 @@ static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) { } } -static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); } -static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); } -static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); } -static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); } +static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); } +static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); } +static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); } +static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); } -static void TM16(uint8_t* dst) { +static void TM16_NEON(uint8_t* dst) { const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]' // A[c] - A[-1] @@ -1556,10 +1569,10 @@ static void TM16(uint8_t* dst) { int y; for (y = 0; y < 16; y += 4) { // left edge - const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); - const int16x8_t L1 = 
ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); - const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); - const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); + const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1)); + const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1)); + const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1)); + const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1)); const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1] const int16x8_t r1_lo = vaddq_s16(L1, d_lo); const int16x8_t r2_lo = vaddq_s16(L2, d_lo); @@ -1587,49 +1600,49 @@ static void TM16(uint8_t* dst) { extern void VP8DspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) { - VP8Transform = TransformTwo; - VP8TransformAC3 = TransformAC3; - VP8TransformDC = TransformDC; - VP8TransformWHT = TransformWHT; - - VP8VFilter16 = VFilter16; - VP8VFilter16i = VFilter16i; - VP8HFilter16 = HFilter16; + VP8Transform = TransformTwo_NEON; + VP8TransformAC3 = TransformAC3_NEON; + VP8TransformDC = TransformDC_NEON; + VP8TransformWHT = TransformWHT_NEON; + + VP8VFilter16 = VFilter16_NEON; + VP8VFilter16i = VFilter16i_NEON; + VP8HFilter16 = HFilter16_NEON; #if !defined(WORK_AROUND_GCC) - VP8HFilter16i = HFilter16i; + VP8HFilter16i = HFilter16i_NEON; #endif - VP8VFilter8 = VFilter8; - VP8VFilter8i = VFilter8i; + VP8VFilter8 = VFilter8_NEON; + VP8VFilter8i = VFilter8i_NEON; #if !defined(WORK_AROUND_GCC) - VP8HFilter8 = HFilter8; - VP8HFilter8i = HFilter8i; + VP8HFilter8 = HFilter8_NEON; + VP8HFilter8i = HFilter8i_NEON; #endif - VP8SimpleVFilter16 = SimpleVFilter16; - VP8SimpleHFilter16 = SimpleHFilter16; - VP8SimpleVFilter16i = SimpleVFilter16i; - VP8SimpleHFilter16i = SimpleHFilter16i; - - VP8PredLuma4[0] = DC4; - VP8PredLuma4[1] = TM4; - VP8PredLuma4[2] = VE4; - VP8PredLuma4[4] = RD4; - VP8PredLuma4[6] = LD4; - - VP8PredLuma16[0] = DC16TopLeft; - VP8PredLuma16[1] = TM16; - VP8PredLuma16[2] = VE16; - VP8PredLuma16[3] = HE16; - VP8PredLuma16[4] = DC16NoTop; - VP8PredLuma16[5] = DC16NoLeft; - VP8PredLuma16[6] = DC16NoTopLeft; - - VP8PredChroma8[0] = DC8uv; - VP8PredChroma8[1] = TM8uv; - VP8PredChroma8[2] = VE8uv; - VP8PredChroma8[3] = HE8uv; - VP8PredChroma8[4] = DC8uvNoTop; - VP8PredChroma8[5] = DC8uvNoLeft; - VP8PredChroma8[6] = DC8uvNoTopLeft; + VP8SimpleVFilter16 = SimpleVFilter16_NEON; + VP8SimpleHFilter16 = SimpleHFilter16_NEON; + VP8SimpleVFilter16i = SimpleVFilter16i_NEON; + VP8SimpleHFilter16i = SimpleHFilter16i_NEON; + + VP8PredLuma4[0] = DC4_NEON; + VP8PredLuma4[1] = TM4_NEON; + VP8PredLuma4[2] = VE4_NEON; + VP8PredLuma4[4] = RD4_NEON; + VP8PredLuma4[6] = LD4_NEON; + + VP8PredLuma16[0] = DC16TopLeft_NEON; + VP8PredLuma16[1] = TM16_NEON; + VP8PredLuma16[2] = VE16_NEON; + VP8PredLuma16[3] = HE16_NEON; + VP8PredLuma16[4] = DC16NoTop_NEON; + VP8PredLuma16[5] = DC16NoLeft_NEON; + VP8PredLuma16[6] = DC16NoTopLeft_NEON; + + VP8PredChroma8[0] = DC8uv_NEON; + VP8PredChroma8[1] = TM8uv_NEON; + VP8PredChroma8[2] = VE8uv_NEON; + VP8PredChroma8[3] = HE8uv_NEON; + VP8PredChroma8[4] = DC8uvNoTop_NEON; + VP8PredChroma8[5] = DC8uvNoLeft_NEON; + VP8PredChroma8[6] = DC8uvNoTopLeft_NEON; } #else // !WEBP_USE_NEON diff --git a/media/libwebp/dsp/dec_sse2.c b/media/libwebp/dsp/dec_sse2.c index 411fb0276..f187a5bb4 100644 --- a/media/libwebp/dsp/dec_sse2.c +++ b/media/libwebp/dsp/dec_sse2.c @@ -12,23 +12,25 @@ // Author: somnath@google.com (Somnath Banerjee) // cduvivier@google.com (Christian Duvivier) -#include 
"./dsp.h" +#include "../dsp/dsp.h" #if defined(WEBP_USE_SSE2) // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C // one it seems => disable it by default. Uncomment the following to enable: -// #define USE_TRANSFORM_AC3 +#if !defined(USE_TRANSFORM_AC3) +#define USE_TRANSFORM_AC3 0 // ALTERNATE_CODE +#endif #include <emmintrin.h> -#include "./common_sse2.h" +#include "../dsp/common_sse2.h" #include "../dec/vp8i_dec.h" #include "../utils/utils.h" //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) -static void Transform(const int16_t* in, uint8_t* dst, int do_two) { +static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { } } -#if defined(USE_TRANSFORM_AC3) +#if (USE_TRANSFORM_AC3 == 1) #define MUL(a, b) (((a) * (b)) >> 16) static void TransformAC3(const int16_t* in, uint8_t* dst) { static const int kC1 = 20091 + (1 << 16); @@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { _mm_subs_epu8((p), (q))) // Shift each byte of "x" by 3 bits while preserving by the sign bit. -static WEBP_INLINE void SignedShift8b(__m128i* const x) { +static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) { const __m128i zero = _mm_setzero_si128(); const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x); const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x); @@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) { } #define FLIP_SIGN_BIT2(a, b) { \ - a = _mm_xor_si128(a, sign_bit); \ - b = _mm_xor_si128(b, sign_bit); \ + (a) = _mm_xor_si128(a, sign_bit); \ + (b) = _mm_xor_si128(b, sign_bit); \ } #define FLIP_SIGN_BIT4(a, b, c, d) { \ @@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) { } // input/output is uint8_t -static WEBP_INLINE void GetNotHEV(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - int hev_thresh, __m128i* const not_hev) { +static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + int hev_thresh, __m128i* const not_hev) { const __m128i zero = _mm_setzero_si128(); const __m128i t_1 = MM_ABS(*p1, *p0); const __m128i t_2 = MM_ABS(*q1, *q0); @@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1, } // input pixels are int8_t -static WEBP_INLINE void GetBaseDelta(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - __m128i* const delta) { +static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + __m128i* const delta) { // beware of addition order, for saturation! 
const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1 const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0 @@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1, } // input and output are int8_t -static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0, - const __m128i* const fl) { +static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0, + __m128i* const q0, + const __m128i* const fl) { const __m128i k3 = _mm_set1_epi8(3); const __m128i k4 = _mm_set1_epi8(4); __m128i v3 = _mm_adds_epi8(*fl, k3); __m128i v4 = _mm_adds_epi8(*fl, k4); - SignedShift8b(&v4); // v4 >> 3 - SignedShift8b(&v3); // v3 >> 3 + SignedShift8b_SSE2(&v4); // v4 >> 3 + SignedShift8b_SSE2(&v3); // v3 >> 3 *q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4 *p0 = _mm_adds_epi8(*p0, v3); // p0 += v3 } @@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0, // Update operations: // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). -static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi, - const __m128i* const a0_lo, - const __m128i* const a0_hi) { +static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi, + const __m128i* const a0_lo, + const __m128i* const a0_hi) { const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7); const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7); const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi); @@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi, } // input pixels are uint8_t -static WEBP_INLINE void NeedsFilter(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - int thresh, __m128i* const mask) { +static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + int thresh, __m128i* const mask) { const __m128i m_thresh = _mm_set1_epi8(thresh); const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) const __m128i kFE = _mm_set1_epi8(0xFE); @@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1, // Edge filtering functions // Applies filter on 2 pixels (p0 and q0) -static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0, - __m128i* const q0, __m128i* const q1, - int thresh) { +static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0, + __m128i* const q0, __m128i* const q1, + int thresh) { __m128i a, mask; const __m128i sign_bit = _mm_set1_epi8(0x80); - // convert p1/q1 to int8_t (for GetBaseDelta) + // convert p1/q1 to int8_t (for GetBaseDelta_SSE2) const __m128i p1s = _mm_xor_si128(*p1, sign_bit); const __m128i q1s = _mm_xor_si128(*q1, sign_bit); - NeedsFilter(p1, p0, q0, q1, thresh, &mask); + NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask); FLIP_SIGN_BIT2(*p0, *q0); - GetBaseDelta(&p1s, p0, q0, &q1s, &a); + GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a); a = _mm_and_si128(a, mask); // mask filter values we don't care about - DoSimpleFilter(p0, q0, &a); + DoSimpleFilter_SSE2(p0, q0, &a); FLIP_SIGN_BIT2(*p0, *q0); } // Applies filter on 4 pixels (p1, p0, q0 and q1) -static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, - __m128i* const q0, __m128i* const q1, - const __m128i* const mask, int hev_thresh) { +static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0, + __m128i* const q0, __m128i* const q1, + 
const __m128i* const mask, + int hev_thresh) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bit = _mm_set1_epi8(0x80); const __m128i k64 = _mm_set1_epi8(64); @@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, __m128i t1, t2, t3; // compute hev mask - GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev); + GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, ¬_hev); // convert to signed values FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); @@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3 t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4 - SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 - SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 + SignedShift8b_SSE2(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 + SignedShift8b_SSE2(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 *p0 = _mm_adds_epi8(*p0, t2); // p0 += t2 *q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3 FLIP_SIGN_BIT2(*p0, *q0); @@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, } // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) -static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, - __m128i* const p0, __m128i* const q0, - __m128i* const q1, __m128i* const q2, - const __m128i* const mask, int hev_thresh) { +static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1, + __m128i* const p0, __m128i* const q0, + __m128i* const q1, __m128i* const q2, + const __m128i* const mask, + int hev_thresh) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bit = _mm_set1_epi8(0x80); __m128i a, not_hev; // compute hev mask - GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev); + GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, ¬_hev); FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); - GetBaseDelta(p1, p0, q0, q1, &a); + GetBaseDelta_SSE2(p1, p0, q0, q1, &a); { // do simple filter on pixels with hev const __m128i m = _mm_andnot_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); - DoSimpleFilter(p0, q0, &f); + DoSimpleFilter_SSE2(p0, q0, &f); } { // do strong filter on pixels with not hev @@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63 const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63 - Update2Pixels(p2, q2, &a2_lo, &a2_hi); - Update2Pixels(p1, q1, &a1_lo, &a1_hi); - Update2Pixels(p0, q0, &a0_lo, &a0_hi); + Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi); + Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi); + Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi); } } // reads 8 rows across a vertical edge. 
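That comment introduces the transposed loads: the horizontal filters work on pixel columns, so the code gathers four bytes from each of eight rows straddling the vertical edge and regroups them so that one register ends up holding a single column across all rows. The gather, modeled in scalar C (illustrative only):

#include <stdint.h>

static void load8x4_scalar(const uint8_t* b, int stride, uint8_t col[4][8]) {
  int row, x;
  for (row = 0; row < 8; ++row) {
    for (x = 0; x < 4; ++x) {
      col[x][row] = b[row * stride + x];  /* col[x] = one pixel column */
    }
  }
}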
-static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride, - __m128i* const p, __m128i* const q) { +static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride, + __m128i* const p, __m128i* const q) { // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 const __m128i A0 = _mm_set_epi32( @@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride, *q = _mm_unpackhi_epi32(C0, C1); } -static WEBP_INLINE void Load16x4(const uint8_t* const r0, - const uint8_t* const r8, - int stride, - __m128i* const p1, __m128i* const p0, - __m128i* const q0, __m128i* const q1) { +static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0, + const uint8_t* const r8, + int stride, + __m128i* const p1, __m128i* const p0, + __m128i* const q0, __m128i* const q1) { // Assume the pixels around the edge (|) are numbered as follows // 00 01 | 02 03 // 10 11 | 12 13 @@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0, // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - Load8x4(r0, stride, p1, q0); - Load8x4(r8, stride, p0, q1); + Load8x4_SSE2(r0, stride, p1, q0); + Load8x4_SSE2(r8, stride, p0, q1); { // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 @@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0, } } -static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) { +static WEBP_INLINE void Store4x4_SSE2(__m128i* const x, + uint8_t* dst, int stride) { int i; for (i = 0; i < 4; ++i, dst += stride) { WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x)); @@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) { } // Transpose back and store -static WEBP_INLINE void Store16x4(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - uint8_t* r0, uint8_t* r8, - int stride) { +static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + uint8_t* r0, uint8_t* r8, + int stride) { __m128i t1, p1_s, p0_s, q0_s, q1_s; // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 @@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1, p1_s = _mm_unpacklo_epi16(t1, q1_s); q1_s = _mm_unpackhi_epi16(t1, q1_s); - Store4x4(&p0_s, r0, stride); + Store4x4_SSE2(&p0_s, r0, stride); r0 += 4 * stride; - Store4x4(&q0_s, r0, stride); + Store4x4_SSE2(&q0_s, r0, stride); - Store4x4(&p1_s, r8, stride); + Store4x4_SSE2(&p1_s, r8, stride); r8 += 4 * stride; - Store4x4(&q1_s, r8, stride); + Store4x4_SSE2(&q1_s, r8, stride); } //------------------------------------------------------------------------------ // Simple In-loop filtering (Paragraph 15.2) -static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) { // Load __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]); __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]); __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]); __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]); - DoFilter2(&p1, &p0, &q0, &q1, thresh); + DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh); // Store _mm_storeu_si128((__m128i*)&p[-stride], p0); _mm_storeu_si128((__m128i*)&p[0], q0); } -static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { +static void 
SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) { __m128i p1, p0, q0, q1; p -= 2; // beginning of p1 - Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1); - DoFilter2(&p1, &p0, &q0, &q1, thresh); - Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride); + Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1); + DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh); + Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride); } -static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) { int k; for (k = 3; k > 0; --k) { p += 4 * stride; - SimpleVFilter16(p, stride, thresh); + SimpleVFilter16_SSE2(p, stride, thresh); } } -static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) { int k; for (k = 3; k > 0; --k) { p += 4; - SimpleHFilter16(p, stride, thresh); + SimpleHFilter16_SSE2(p, stride, thresh); } } @@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { // Complex In-loop filtering (Paragraph 15.3) #define MAX_DIFF1(p3, p2, p1, p0, m) do { \ - m = MM_ABS(p1, p0); \ - m = _mm_max_epu8(m, MM_ABS(p3, p2)); \ - m = _mm_max_epu8(m, MM_ABS(p2, p1)); \ + (m) = MM_ABS(p1, p0); \ + (m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \ + (m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \ } while (0) #define MAX_DIFF2(p3, p2, p1, p0, m) do { \ - m = _mm_max_epu8(m, MM_ABS(p1, p0)); \ - m = _mm_max_epu8(m, MM_ABS(p3, p2)); \ - m = _mm_max_epu8(m, MM_ABS(p2, p1)); \ + (m) = _mm_max_epu8(m, MM_ABS(p1, p0)); \ + (m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \ + (m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \ } while (0) #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \ - e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \ - e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \ - e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \ - e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \ + (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]); \ + (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]); \ + (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]); \ + (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]); \ } #define LOADUV_H_EDGE(p, u, v, stride) do { \ const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \ const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \ - p = _mm_unpacklo_epi64(U, V); \ + (p) = _mm_unpacklo_epi64(U, V); \ } while (0) #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \ - LOADUV_H_EDGE(e1, u, v, 0 * stride); \ - LOADUV_H_EDGE(e2, u, v, 1 * stride); \ - LOADUV_H_EDGE(e3, u, v, 2 * stride); \ - LOADUV_H_EDGE(e4, u, v, 3 * stride); \ + LOADUV_H_EDGE(e1, u, v, 0 * (stride)); \ + LOADUV_H_EDGE(e2, u, v, 1 * (stride)); \ + LOADUV_H_EDGE(e3, u, v, 2 * (stride)); \ + LOADUV_H_EDGE(e4, u, v, 3 * (stride)); \ } #define STOREUV(p, u, v, stride) { \ - _mm_storel_epi64((__m128i*)&u[(stride)], p); \ - p = _mm_srli_si128(p, 8); \ - _mm_storel_epi64((__m128i*)&v[(stride)], p); \ + _mm_storel_epi64((__m128i*)&(u)[(stride)], p); \ + (p) = _mm_srli_si128(p, 8); \ + _mm_storel_epi64((__m128i*)&(v)[(stride)], p); \ } -static WEBP_INLINE void ComplexMask(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - int thresh, int ithresh, - __m128i* const mask) { +static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + int thresh, int ithresh, + __m128i* const mask) 
{ const __m128i it = _mm_set1_epi8(ithresh); const __m128i diff = _mm_subs_epu8(*mask, it); const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128()); __m128i filter_mask; - NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask); + NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask); *mask = _mm_and_si128(thresh_mask, filter_mask); } // on macroblock edges -static void VFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i t1; __m128i mask; __m128i p2, p1, p0, q0, q1, q2; @@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride, LOAD_H_EDGES4(p, stride, q0, q1, q2, t1); MAX_DIFF2(t1, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); // Store _mm_storeu_si128((__m128i*)&p[-3 * stride], p2); @@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride, _mm_storeu_si128((__m128i*)&p[+2 * stride], q2); } -static void HFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i p3, p2, p1, p0, q0, q1, q2, q3; uint8_t* const b = p - 4; - Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0 + Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); MAX_DIFF1(p3, p2, p1, p0, mask); - Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3 + Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); MAX_DIFF2(q3, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); - Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride); - Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride); + Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride); + Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride); } // on three inner edges -static void VFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16i_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { int k; __m128i p3, p2, p1, p0; // loop invariants @@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride, // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. 
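ComplexMask_SSE2 evaluates, for 16 pixel columns at once, the same two-part test the scalar decoder performs per column: the inner-edge steps must stay within ithresh, and the edge itself must pass the simple-filter threshold. A scalar model follows; the 4/1 weighting and the 2*thresh+1 bound are taken from libwebp's plain-C decoder, which this hunk does not show, so treat those constants as an assumption:

#include <stdlib.h>

static int needs_filter(int p1, int p0, int q0, int q1, int thresh) {
  return 4 * abs(p0 - q0) + abs(p1 - q1) <= 2 * thresh + 1;
}

static int complex_mask(int max_inner_diff,  /* from MAX_DIFF1/MAX_DIFF2 */
                        int p1, int p0, int q0, int q1,
                        int thresh, int ithresh) {
  /* thresh_mask: every inner-edge step must stay within ithresh. */
  return (max_inner_diff <= ithresh) && needs_filter(p1, p0, q0, q1, thresh);
}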
- ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh); // Store _mm_storeu_si128((__m128i*)&b[0 * stride], p1); @@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride, } } -static void HFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16i_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { int k; __m128i p3, p2, p1, p0; // loop invariants - Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue + Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue for (k = 3; k > 0; --k) { __m128i mask, tmp1, tmp2; @@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride, p += 4; // beginning of q0 (and next span) MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask - Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2); + Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2); MAX_DIFF2(p3, p2, tmp1, tmp2, mask); - ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh); - Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride); + Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride); // rotate samples p1 = tmp1; @@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, p2, p1, p0, q0, q1, q2; @@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride, LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1); MAX_DIFF2(t1, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); // Store STOREUV(p2, u, v, -3 * stride); @@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride, STOREUV(q2, u, v, 2 * stride); } -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i p3, p2, p1, p0, q0, q1, q2, q3; uint8_t* const tu = u - 4; uint8_t* const tv = v - 4; - Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0 + Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0); MAX_DIFF1(p3, p2, p1, p0, mask); - Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3 + Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3); MAX_DIFF2(q3, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); - Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride); - Store16x4(&q0, &q1, &q2, &q3, u, v, stride); + Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride); + Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride); } -static void 
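All the HFilter* variants above funnel through Load16x4_SSE2/Store16x4_SSE2 for one reason: transposing the block turns a vertical edge into a horizontal one, so a single filtering kernel serves both orientations. The idea in scalar form (a hypothetical 4x4 helper, not the SSE2 unpack sequence):

static void transpose4x4(const unsigned char src[4][4],
                         unsigned char dst[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dst[c][r] = src[r][c];   /* columns become rows */
}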
VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; @@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride, LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2); MAX_DIFF2(t2, t1, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh); // Store STOREUV(p1, u, v, -2 * stride); @@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride, STOREUV(q1, u, v, 1 * stride); } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; - Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0 + Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0 MAX_DIFF1(t2, t1, p1, p0, mask); u += 4; // beginning of q0 v += 4; - Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3 + Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3 MAX_DIFF2(t2, t1, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh); u -= 2; // beginning of p1 v -= 2; - Store16x4(&p1, &p0, &q0, &q1, u, v, stride); + Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride); } //------------------------------------------------------------------------------ @@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, // where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1 // and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1 -static void VE4(uint8_t* dst) { // vertical +static void VE4_SSE2(uint8_t* dst) { // vertical const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1)); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) { // vertical } } -static void LD4(uint8_t* dst) { // Down-Left +static void LD4_SSE2(uint8_t* dst) { // Down-Left const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS)); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) { // Down-Left WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static void VR4(uint8_t* dst) { // Vertical-Right +static void VR4_SSE2(uint8_t* dst) { // Vertical-Right const __m128i one = _mm_set1_epi8(1); const int I = dst[-1 + 0 * BPS]; const int J = dst[-1 + 1 * BPS]; @@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right DST(0, 3) = AVG3(K, J, I); } -static void VL4(uint8_t* dst) { // Vertical-Left +static void VL4_SSE2(uint8_t* dst) { // Vertical-Left const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS)); const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); @@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left DST(3, 3) = (extra_out >> 8) & 0xff; } -static void RD4(uint8_t* dst) { // Down-right +static void RD4_SSE2(uint8_t* dst) { // Down-right const 
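The chroma paths above lean on another data-layout trick: LOADUV_H_EDGE glues an 8-byte U row and an 8-byte V row into one 16-byte register, so the 16-wide luma kernels filter both chroma planes in a single pass, and STOREUV splits the result back. A scalar model of that packing (sketch only):

#include <stdint.h>
#include <string.h>

static void load_uv(const uint8_t* u, const uint8_t* v, uint8_t p[16]) {
  memcpy(p + 0, u, 8);   /* low half  <- U row (the unpacklo_epi64) */
  memcpy(p + 8, v, 8);   /* high half <- V row */
}

static void store_uv(const uint8_t p[16], uint8_t* u, uint8_t* v) {
  memcpy(u, p + 0, 8);   /* storel_epi64 */
  memcpy(v, p + 8, 8);   /* srli_si128 by 8, then storel_epi64 */
}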
__m128i one = _mm_set1_epi8(1); const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1)); const __m128i ____XABCD = _mm_slli_si128(XABCD, 4); @@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) { // Down-right //------------------------------------------------------------------------------ // Luma 16x16 -static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { +static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) { const uint8_t* top = dst - BPS; const __m128i zero = _mm_setzero_si128(); int y; @@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { } } -static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } -static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } -static void TM16(uint8_t* dst) { TrueMotion(dst, 16); } +static void TM4_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 4); } +static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); } +static void TM16_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 16); } -static void VE16(uint8_t* dst) { +static void VE16_SSE2(uint8_t* dst) { const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); int j; for (j = 0; j < 16; ++j) { @@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) { } } -static void HE16(uint8_t* dst) { // horizontal +static void HE16_SSE2(uint8_t* dst) { // horizontal int j; for (j = 16; j > 0; --j) { const __m128i values = _mm_set1_epi8(dst[-1]); @@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) { // horizontal } } -static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) { +static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; const __m128i values = _mm_set1_epi8(v); for (j = 0; j < 16; ++j) { @@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) { } } -static void DC16(uint8_t* dst) { // DC +static void DC16_SSE2(uint8_t* dst) { // DC const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); const __m128i sad8x2 = _mm_sad_epu8(top, zero); @@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) { // DC } { const int DC = _mm_cvtsi128_si32(sum) + left + 16; - Put16(DC >> 5, dst); + Put16_SSE2(DC >> 5, dst); } } -static void DC16NoTop(uint8_t* dst) { // DC with top samples not available +static void DC16NoTop_SSE2(uint8_t* dst) { // DC with top samples unavailable int DC = 8; int j; for (j = 0; j < 16; ++j) { DC += dst[-1 + j * BPS]; } - Put16(DC >> 4, dst); + Put16_SSE2(DC >> 4, dst); } -static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available +static void DC16NoLeft_SSE2(uint8_t* dst) { // DC with left samples unavailable const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); const __m128i sad8x2 = _mm_sad_epu8(top, zero); // sum the two sads: sad8x2[0:1] + sad8x2[8:9] const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); const int DC = _mm_cvtsi128_si32(sum) + 8; - Put16(DC >> 4, dst); + Put16_SSE2(DC >> 4, dst); } -static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples - Put16(0x80, dst); +static void DC16NoTopLeft_SSE2(uint8_t* dst) { // DC with no top & left samples + Put16_SSE2(0x80, dst); } //------------------------------------------------------------------------------ // Chroma -static void VE8uv(uint8_t* dst) { // vertical +static void VE8uv_SSE2(uint8_t* dst) { // vertical int j; const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); for (j = 0; j < 8; ++j) { @@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) { // vertical } } -static void 
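DC16_SSE2 above uses _mm_sad_epu8 against zero purely as a fast horizontal byte sum; the predictor itself is just the rounded mean of the 16 top and 16 left neighbours. Scalar restatement (bps stands for the BPS row stride):

#include <stdint.h>

static void dc16_scalar(uint8_t* dst, int bps) {
  int sum = 16;   /* rounding term: 32 samples -> +16 before >> 5 */
  int j;
  for (j = 0; j < 16; ++j) sum += dst[j - bps];       /* top row */
  for (j = 0; j < 16; ++j) sum += dst[-1 + j * bps];  /* left column */
  {
    const uint8_t dc = (uint8_t)(sum >> 5);
    int y, x;
    for (y = 0; y < 16; ++y)
      for (x = 0; x < 16; ++x) dst[x + y * bps] = dc;
  }
}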
HE8uv(uint8_t* dst) { // horizontal - int j; - for (j = 0; j < 8; ++j) { - const __m128i values = _mm_set1_epi8(dst[-1]); - _mm_storel_epi64((__m128i*)dst, values); - dst += BPS; - } -} - // helper for chroma-DC predictions -static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { +static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; const __m128i values = _mm_set1_epi8(v); for (j = 0; j < 8; ++j) { @@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { } } -static void DC8uv(uint8_t* dst) { // DC +static void DC8uv_SSE2(uint8_t* dst) { // DC const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); const __m128i sum = _mm_sad_epu8(top, zero); @@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) { // DC } { const int DC = _mm_cvtsi128_si32(sum) + left + 8; - Put8x8uv(DC >> 4, dst); + Put8x8uv_SSE2(DC >> 4, dst); } } -static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples +static void DC8uvNoLeft_SSE2(uint8_t* dst) { // DC with no left samples const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); const __m128i sum = _mm_sad_epu8(top, zero); const int DC = _mm_cvtsi128_si32(sum) + 4; - Put8x8uv(DC >> 3, dst); + Put8x8uv_SSE2(DC >> 3, dst); } -static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples +static void DC8uvNoTop_SSE2(uint8_t* dst) { // DC with no top samples int dc0 = 4; int i; for (i = 0; i < 8; ++i) { dc0 += dst[-1 + i * BPS]; } - Put8x8uv(dc0 >> 3, dst); + Put8x8uv_SSE2(dc0 >> 3, dst); } -static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing - Put8x8uv(0x80, dst); +static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { // DC with nothing + Put8x8uv_SSE2(0x80, dst); } //------------------------------------------------------------------------------ @@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing extern void VP8DspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) { - VP8Transform = Transform; -#if defined(USE_TRANSFORM_AC3) - VP8TransformAC3 = TransformAC3; + VP8Transform = Transform_SSE2; +#if (USE_TRANSFORM_AC3 == 1) + VP8TransformAC3 = TransformAC3_SSE2; #endif - VP8VFilter16 = VFilter16; - VP8HFilter16 = HFilter16; - VP8VFilter8 = VFilter8; - VP8HFilter8 = HFilter8; - VP8VFilter16i = VFilter16i; - VP8HFilter16i = HFilter16i; - VP8VFilter8i = VFilter8i; - VP8HFilter8i = HFilter8i; - - VP8SimpleVFilter16 = SimpleVFilter16; - VP8SimpleHFilter16 = SimpleHFilter16; - VP8SimpleVFilter16i = SimpleVFilter16i; - VP8SimpleHFilter16i = SimpleHFilter16i; - - VP8PredLuma4[1] = TM4; - VP8PredLuma4[2] = VE4; - VP8PredLuma4[4] = RD4; - VP8PredLuma4[5] = VR4; - VP8PredLuma4[6] = LD4; - VP8PredLuma4[7] = VL4; - - VP8PredLuma16[0] = DC16; - VP8PredLuma16[1] = TM16; - VP8PredLuma16[2] = VE16; - VP8PredLuma16[3] = HE16; - VP8PredLuma16[4] = DC16NoTop; - VP8PredLuma16[5] = DC16NoLeft; - VP8PredLuma16[6] = DC16NoTopLeft; - - VP8PredChroma8[0] = DC8uv; - VP8PredChroma8[1] = TM8uv; - VP8PredChroma8[2] = VE8uv; - VP8PredChroma8[3] = HE8uv; - VP8PredChroma8[4] = DC8uvNoTop; - VP8PredChroma8[5] = DC8uvNoLeft; - VP8PredChroma8[6] = DC8uvNoTopLeft; + VP8VFilter16 = VFilter16_SSE2; + VP8HFilter16 = HFilter16_SSE2; + VP8VFilter8 = VFilter8_SSE2; + VP8HFilter8 = HFilter8_SSE2; + VP8VFilter16i = VFilter16i_SSE2; + VP8HFilter16i = HFilter16i_SSE2; + VP8VFilter8i = VFilter8i_SSE2; + VP8HFilter8i = HFilter8i_SSE2; + + VP8SimpleVFilter16 = SimpleVFilter16_SSE2; + VP8SimpleHFilter16 = 
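All the DC variants above follow one rounding rule: with n neighbour samples available (n a power of two), DC = (sum + n/2) >> log2(n). That is why DC8uvNoLeft_SSE2 adds 4 and shifts by 3 (8 samples) while DC16_SSE2 adds 16 and shifts by 5 (32 samples). A quick standalone check:

#include <assert.h>

static unsigned rounded_mean_pow2(unsigned sum, unsigned log2n) {
  return (sum + (1u << (log2n - 1))) >> log2n;
}

int main(void) {
  assert(rounded_mean_pow2(8 * 100, 3) == 100);   /* DC8uvNoLeft case */
  assert(rounded_mean_pow2(32 * 10 + 16, 5) == 10);  /* 10.5 rounds down */
  return 0;
}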
SimpleHFilter16_SSE2; + VP8SimpleVFilter16i = SimpleVFilter16i_SSE2; + VP8SimpleHFilter16i = SimpleHFilter16i_SSE2; + + VP8PredLuma4[1] = TM4_SSE2; + VP8PredLuma4[2] = VE4_SSE2; + VP8PredLuma4[4] = RD4_SSE2; + VP8PredLuma4[5] = VR4_SSE2; + VP8PredLuma4[6] = LD4_SSE2; + VP8PredLuma4[7] = VL4_SSE2; + + VP8PredLuma16[0] = DC16_SSE2; + VP8PredLuma16[1] = TM16_SSE2; + VP8PredLuma16[2] = VE16_SSE2; + VP8PredLuma16[3] = HE16_SSE2; + VP8PredLuma16[4] = DC16NoTop_SSE2; + VP8PredLuma16[5] = DC16NoLeft_SSE2; + VP8PredLuma16[6] = DC16NoTopLeft_SSE2; + + VP8PredChroma8[0] = DC8uv_SSE2; + VP8PredChroma8[1] = TM8uv_SSE2; + VP8PredChroma8[2] = VE8uv_SSE2; + VP8PredChroma8[4] = DC8uvNoTop_SSE2; + VP8PredChroma8[5] = DC8uvNoLeft_SSE2; + VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/media/libwebp/dsp/dec_sse41.c b/media/libwebp/dsp/dec_sse41.c index 4e81ec4d8..02deae956 100644 --- a/media/libwebp/dsp/dec_sse41.c +++ b/media/libwebp/dsp/dec_sse41.c @@ -11,7 +11,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "../dsp/dsp.h" #if defined(WEBP_USE_SSE41) @@ -19,7 +19,7 @@ #include "../dec/vp8i_dec.h" #include "../utils/utils.h" -static void HE16(uint8_t* dst) { // horizontal +static void HE16_SSE41(uint8_t* dst) { // horizontal int j; const __m128i kShuffle3 = _mm_set1_epi8(3); for (j = 16; j > 0; --j) { @@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) { // horizontal extern void VP8DspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) { - VP8PredLuma16[3] = HE16; + VP8PredLuma16[3] = HE16_SSE41; } #else // !WEBP_USE_SSE41 diff --git a/media/libwebp/dsp/dsp.h b/media/libwebp/dsp/dsp.h index 813fed4a3..537ea2044 100644 --- a/media/libwebp/dsp/dsp.h +++ b/media/libwebp/dsp/dsp.h @@ -38,10 +38,22 @@ extern "C" { # define LOCAL_GCC_PREREQ(maj, min) 0 #endif +#if defined(__clang__) +# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__) +# define LOCAL_CLANG_PREREQ(maj, min) \ + (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min))) +#else +# define LOCAL_CLANG_VERSION 0 +# define LOCAL_CLANG_PREREQ(maj, min) 0 +#endif + #ifndef __has_builtin # define __has_builtin(x) 0 #endif +// for now, none of the optimizations below are available in emscripten +#if !defined(EMSCRIPTEN) + #if defined(_MSC_VER) && _MSC_VER > 1310 && \ (defined(_M_X64) || defined(_M_IX86)) #define WEBP_MSC_SSE2 // Visual C++ SSE2 targets @@ -68,18 +80,20 @@ extern "C" { #define WEBP_USE_AVX2 #endif -#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) -#define WEBP_ANDROID_NEON // Android targets that might support NEON -#endif - // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. 
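VP8DspInitSSE2 overwrites entries in global function-pointer tables (VP8PredLuma4/16, VP8PredChroma8, the VP8*Filter* hooks) that the generic init filled with C fallbacks; per-block dispatch is then a single indexed indirect call. The pattern, reduced to a sketch with illustrative names rather than the real tables:

#include <stdint.h>

typedef void (*PredFunc)(uint8_t* dst);

static void dc_pred(uint8_t* dst) { (void)dst; /* ...C fallback... */ }
static void tm_pred(uint8_t* dst) { (void)dst; /* ...C fallback... */ }

static PredFunc pred_table[2];

static void init_preds(void) {
  pred_table[0] = dc_pred;   /* later overwritten by an SSE2/NEON variant */
  pred_table[1] = tm_pred;
}

static void predict_block(int mode, uint8_t* dst) {
  pred_table[mode](dst);     /* one indirect call per predicted block */
}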
-#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \ +#if (defined(__ARM_NEON__) || \ defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \ !defined(__native_client__) #define WEBP_USE_NEON #endif +#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \ + defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H) +#define WEBP_ANDROID_NEON // Android targets that may have NEON +#define WEBP_USE_NEON +#endif + #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM) #define WEBP_USE_NEON #define WEBP_USE_INTRINSICS @@ -90,7 +104,7 @@ extern "C" { #define WEBP_USE_MIPS32 #if (__mips_isa_rev >= 2) #define WEBP_USE_MIPS32_R2 -#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2) +#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2) #define WEBP_USE_MIPS_DSP_R2 #endif #endif @@ -100,6 +114,24 @@ extern "C" { #define WEBP_USE_MSA #endif +#endif /* EMSCRIPTEN */ + +#ifndef WEBP_DSP_OMIT_C_CODE +#define WEBP_DSP_OMIT_C_CODE 1 +#endif + +#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE +#define WEBP_NEON_OMIT_C_CODE 1 +#else +#define WEBP_NEON_OMIT_C_CODE 0 +#endif + +#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) +#define WEBP_NEON_WORK_AROUND_GCC 1 +#else +#define WEBP_NEON_WORK_AROUND_GCC 0 +#endif + // This macro prevents thread_sanitizer from reporting known concurrent writes. #define WEBP_TSAN_IGNORE_FUNCTION #if defined(__has_feature) @@ -109,6 +141,42 @@ extern "C" { #endif #endif +#if defined(WEBP_USE_THREAD) && !defined(_WIN32) +#include <pthread.h> // NOLINT + +#define WEBP_DSP_INIT(func) do { \ + static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ + (VP8CPUInfo)&func ## _last_cpuinfo_used; \ + static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \ + if (pthread_mutex_lock(&func ## _lock)) break; \ + if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func(); \ + func ## _last_cpuinfo_used = VP8GetCPUInfo; \ + (void)pthread_mutex_unlock(&func ## _lock); \ +} while (0) +#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32)) +#define WEBP_DSP_INIT(func) do { \ + static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ + (VP8CPUInfo)&func ## _last_cpuinfo_used; \ + if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break; \ + func(); \ + func ## _last_cpuinfo_used = VP8GetCPUInfo; \ +} while (0) +#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32) + +// Defines an Init + helper function that control multiple initialization of +// function pointers / tables. +/* Usage: + WEBP_DSP_INIT_FUNC(InitFunc) { + ...function body + } +*/ +#define WEBP_DSP_INIT_FUNC(name) \ + static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \ + WEBP_TSAN_IGNORE_FUNCTION void name(void) { \ + WEBP_DSP_INIT(name ## _body); \ + } \ + static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void) + #define WEBP_UBSAN_IGNORE_UNDEF #define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW #if defined(__clang__) && defined(__has_attribute) @@ -129,6 +197,18 @@ extern "C" { #endif #endif +// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) +#if !defined(WEBP_SWAP_16BIT_CSP) +#define WEBP_SWAP_16BIT_CSP 0 +#endif + +// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) +#if !defined(WORDS_BIGENDIAN) && \ + (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) +#define WORDS_BIGENDIAN +#endif + typedef enum { kSSE2, kSSE3, @@ -143,7 +223,7 @@ typedef enum { } CPUFeature; // returns true if the CPU supports the feature. 
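The WEBP_DSP_INIT macro introduced above makes every *_body initializer run at most once per VP8GetCPUInfo value, under a mutex when threading is available. The sentinel initialization is the subtle part: last_cpuinfo_used starts out pointing at itself, a value no real VP8GetCPUInfo can ever equal, so the first call always initializes. A standalone model of the same pattern:

#include <pthread.h>

typedef int (*CPUInfoFn)(int feature);
static CPUInfoFn GetCPUInfo;            /* stands in for VP8GetCPUInfo */

static void InitBody(void) { /* fill dispatch tables here */ }

static void Init(void) {
  /* The object-to-function-pointer cast mirrors the macro's sentinel. */
  static volatile CPUInfoFn last_used = (CPUInfoFn)&last_used;
  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  if (pthread_mutex_lock(&lock)) return;
  if (last_used != GetCPUInfo) InitBody();   /* re-run only on change */
  last_used = GetCPUInfo;
  (void)pthread_mutex_unlock(&lock);
}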
typedef int (*VP8CPUInfo)(CPUFeature feature); -WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo; +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; //------------------------------------------------------------------------------ // Init stub generator @@ -152,7 +232,7 @@ WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo; // avoiding a compiler warning. #define WEBP_DSP_INIT_STUB(func) \ extern void func(void); \ - WEBP_TSAN_IGNORE_FUNCTION void func(void) {} + void func(void) {} //------------------------------------------------------------------------------ // Encoding @@ -271,6 +351,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1, int xo, int yo, // center position int W, int H); // plane dimension +#if !defined(WEBP_REDUCE_SIZE) // This version is called with the guarantee that you can load 8 bytes and // 8 rows at offset src1 and src2 typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1, @@ -278,10 +359,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1, extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping +#endif +#if !defined(WEBP_DISABLE_STATS) typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1, const uint8_t* src2, int len); extern VP8AccumulateSSEFunc VP8AccumulateSSE; +#endif // must be called before using any of the above directly void VP8SSIMDspInit(void); @@ -462,12 +546,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand; extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink; // Plain-C implementation, as fall-back. -extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk, - const uint8_t* src); -extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk, - const uint8_t* src); -extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk); -extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk); +extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk, + const uint8_t* src); +extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk, + const uint8_t* src); +extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk); +extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk); // Main entry calls: extern void WebPRescalerImportRow(struct WebPRescaler* const wrk, @@ -533,24 +617,28 @@ void WebPMultRows(uint8_t* ptr, int stride, int width, int num_rows, int inverse); // Plain-C versions, used as fallback by some implementations. -void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha, - int width, int inverse); -void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse); - -// To be called first before using the above. -void WebPInitAlphaProcessing(void); +void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha, + int width, int inverse); +void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse); +#ifdef WORDS_BIGENDIAN // ARGB packing function: a/r/g/b input is rgba or bgra order. -extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r, - const uint8_t* g, const uint8_t* b, int len, - uint32_t* out); +extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, + const uint8_t* g, const uint8_t* b, int len, + uint32_t* out); +#endif // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order. 
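VP8GetCPUInfo, declared just above, is the single runtime knob the WEBP_DSP_INIT machinery keys on: a predicate over CPUFeature. A hypothetical implementation for illustration (the include path and the idea of overriding the pointer are assumptions, not documented libwebp API guidance):

#include "dsp.h"   /* assumed path; provides CPUFeature and VP8GetCPUInfo */

static int MyCPUInfo(CPUFeature feature) {
  return (feature == kSSE2);          /* report only SSE2, for illustration */
}

void SetupCpuInfo(void) {
  VP8GetCPUInfo = MyCPUInfo;          /* must run before any dsp init */
}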
-extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, - int len, int step, uint32_t* out); +extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, + int len, int step, uint32_t* out); + +// This function returns true if src[i] contains a value different from 0xff. +extern int (*WebPHasAlpha8b)(const uint8_t* src, int length); +// This function returns true if src[4*i] contains a value different from 0xff. +extern int (*WebPHasAlpha32b)(const uint8_t* src, int length); // To be called first before using the above. -void VP8EncDspARGBInit(void); +void WebPInitAlphaProcessing(void); //------------------------------------------------------------------------------ // Filter functions diff --git a/media/libwebp/dsp/filters.c b/media/libwebp/dsp/filters.c index 65f34aad1..dea3eb410 100644 --- a/media/libwebp/dsp/filters.c +++ b/media/libwebp/dsp/filters.c @@ -11,7 +11,7 @@ // // Author: Urvang (urvang@google.com) -#include "./dsp.h" +#include "../dsp/dsp.h" #include <assert.h> #include <stdlib.h> #include <string.h> @@ -20,16 +20,17 @@ // Helpful macro. # define SANITY_CHECK(in, out) \ - assert(in != NULL); \ - assert(out != NULL); \ + assert((in) != NULL); \ + assert((out) != NULL); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ (void)height; // Silence unused warning. -static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { +#if !WEBP_NEON_OMIT_C_CODE +static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred, + uint8_t* dst, int length, int inverse) { int i; if (inverse) { for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i]; @@ -41,10 +42,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred, //------------------------------------------------------------------------------ // Horizontal filter. -static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + int inverse, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -56,7 +57,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); row = 1; preds += stride; in += stride; @@ -66,8 +67,8 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { // Leftmost pixel is predicted from above. - PredictLine(in, preds - stride, out, 1, inverse); - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in, preds - stride, out, 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); ++row; preds += stride; in += stride; @@ -78,10 +79,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, //------------------------------------------------------------------------------ // Vertical filter. 
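PredictLine_C above carries an inverse flag because the filters are exact inverses modulo 256: forward filtering stores residuals dst = src - pred, reconstruction adds the prediction back, and uint8_t wraparound guarantees a lossless round trip. Standalone check:

#include <assert.h>
#include <stdint.h>

static void predict_line(const uint8_t* src, const uint8_t* pred,
                         uint8_t* dst, int len, int inverse) {
  int i;
  for (i = 0; i < len; ++i)
    dst[i] = (uint8_t)(inverse ? src[i] + pred[i] : src[i] - pred[i]);
}

int main(void) {
  const uint8_t src[4]  = { 10, 200, 7, 255 };
  const uint8_t pred[4] = { 9, 199, 255, 0 };
  uint8_t res[4], back[4];
  predict_line(src, pred, res, 4, 0);    /* filter: residuals wrap mod 256 */
  predict_line(res, pred, back, 4, 1);   /* unfilter: exact reconstruction */
  assert(back[0] == src[0] && back[2] == src[2] && back[3] == src[3]);
  return 0;
}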
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { +static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + int inverse, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -94,7 +95,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); row = 1; in += stride; out += stride; @@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { - PredictLine(in, preds, out, width, inverse); + PredictLine_C(in, preds, out, width, inverse); ++row; preds += stride; in += stride; out += stride; } } +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ // Gradient filter. -static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { +static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) { const int g = a + b - c; return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit } -static WEBP_INLINE void DoGradientFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { +#if !WEBP_NEON_OMIT_C_CODE +static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + int inverse, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); row = 1; preds += stride; in += stride; @@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, while (row < last_row) { int w; // leftmost pixel: predict from above. - PredictLine(in, preds - stride, out, 1, inverse); + PredictLine_C(in, preds - stride, out, 1, inverse); for (w = 1; w < width; ++w) { - const int pred = GradientPredictor(preds[w - 1], - preds[w - stride], - preds[w - stride - 1]); + const int pred = GradientPredictor_C(preds[w - 1], + preds[w - stride], + preds[w - stride - 1]); out[w] = in[w] + (inverse ? 
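GradientPredictor_C's clip deserves a note: a + b - c lies in [-255, 510], and (g & ~0xff) == 0 tests "already in [0, 255]" with a single mask instead of two comparisons. Quick standalone check:

#include <assert.h>

static int clip255(int g) {
  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  /* clip to 8 bit */
}

int main(void) {
  assert(clip255(-3)  == 0);     /* below range: high bits set, g < 0  */
  assert(clip255(128) == 128);   /* in range: mask test short-circuits */
  assert(clip255(400) == 255);   /* above range: bit 8 set, g >= 0     */
  return 0;
}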
pred : -pred); } ++row; @@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, out += stride; } } +#endif // !WEBP_NEON_OMIT_C_CODE #undef SANITY_CHECK //------------------------------------------------------------------------------ -static void HorizontalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); +#if !WEBP_NEON_OMIT_C_CODE +static void HorizontalFilter_C(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoHorizontalFilter_C(data, width, height, stride, 0, height, 0, + filtered_data); } -static void VerticalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); +static void VerticalFilter_C(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data); } - -static void GradientFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); +static void GradientFilter_C(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data); } - +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ -static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { uint8_t pred = (prev == NULL) ? 
0 : prev[0]; int i; for (i = 0; i < width; ++i) { @@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, } } -static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +#if !WEBP_NEON_OMIT_C_CODE +static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_C(NULL, in, out, width); } else { int i; for (i = 0; i < width; ++i) out[i] = prev[i] + in[i]; } } +#endif // !WEBP_NEON_OMIT_C_CODE -static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_C(NULL, in, out, width); } else { uint8_t top = prev[0], top_left = top, left = top; int i; for (i = 0; i < width; ++i) { top = prev[i]; // need to read this first, in case prev==out - left = in[i] + GradientPredictor(left, top, top_left); + left = in[i] + GradientPredictor_C(left, top, top_left); top_left = top; out[i] = left; } @@ -231,21 +238,20 @@ extern void VP8FiltersInitMSA(void); extern void VP8FiltersInitNEON(void); extern void VP8FiltersInitSSE2(void); -static volatile VP8CPUInfo filters_last_cpuinfo_used = - (VP8CPUInfo)&filters_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) { - if (filters_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(VP8FiltersInit) { WebPUnfilters[WEBP_FILTER_NONE] = NULL; - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; - WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; - WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; +#if !WEBP_NEON_OMIT_C_CODE + WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C; + WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C; +#endif + WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C; WebPFilters[WEBP_FILTER_NONE] = NULL; - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; +#if !WEBP_NEON_OMIT_C_CODE + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C; +#endif if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -253,11 +259,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) { VP8FiltersInitSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - VP8FiltersInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS_DSP_R2) if (VP8GetCPUInfo(kMIPSdspR2)) { VP8FiltersInitMIPSdspR2(); @@ -269,5 +270,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) { } #endif } - filters_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + VP8FiltersInitNEON(); + } +#endif + + assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL); + assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL); + assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL); + assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL); + assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL); + assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL); } diff --git a/media/libwebp/dsp/filters_sse2.c b/media/libwebp/dsp/filters_sse2.c index 67f77999e..2cc9bb976 100644 --- 
a/media/libwebp/dsp/filters_sse2.c +++ b/media/libwebp/dsp/filters_sse2.c @@ -11,7 +11,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "../dsp/dsp.h" #if defined(WEBP_USE_SSE2) @@ -24,16 +24,16 @@ // Helpful macro. # define SANITY_CHECK(in, out) \ - assert(in != NULL); \ - assert(out != NULL); \ + assert((in) != NULL); \ + assert((out) != NULL); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ (void)height; // Silence unused warning. -static void PredictLineTop(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length) { +static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred, + uint8_t* dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); @@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred, } // Special case for left-based prediction (when preds==dst-1 or preds==src-1). -static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) { +static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); @@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) { //------------------------------------------------------------------------------ // Horizontal filter. -static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in, + int width, int height, + int stride, + int row, int num_rows, + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, while (row < last_row) { // Leftmost pixel is predicted from above. out[0] = in[0] - in[-stride]; - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); ++row; in += stride; out += stride; @@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, //------------------------------------------------------------------------------ // Vertical filter. -static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, uint8_t* out) { +static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Filter line-by-line. 
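PredictLineTop_SSE2 and PredictLineLeft_SSE2 above both use the length & ~31 idiom: round the length down to a whole number of 32-byte vector steps, then mop up the remainder with a scalar loop. The shape, as a sketch:

static void process(const unsigned char* src, unsigned char* dst, int length) {
  const int max_pos = length & ~31;   /* largest multiple of the 32B step */
  int i;
  for (i = 0; i < max_pos; i += 32) {
    /* two 16-byte SSE2 operations would go here */
  }
  for (; i < length; ++i) dst[i] = src[i];   /* scalar tail */
}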
while (row < last_row) { - PredictLineTop(in, in - stride, out, width); + PredictLineTop_SSE2(in, in - stride, out, width); ++row; in += stride; out += stride; @@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, //------------------------------------------------------------------------------ // Gradient filter. -static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) { +static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) { const int g = a + b - c; return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit } -static void GradientPredictDirect(const uint8_t* const row, - const uint8_t* const top, - uint8_t* const out, int length) { +static void GradientPredictDirect_SSE2(const uint8_t* const row, + const uint8_t* const top, + uint8_t* const out, int length) { const int max_pos = length & ~7; int i; const __m128i zero = _mm_setzero_si128(); @@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row, _mm_storel_epi64((__m128i*)(out + i), H); } for (; i < length; ++i) { - out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]); + out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]); } } -static WEBP_INLINE void DoGradientFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - uint8_t* out) { +static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // Filter line-by-line. 
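Why the vector path in GradientPredictDirect_SSE2 needs no explicit clip: pixels are widened to 16 bit, a + b - c is computed exactly, and _mm_packus_epi16 saturates back to [0, 255] on the way out, the same clamp GradientPredictor_SSE2 performs with its (g & ~0xff) test. One lane of packus, in scalar form:

#include <stdint.h>

static uint8_t saturate_pack(int16_t v) {   /* _mm_packus_epi16, one lane */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}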
while (row < last_row) { out[0] = in[0] - in[-stride]; - GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); + GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1); ++row; in += stride; out += stride; @@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, //------------------------------------------------------------------------------ -static void HorizontalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); +static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoHorizontalFilter_SSE2(data, width, height, stride, 0, height, + filtered_data); } -static void VerticalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); +static void VerticalFilter_SSE2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data); } -static void GradientFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, filtered_data); +static void GradientFilter_SSE2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ // Inverse transforms -static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { int i; __m128i last; out[0] = in[0] + (prev == NULL ? 
0 : prev[0]); @@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, for (; i < width; ++i) out[i] = in[i] + out[i - 1]; } -static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_SSE2(NULL, in, out, width); } else { int i; const int max_pos = width & ~31; @@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, } } -static void GradientPredictInverse(const uint8_t* const in, - const uint8_t* const top, - uint8_t* const row, int length) { +static void GradientPredictInverse_SSE2(const uint8_t* const in, + const uint8_t* const top, + uint8_t* const row, int length) { if (length > 0) { int i; const int max_pos = length & ~7; @@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in, _mm_storel_epi64((__m128i*)&row[i], out); } for (; i < length; ++i) { - row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); + row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]); } } } -static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_SSE2(NULL, in, out, width); } else { out[0] = in[0] + prev[0]; // predict from above - GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1); + GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1); } } @@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, extern void VP8FiltersInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) { - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; - WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; - WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; + WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2; + WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2; + WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2; - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/media/libwebp/dsp/lossless.c b/media/libwebp/dsp/lossless.c index 20d18f6ec..93ccecdfd 100644 --- a/media/libwebp/dsp/lossless.c +++ b/media/libwebp/dsp/lossless.c @@ -13,14 +13,15 @@ // Jyrki Alakuijala (jyrki@google.com) // Urvang Joshi (urvang@google.com) -#include "./dsp.h" +#include "../dsp/dsp.h" +#include <assert.h> #include <math.h> #include <stdlib.h> #include "../dec/vp8li_dec.h" #include "../utils/endian_inl_utils.h" -#include "./lossless.h" -#include "./lossless_common.h" +#include "../dsp/lossless.h" +#include "../dsp/lossless_common.h" #define MAX_DIFF_COST (1e30f) @@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; } -// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined. 
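HorizontalUnfilter is inherently serial (out[i] = in[i] + out[i - 1]), yet the SSE2 version above reconstructs a whole register per iteration. In upstream libwebp the loop body, which this hunk does not show, does it as a log-step inclusive scan: adding the vector shifted left by 1, 2 and 4 bytes turns per-byte deltas into running sums. A scalar model of that scan on a toy 4-wide lane (an assumption about the elided kernel):

#include <stdint.h>

static void scan4(uint8_t v[4]) {   /* deltas -> prefix sums (mod 256) */
  int shift;
  for (shift = 1; shift < 4; shift <<= 1) {
    uint8_t t[4] = { 0, 0, 0, 0 };
    int i;
    for (i = shift; i < 4; ++i) t[i] = v[i - shift];   /* "slli" by shift */
    for (i = 0; i < 4; ++i) v[i] = (uint8_t)(v[i] + t[i]);
  }
}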
-#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409 +// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is +// inlined. +#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409 # define LOCAL_INLINE __attribute__ ((noinline)) #else # define LOCAL_INLINE WEBP_INLINE @@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { //------------------------------------------------------------------------------ // Predictors -static uint32_t Predictor0(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) { (void)top; (void)left; return ARGB_BLACK; } -static uint32_t Predictor1(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) { (void)top; return left; } -static uint32_t Predictor2(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) { (void)left; return top[0]; } -static uint32_t Predictor3(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) { (void)left; return top[1]; } -static uint32_t Predictor4(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) { (void)left; return top[-1]; } -static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average3(left, top[0], top[1]); return pred; } -static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[-1]); return pred; } -static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[0]); return pred; } -static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[-1], top[0]); (void)left; return pred; } -static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[0], top[1]); (void)left; return pred; } -static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average4(left, top[-1], top[0], top[1]); return pred; } -static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Select(top[0], left, top[-1]); return pred; } -static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); return pred; } -static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); return pred; } -GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0) -static void PredictorAdd1(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { +GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C) 
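Predictor5_C through Predictor10_C above build on Average2/Average3, whose bodies live elsewhere in this file; upstream's Average2 is a SWAR average over all four ARGB channels in one 32-bit operation (stated as an assumption about code outside this hunk): (a & b) keeps the shared bits, (a ^ b) >> 1 halves the differing ones, and the 0xfefefefe mask stops carries from crossing channel boundaries. Standalone check:

#include <assert.h>
#include <stdint.h>

static uint32_t average2(uint32_t a, uint32_t b) {
  return (((a ^ b) & 0xfefefefeu) >> 1) + (a & b);
}

int main(void) {
  assert(average2(0x00ff0000u, 0x00010000u) == 0x00800000u);  /* (255+1)/2 */
  assert(average2(0x01010101u, 0x03030303u) == 0x02020202u);  /* per channel */
  return 0;
}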
+static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper, + int num_pixels, uint32_t* out) { int i; uint32_t left = out[-1]; for (i = 0; i < num_pixels; ++i) { @@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper, } (void)upper; } -GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2) -GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3) -GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4) -GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5) -GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6) -GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7) -GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8) -GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9) -GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10) -GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11) -GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12) -GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13) +GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C) +GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C) +GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C) +GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C) +GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C) +GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C) +GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C) +GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C) +GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C) +GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C) +GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C) +GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C) //------------------------------------------------------------------------------ // Inverse prediction. -static void PredictorInverseTransform(const VP8LTransform* const transform, - int y_start, int y_end, - const uint32_t* in, uint32_t* out) { +static void PredictorInverseTransform_C(const VP8LTransform* const transform, + int y_start, int y_end, + const uint32_t* in, uint32_t* out) { const int width = transform->xsize_; if (y_start == 0) { // First Row follows the L (mode=1) mode. - PredictorAdd0(in, NULL, 1, out); - PredictorAdd1(in + 1, NULL, width - 1, out + 1); + PredictorAdd0_C(in, NULL, 1, out); + PredictorAdd1_C(in + 1, NULL, width - 1, out + 1); in += width; out += width; ++y_start; @@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform, const uint32_t* pred_mode_src = pred_mode_base; int x = 1; // First pixel follows the T (mode=2) mode. - PredictorAdd2(in, out - width, 1, out); + PredictorAdd2_C(in, out - width, 1, out); // .. the rest: while (x < width) { const VP8LPredictorAddSubFunc pred_func = @@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, const uint32_t argb = src[i]; const uint32_t green = argb >> 8; const uint32_t red = argb >> 16; - int new_red = red; - int new_blue = argb; + int new_red = red & 0xff; + int new_blue = argb & 0xff; new_red += ColorTransformDelta(m->green_to_red_, green); new_red &= 0xff; new_blue += ColorTransformDelta(m->green_to_blue_, green); @@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, } // Color space inverse transform. 
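PredictorInverseTransform_C picks a predictor per tile: the mode image is subsampled by 1 << bits in each direction and, per the VP8L format, stores the mode id in the green channel of its ARGB entries. A lookup sketch (hypothetical helper, names are illustrative):

#include <stdint.h>

static int tile_mode(const uint32_t* modes, int tiles_per_row,
                     int x, int y, int bits) {
  /* one mode entry covers a (1 << bits) x (1 << bits) pixel tile;
     the mode id sits in the green channel of the ARGB value. */
  return (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
}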
-static void ColorSpaceInverseTransform(const VP8LTransform* const transform, - int y_start, int y_end, - const uint32_t* src, uint32_t* dst) { +static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform, + int y_start, int y_end, + const uint32_t* src, uint32_t* dst) { const int width = transform->xsize_; const int tile_width = 1 << transform->bits_; const int mask = tile_width - 1; @@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \ } \ } -COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b, - VP8GetARGBIndex, VP8GetARGBValue) -COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t, - 8b, VP8GetAlphaIndex, VP8GetAlphaValue) +COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static, + uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue) +COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, , + uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue) #undef COLOR_INDEX_INVERSE @@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform, VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out); break; case PREDICTOR_TRANSFORM: - PredictorInverseTransform(transform, row_start, row_end, in, out); + PredictorInverseTransform_C(transform, row_start, row_end, in, out); if (row_end != transform->ysize_) { // The last predicted row in this iteration will be the top-pred row // for the first row in next iteration. @@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform, } break; case CROSS_COLOR_TRANSFORM: - ColorSpaceInverseTransform(transform, row_start, row_end, in, out); + ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out); break; case COLOR_INDEXING_TRANSFORM: if (in == out && transform->bits_ > 0) { @@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform, VP8LSubSampleSize(transform->xsize_, transform->bits_); uint32_t* const src = out + out_stride - in_stride; memmove(src, out, in_stride * sizeof(*src)); - ColorIndexInverseTransform(transform, row_start, row_end, src, out); + ColorIndexInverseTransform_C(transform, row_start, row_end, src, out); } else { - ColorIndexInverseTransform(transform, row_start, row_end, in, out); + ColorIndexInverseTransform_C(transform, row_start, row_end, in, out); } break; } @@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src, const uint32_t argb = *src++; const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf); const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) *dst++ = ba; *dst++ = rg; #else @@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src, const uint32_t argb = *src++; const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7); const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) *dst++ = gb; *dst++ = rg; #else @@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst, const uint32_t* const src_end = src + num_pixels; while (src < src_end) { const uint32_t argb = *src++; - -#if !defined(WORDS_BIGENDIAN) -#if !defined(WEBP_REFERENCE_IMPLEMENTATION) WebPUint32ToMem(dst, BSwap32(argb)); -#else // WEBP_REFERENCE_IMPLEMENTATION - dst[0] = (argb >> 24) & 0xff; - dst[1] = (argb >> 16) & 0xff; - dst[2] = (argb >> 8) & 0xff; - dst[3] = (argb >> 0) & 0xff; -#endif -#else // WORDS_BIGENDIAN - dst[0] = (argb >> 0) & 
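The RGB565 path above packs the top 5/6/5 bits of R/G/B into two bytes, RRRRRGGG then GGGBBBBB (byte-swapped when WEBP_SWAP_16BIT_CSP is set). A standalone check of the bit math used in VP8LConvertBGRAToRGB565_C:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint32_t argb = 0xFFFF8040u;   /* A=255 R=255 G=128 B=64 */
  const uint8_t rg = (uint8_t)(((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7));
  const uint8_t gb = (uint8_t)(((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f));
  assert(rg == 0xFC);   /* 11111 100 : R=31, G high 3 bits = 4 */
  assert(gb == 0x08);   /* 000 01000 : G low 3 bits = 0, B = 8 */
  return 0;
}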
0xff; - dst[1] = (argb >> 8) & 0xff; - dst[2] = (argb >> 16) & 0xff; - dst[3] = (argb >> 24) & 0xff; -#endif dst += sizeof(argb); } } else { @@ -590,48 +577,46 @@ extern void VP8LDspInitNEON(void); extern void VP8LDspInitMIPSdspR2(void); extern void VP8LDspInitMSA(void); -static volatile VP8CPUInfo lossless_last_cpuinfo_used = - (VP8CPUInfo)&lossless_last_cpuinfo_used; - -#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \ - (OUT)[0] = IN##0; \ - (OUT)[1] = IN##1; \ - (OUT)[2] = IN##2; \ - (OUT)[3] = IN##3; \ - (OUT)[4] = IN##4; \ - (OUT)[5] = IN##5; \ - (OUT)[6] = IN##6; \ - (OUT)[7] = IN##7; \ - (OUT)[8] = IN##8; \ - (OUT)[9] = IN##9; \ - (OUT)[10] = IN##10; \ - (OUT)[11] = IN##11; \ - (OUT)[12] = IN##12; \ - (OUT)[13] = IN##13; \ - (OUT)[14] = IN##0; /* <- padding security sentinels*/ \ - (OUT)[15] = IN##0; \ +#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \ + (OUT)[0] = IN##0_C; \ + (OUT)[1] = IN##1_C; \ + (OUT)[2] = IN##2_C; \ + (OUT)[3] = IN##3_C; \ + (OUT)[4] = IN##4_C; \ + (OUT)[5] = IN##5_C; \ + (OUT)[6] = IN##6_C; \ + (OUT)[7] = IN##7_C; \ + (OUT)[8] = IN##8_C; \ + (OUT)[9] = IN##9_C; \ + (OUT)[10] = IN##10_C; \ + (OUT)[11] = IN##11_C; \ + (OUT)[12] = IN##12_C; \ + (OUT)[13] = IN##13_C; \ + (OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \ + (OUT)[15] = IN##0_C; \ } while (0); -WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) { - if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(VP8LDspInit) { COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors) COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C) +#if !WEBP_NEON_OMIT_C_CODE VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C; VP8LTransformColorInverse = VP8LTransformColorInverse_C; - VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C; VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C; + VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C; + VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C; +#endif + VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C; VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C; - VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C; - VP8LMapColor32b = MapARGB; - VP8LMapColor8b = MapAlpha; + VP8LMapColor32b = MapARGB_C; + VP8LMapColor8b = MapAlpha_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. 
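WEBP_DSP_INIT_FUNC above replaces the old volatile lossless_last_cpuinfo_used pointer, which was a benign-race idiom rather than a real one-shot guard. The actual macro lives in dsp.h; a rough sketch of the shape such a wrapper can take (illustrative only, and deliberately simplified: a real guard needs atomics or a once primitive):

#define MY_DSP_INIT_FUNC(name)                                  \
  static void name##_body(void);                                \
  void name(void) {                                             \
    static int s_done = 0;  /* sketch only, not thread-safe */  \
    if (s_done) return;                                         \
    name##_body();                                              \
    s_done = 1;                                                 \
  }                                                             \
  static void name##_body(void)

/* usage: MY_DSP_INIT_FUNC(MyInit) { ...set function pointers once... } */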
if (VP8GetCPUInfo != NULL) { @@ -640,11 +625,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) { VP8LDspInitSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - VP8LDspInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS_DSP_R2) if (VP8GetCPUInfo(kMIPSdspR2)) { VP8LDspInitMIPSdspR2(); @@ -656,7 +636,23 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) { } #endif } - lossless_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + VP8LDspInitNEON(); + } +#endif + + assert(VP8LAddGreenToBlueAndRed != NULL); + assert(VP8LTransformColorInverse != NULL); + assert(VP8LConvertBGRAToRGBA != NULL); + assert(VP8LConvertBGRAToRGB != NULL); + assert(VP8LConvertBGRAToBGR != NULL); + assert(VP8LConvertBGRAToRGBA4444 != NULL); + assert(VP8LConvertBGRAToRGB565 != NULL); + assert(VP8LMapColor32b != NULL); + assert(VP8LMapColor8b != NULL); } #undef COPY_PREDICTOR_ARRAY diff --git a/media/libwebp/dsp/lossless.h b/media/libwebp/dsp/lossless.h index 352a54e50..4a1d1e0dd 100644 --- a/media/libwebp/dsp/lossless.h +++ b/media/libwebp/dsp/lossless.h @@ -25,10 +25,6 @@ extern "C" { #endif -#ifdef WEBP_EXPERIMENTAL_FEATURES -#include "../enc/delta_palettization_enc.h" -#endif // WEBP_EXPERIMENTAL_FEATURES - //------------------------------------------------------------------------------ // Decoding @@ -124,7 +120,7 @@ void VP8LDspInit(void); typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels); extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed; typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m, - uint32_t* const dst, int num_pixels); + uint32_t* dst, int num_pixels); extern VP8LTransformColorFunc VP8LTransformColor; typedef void (*VP8LCollectColorBlueTransformsFunc)( const uint32_t* argb, int stride, diff --git a/media/libwebp/dsp/lossless_common.h b/media/libwebp/dsp/lossless_common.h index c40f71120..dd2e4f247 100644 --- a/media/libwebp/dsp/lossless_common.h +++ b/media/libwebp/dsp/lossless_common.h @@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) { // ----------------------------------------------------------------------------- // PrefixEncode() -static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) { - const int log_floor = BitsLog2Floor(n); - if (n == (n & ~(n - 1))) { // zero or a power of two. - return log_floor; - } - return log_floor + 1; -} - // Splitting of distance and length codes into prefixes and // extra bits. The prefixes are encoded with an entropy code // while the extra bits are stored just as normal bits. diff --git a/media/libwebp/dsp/lossless_neon.c b/media/libwebp/dsp/lossless_neon.c index 1145d5fad..a7bf47f3c 100644 --- a/media/libwebp/dsp/lossless_neon.c +++ b/media/libwebp/dsp/lossless_neon.c @@ -11,14 +11,14 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "../dsp/dsp.h" #if defined(WEBP_USE_NEON) #include <arm_neon.h> -#include "./lossless.h" -#include "./neon.h" +#include "../dsp/lossless.h" +#include "../dsp/neon.h" //------------------------------------------------------------------------------ // Colorspace conversion functions @@ -26,8 +26,8 @@ #if !defined(WORK_AROUND_GCC) // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for // gcc-4.8.x at least. 
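The renamed converters below all lean on the same NEON idiom: vld4q_u8 loads 16 pixels with the four channels de-interleaved into separate registers, so a BGRA-to-RGBA swap is just exchanging two registers before the re-interleaving store. A self-contained sketch (assumed helper name):

#include <arm_neon.h>

static void SwapBlueAndRed16(const uint8_t* src, uint8_t* dst) {
  uint8x16x4_t px = vld4q_u8(src);  /* val[0]=B val[1]=G val[2]=R val[3]=A */
  const uint8x16_t tmp = px.val[0];
  px.val[0] = px.val[2];            /* red moves to byte 0 */
  px.val[2] = tmp;                  /* blue moves to byte 2 */
  vst4q_u8(dst, px);                /* the store re-interleaves the channels */
}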
-static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src, VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs } -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src, VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs } -static void ConvertBGRAToRGB(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src, static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 }; -static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~1); const uint8x8_t shuffle = vld1_u8(kRGBAShuffle); for (; src < end; src += 2) { @@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = { { 21, 22, 24, 25, 26, 28, 29, 30 } }; -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~7); const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]); const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]); @@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = { { 21, 20, 26, 25, 24, 30, 29, 28 } }; -static void ConvertBGRAToRGB(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~7); const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]); const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]); @@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src, #endif // !WORK_AROUND_GCC - //------------------------------------------------------------------------------ // Predictor Transform @@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255 }; -static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, - const uint8x16_t shuffle) { +static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb, + const uint8x16_t shuffle) { return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)), vtbl1q_u8(argb, vget_high_u8(shuffle))); } @@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, // 255 = byte will be zeroed static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; -static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, - const uint8x8_t shuffle) { +static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb, + const uint8x8_t shuffle) { return 
vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), vtbl1_u8(vget_high_u8(argb), shuffle)); } #endif // USE_VTBLQ -static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, - uint32_t* dst) { +static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels, + uint32_t* dst) { const uint32_t* const end = src + (num_pixels & ~3); #ifdef USE_VTBLQ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); @@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, #endif for (; src < end; src += 4, dst += 4) { const uint8x16_t argb = vld1q_u8((const uint8_t*)src); - const uint8x16_t greens = DoGreenShuffle(argb, shuffle); + const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle); vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens)); } // fallthrough and finish off with plain-C @@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, //------------------------------------------------------------------------------ // Color Transform -static void TransformColorInverse(const VP8LMultipliers* const m, - const uint32_t* const src, int num_pixels, - uint32_t* dst) { +static void TransformColorInverse_NEON(const VP8LMultipliers* const m, + const uint32_t* const src, + int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 6. #define CST(X) (((int16_t)(m->X << 8)) >> 6) const int16_t rb[8] = { @@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m, const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i)); const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); // 0 g 0 g - const uint8x16_t greens = DoGreenShuffle(in, shuffle); + const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle); // x dr x db1 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); // x r' x b' @@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { VP8LPredictorsAdd[12] = PredictorAdd12_NEON; VP8LPredictorsAdd[13] = PredictorAdd13_NEON; - VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; - VP8LConvertBGRAToBGR = ConvertBGRAToBGR; - VP8LConvertBGRAToRGB = ConvertBGRAToRGB; + VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON; + VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON; + VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON; - VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; - VP8LTransformColorInverse = TransformColorInverse; + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON; + VP8LTransformColorInverse = TransformColorInverse_NEON; } #else // !WEBP_USE_NEON diff --git a/media/libwebp/dsp/lossless_sse2.c b/media/libwebp/dsp/lossless_sse2.c index 15aae9386..c40fcfb76 100644 --- a/media/libwebp/dsp/lossless_sse2.c +++ b/media/libwebp/dsp/lossless_sse2.c @@ -11,21 +11,22 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "../dsp/dsp.h" #if defined(WEBP_USE_SSE2) -#include "./common_sse2.h" -#include "./lossless.h" -#include "./lossless_common.h" +#include "../dsp/common_sse2.h" +#include "../dsp/lossless.h" +#include "../dsp/lossless_common.h" #include <assert.h> #include <emmintrin.h> //------------------------------------------------------------------------------ // Predictor Transform -static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, - uint32_t c2) { +static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0, + uint32_t c1, + uint32_t c2) { const __m128i zero = _mm_setzero_si128(); const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); const __m128i C1 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); @@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, return output; } -static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, - uint32_t c2) { +static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, + uint32_t c1, + uint32_t c2) { const __m128i zero = _mm_setzero_si128(); const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); @@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, return output; } -static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { +static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) { int pa_minus_pb; const __m128i zero = _mm_setzero_si128(); const __m128i A0 = _mm_cvtsi32_si128(a); @@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0, *avg = _mm_sub_epi8(avg1, one); } -static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1, - __m128i* const avg) { +static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, + const uint32_t a1, + __m128i* const avg) { // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) const __m128i ones = _mm_set1_epi8(1); const __m128i A0 = _mm_cvtsi32_si128(a0); @@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1, *avg = _mm_sub_epi8(avg1, one); } -static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) { +static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { const __m128i zero = _mm_setzero_si128(); const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); @@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) { return _mm_srli_epi16(sum, 1); } -static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { +static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) { __m128i output; - Average2_uint32(a0, a1, &output); + Average2_uint32_SSE2(a0, a1, &output); return _mm_cvtsi128_si32(output); } -static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { +static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1, + uint32_t a2) { const __m128i zero = _mm_setzero_si128(); - const __m128i avg1 = Average2_uint32_16(a0, a2); + const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2); const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); const __m128i sum = _mm_add_epi16(avg1, A1); const __m128i avg2 = _mm_srli_epi16(sum, 1); @@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { return output; } -static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, - uint32_t a2, uint32_t a3) { - const __m128i avg1 = Average2_uint32_16(a0, a1); - const __m128i avg2 = Average2_uint32_16(a2, a3); +static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, + uint32_t a2, uint32_t a3) { + const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1); + const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3); const __m128i sum = _mm_add_epi16(avg2, avg1); const __m128i avg3 = _mm_srli_epi16(sum, 1); const __m128i A0 = _mm_packus_epi16(avg3, avg3); @@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, } static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = 
Average3(left, top[0], top[1]); + const uint32_t pred = Average3_SSE2(left, top[0], top[1]); return pred; } static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[-1]); + const uint32_t pred = Average2_SSE2(left, top[-1]); return pred; } static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[0]); + const uint32_t pred = Average2_SSE2(left, top[0]); return pred; } static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[-1], top[0]); + const uint32_t pred = Average2_SSE2(top[-1], top[0]); (void)left; return pred; } static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[0], top[1]); + const uint32_t pred = Average2_SSE2(top[0], top[1]); (void)left; return pred; } static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average4(left, top[-1], top[0], top[1]); + const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]); return pred; } static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Select(top[0], left, top[-1]); + const uint32_t pred = Select_SSE2(top[0], left, top[-1]); return pred; } static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); + const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]); return pred; } static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); + const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]); return pred; } @@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) #undef GENERATE_PREDICTOR_2 // Predictor10: average of (average of (L,TL), average of (T, TR)). +#define DO_PRED10(OUT) do { \ + __m128i avgLTL, avg; \ + Average2_m128i(&L, &TL, &avgLTL); \ + Average2_m128i(&avgTTR, &avgLTL, &avg); \ + L = _mm_add_epi8(avg, src); \ + out[i + (OUT)] = _mm_cvtsi128_si32(L); \ +} while (0) + +#define DO_PRED10_SHIFT do { \ + /* Rotate the pre-computed values for the next iteration.*/ \ + avgTTR = _mm_srli_si128(avgTTR, 4); \ + TL = _mm_srli_si128(TL, 4); \ + src = _mm_srli_si128(src, 4); \ +} while (0) + static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { - int i, j; + int i; __m128i L = _mm_cvtsi32_si128(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); @@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); __m128i avgTTR; Average2_m128i(&T, &TR, &avgTTR); - for (j = 0; j < 4; ++j) { - __m128i avgLTL, avg; - Average2_m128i(&L, &TL, &avgLTL); - Average2_m128i(&avgTTR, &avgLTL, &avg); - L = _mm_add_epi8(avg, src); - out[i + j] = _mm_cvtsi128_si32(L); - // Rotate the pre-computed values for the next iteration. - avgTTR = _mm_srli_si128(avgTTR, 4); - TL = _mm_srli_si128(TL, 4); - src = _mm_srli_si128(src, 4); - } + DO_PRED10(0); + DO_PRED10_SHIFT; + DO_PRED10(1); + DO_PRED10_SHIFT; + DO_PRED10(2); + DO_PRED10_SHIFT; + DO_PRED10(3); } if (i != num_pixels) { VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); } } +#undef DO_PRED10 +#undef DO_PRED10_SHIFT // Predictor11: select. 
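For reference, the select predictor that the DO_PRED11 macros below vectorize can be written scalar as follows (a sketch assuming the usual VP8L definition; pa and pb match the comments in the macros, and abs needs <stdlib.h>):

#include <stdint.h>
#include <stdlib.h>

static uint32_t SelectScalar(uint32_t T, uint32_t L, uint32_t TL) {
  int pa = 0, pb = 0, shift;
  for (shift = 0; shift < 32; shift += 8) {
    pa += abs((int)((T >> shift) & 0xff) - (int)((TL >> shift) & 0xff));  /* sum |T-TL| */
    pb += abs((int)((L >> shift) & 0xff) - (int)((TL >> shift) & 0xff));  /* sum |L-TL| */
  }
  return (pb > pa) ? L : T;  /* same selection the SSE2 mask computes */
}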
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B, - __m128i* const out) { - // We can unpack with any value on the upper 32 bits, provided it's the same - // on both operands (to that their sum of abs diff is zero). Here we use *A. - const __m128i A_lo = _mm_unpacklo_epi32(*A, *A); - const __m128i B_lo = _mm_unpacklo_epi32(*B, *A); - const __m128i A_hi = _mm_unpackhi_epi32(*A, *A); - const __m128i B_hi = _mm_unpackhi_epi32(*B, *A); - const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo); - const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi); - *out = _mm_packs_epi32(s_lo, s_hi); -} +#define DO_PRED11(OUT) do { \ + const __m128i L_lo = _mm_unpacklo_epi32(L, T); \ + const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \ + const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \ + const __m128i mask = _mm_cmpgt_epi32(pb, pa); \ + const __m128i A = _mm_and_si128(mask, L); \ + const __m128i B = _mm_andnot_si128(mask, T); \ + const __m128i pred = _mm_or_si128(A, B); /* pred = (pb > pa) ? L : T */ \ + L = _mm_add_epi8(src, pred); \ + out[i + (OUT)] = _mm_cvtsi128_si32(L); \ +} while (0) + +#define DO_PRED11_SHIFT do { \ + /* Shift the pre-computed value for the next iteration.*/ \ + T = _mm_srli_si128(T, 4); \ + TL = _mm_srli_si128(TL, 4); \ + src = _mm_srli_si128(src, 4); \ + pa = _mm_srli_si128(pa, 4); \ +} while (0) static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { - int i, j; + int i; + __m128i pa; __m128i L = _mm_cvtsi32_si128(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); - __m128i pa; - GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL| - for (j = 0; j < 4; ++j) { - const __m128i L_lo = _mm_unpacklo_epi32(L, L); - const __m128i TL_lo = _mm_unpacklo_epi32(TL, L); - const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL| - const __m128i mask = _mm_cmpgt_epi32(pb, pa); - const __m128i A = _mm_and_si128(mask, L); - const __m128i B = _mm_andnot_si128(mask, T); - const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T - L = _mm_add_epi8(src, pred); - out[i + j] = _mm_cvtsi128_si32(L); - // Shift the pre-computed value for the next iteration. - T = _mm_srli_si128(T, 4); - TL = _mm_srli_si128(TL, 4); - src = _mm_srli_si128(src, 4); - pa = _mm_srli_si128(pa, 4); + { + // We can unpack with any value on the upper 32 bits, provided it's the + // same on both operands (so that their sum of abs diff is zero). Here we + // use T. + const __m128i T_lo = _mm_unpacklo_epi32(T, T); + const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); + const __m128i T_hi = _mm_unpackhi_epi32(T, T); + const __m128i TL_hi = _mm_unpackhi_epi32(TL, T); + const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo); + const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi); + pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL| } + DO_PRED11(0); + DO_PRED11_SHIFT; + DO_PRED11(1); + DO_PRED11_SHIFT; + DO_PRED11(2); + DO_PRED11_SHIFT; + DO_PRED11(3); } if (i != num_pixels) { VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); } } +#undef DO_PRED11 +#undef DO_PRED11_SHIFT // Predictor12: ClampedAddSubtractFull.
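Scalar reference for the predictor behind DO_PRED12 (a sketch, assuming the standard per-channel definition): each channel computes L + T - TL and clamps to [0, 255], a saturation the SSE2 version gets for free from the _mm_packus_epi16 pack:

#include <stdint.h>

static uint32_t ClampedAddSubtractFullScalar(uint32_t L, uint32_t T,
                                             uint32_t TL) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    int v = (int)((L >> shift) & 0xff) + (int)((T >> shift) & 0xff) -
            (int)((TL >> shift) & 0xff);
    if (v < 0) v = 0;
    if (v > 255) v = 255;           /* clamp each channel to one byte */
    out |= (uint32_t)v << shift;
  }
  return out;
}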
-#define DO_PRED12(DIFF, LANE, OUT) \ -do { \ - const __m128i all = _mm_add_epi16(L, (DIFF)); \ - const __m128i alls = _mm_packus_epi16(all, all); \ - const __m128i res = _mm_add_epi8(src, alls); \ - out[i + (OUT)] = _mm_cvtsi128_si32(res); \ - L = _mm_unpacklo_epi8(res, zero); \ +#define DO_PRED12(DIFF, LANE, OUT) do { \ + const __m128i all = _mm_add_epi16(L, (DIFF)); \ + const __m128i alls = _mm_packus_epi16(all, all); \ + const __m128i res = _mm_add_epi8(src, alls); \ + out[i + (OUT)] = _mm_cvtsi128_si32(res); \ + L = _mm_unpacklo_epi8(res, zero); \ +} while (0) + +#define DO_PRED12_SHIFT(DIFF, LANE) do { \ /* Shift the pre-computed value for the next iteration.*/ \ - if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \ + if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \ src = _mm_srli_si128(src, 4); \ } while (0) @@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo); __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi); DO_PRED12(diff_lo, 0, 0); + DO_PRED12_SHIFT(diff_lo, 0); DO_PRED12(diff_lo, 1, 1); + DO_PRED12_SHIFT(diff_lo, 1); DO_PRED12(diff_hi, 0, 2); + DO_PRED12_SHIFT(diff_hi, 0); DO_PRED12(diff_hi, 1, 3); } if (i != num_pixels) { @@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, } } #undef DO_PRED12 +#undef DO_PRED12_SHIFT // Due to averages with integers, values cannot be accumulated in parallel for // predictors 13. @@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2) //------------------------------------------------------------------------------ // Subtract-Green Transform -static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, - uint32_t* dst) { +static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels, + uint32_t* dst) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb @@ -414,19 +448,16 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, //------------------------------------------------------------------------------ // Color Transform -static void TransformColorInverse(const VP8LMultipliers* const m, - const uint32_t* const src, int num_pixels, - uint32_t* dst) { +static void TransformColorInverse_SSE2(const VP8LMultipliers* const m, + const uint32_t* const src, + int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 5. 
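The CST constants pack the signed 8-bit color multipliers into 16-bit fixed point: << 8 followed by an arithmetic >> 5 is a sign-preserving multiply by 8, and with the color value sitting in the high byte of each 16-bit lane, a single _mm_mulhi_epi16 reproduces the scalar (m * c) >> 5 for eight lanes at once. Scalar sketch of the per-channel delta (assumed to match the usual VP8L helper):

#include <stdint.h>

static int ColorTransformDeltaSketch(int8_t m, int8_t c) {
  return ((int)m * c) >> 5;   /* e.g. m = 16, c = 100 gives 1600 >> 5 = 50 */
}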
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend - const __m128i mults_rb = _mm_set_epi16( - CST(green_to_red_), CST(green_to_blue_), - CST(green_to_red_), CST(green_to_blue_), - CST(green_to_red_), CST(green_to_blue_), - CST(green_to_red_), CST(green_to_blue_)); - const __m128i mults_b2 = _mm_set_epi16( - CST(red_to_blue_), 0, CST(red_to_blue_), 0, - CST(red_to_blue_), 0, CST(red_to_blue_), 0); +#define MK_CST_16(HI, LO) \ + _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) + const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_)); + const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0); +#undef MK_CST_16 #undef CST const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks int i; @@ -454,8 +485,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m, //------------------------------------------------------------------------------ // Color-space conversion functions -static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, - uint8_t* dst) { +static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, + uint8_t* dst) { const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; @@ -469,11 +500,11 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, __m128i in5 = _mm_loadu_si128(in + 5); __m128i in6 = _mm_loadu_si128(in + 6); __m128i in7 = _mm_loadu_si128(in + 7); - VP8L32bToPlanar(&in0, &in1, &in2, &in3); - VP8L32bToPlanar(&in4, &in5, &in6, &in7); + VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3); + VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7); // At this points, in1/in5 contains red only, in2/in6 green only ... // Pack the colors in 24b RGB. - VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7); + VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7); _mm_storeu_si128(out + 0, in1); _mm_storeu_si128(out + 1, in5); _mm_storeu_si128(out + 2, in2); @@ -490,27 +521,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, } } -static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { + const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { - const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 - const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 - const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... - const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... - const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... - const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... - const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 - const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 - const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 - const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 - const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7 - const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7 - const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1... - const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5... 
- _mm_storeu_si128(out++, rgba0); - _mm_storeu_si128(out++, rgba4); + const __m128i A1 = _mm_loadu_si128(in++); + const __m128i A2 = _mm_loadu_si128(in++); + const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0 + const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0 + const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A + const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A + const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i F1 = _mm_or_si128(E1, C1); + const __m128i F2 = _mm_or_si128(E2, C2); + _mm_storeu_si128(out++, F1); + _mm_storeu_si128(out++, F2); num_pixels -= 8; } // left-overs @@ -519,8 +549,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src, } } -static void ConvertBGRAToRGBA4444(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { const __m128i mask_0x0f = _mm_set1_epi8(0x0f); const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); const __m128i* in = (const __m128i*)src; @@ -541,7 +571,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7- const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7 const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0 -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7 #else const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7 @@ -555,8 +585,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, } } -static void ConvertBGRAToRGB565(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); const __m128i mask_0x07 = _mm_set1_epi8(0x07); @@ -582,7 +612,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src, const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx const __m128i b1 = _mm_srli_epi16(b0, 3); const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7 #else const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7 @@ -596,8 +626,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src, } } -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff); const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0); const __m128i* in = (const __m128i*)src; @@ -660,14 +690,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) { VP8LPredictorsAdd[12] = PredictorAdd12_SSE2; VP8LPredictorsAdd[13] = PredictorAdd13_SSE2; - VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; - VP8LTransformColorInverse = TransformColorInverse; + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2; + VP8LTransformColorInverse = TransformColorInverse_SSE2; - VP8LConvertBGRAToRGB = ConvertBGRAToRGB; - VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; - VP8LConvertBGRAToRGBA4444 = 
ConvertBGRAToRGBA4444; - VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565; - VP8LConvertBGRAToBGR = ConvertBGRAToBGR; + VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2; + VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2; + VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2; + VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2; + VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/media/libwebp/dsp/moz.build b/media/libwebp/dsp/moz.build index 006a691a0..fa6df9e9e 100644 --- a/media/libwebp/dsp/moz.build +++ b/media/libwebp/dsp/moz.build @@ -27,8 +27,10 @@ SOURCES += [ 'upsampling.c', 'upsampling_neon.c', 'upsampling_sse2.c', + 'upsampling_sse41.c', 'yuv.c', 'yuv_sse2.c', + 'yuv_sse41.c', ] if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']: @@ -45,7 +47,9 @@ elif CONFIG['INTEL_ARCHITECTURE']: SOURCES['lossless_sse2.c'].flags += CONFIG['SSE2_FLAGS'] SOURCES['rescaler_sse2.c'].flags += CONFIG['SSE2_FLAGS'] SOURCES['upsampling_sse2.c'].flags += CONFIG['SSE2_FLAGS'] + SOURCES['upsampling_sse41.c'].flags += CONFIG['SSE2_FLAGS'] SOURCES['yuv_sse2.c'].flags += CONFIG['SSE2_FLAGS'] + SOURCES['yuv_sse41.c'].flags += CONFIG['SSE2_FLAGS'] FINAL_LIBRARY = 'gkmedias' diff --git a/media/libwebp/dsp/msa_macro.h b/media/libwebp/dsp/msa_macro.h index d0e5f45e0..dfacda6cc 100644 --- a/media/libwebp/dsp/msa_macro.h +++ b/media/libwebp/dsp/msa_macro.h @@ -22,6 +22,7 @@ #endif #ifdef CLANG_BUILD + #define ALPHAVAL (-1) #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b) #define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b) #define SRAI_B(a, b) __msa_srai_b((v16i8)a, b) @@ -32,6 +33,7 @@ #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b) #define ORI_B(a, b) __msa_ori_b((v16u8)a, b) #else + #define ALPHAVAL (0xff) #define ADDVI_H(a, b) (a + b) #define ADDVI_W(a, b) (a + b) #define SRAI_B(a, b) (a >> b) diff --git a/media/libwebp/dsp/neon.h b/media/libwebp/dsp/neon.h index 3b548a685..63c27a290 100644 --- a/media/libwebp/dsp/neon.h +++ b/media/libwebp/dsp/neon.h @@ -14,11 +14,12 @@ #include <arm_neon.h> -#include "./dsp.h" +#include "../dsp/dsp.h" // Right now, some intrinsics functions seem slower, so we disable them -// everywhere except aarch64 where the inline assembly is incompatible. -#if defined(__aarch64__) +// everywhere except newer clang/gcc or aarch64 where the inline assembly is +// incompatible. +#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__) #define WEBP_USE_INTRINSICS // use intrinsics when possible #endif @@ -43,11 +44,11 @@ // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). 
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) -#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) +#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) #define WORK_AROUND_GCC #endif -static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { +static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) { uint64x2x2_t row01, row23; row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); diff --git a/media/libwebp/dsp/rescaler.c b/media/libwebp/dsp/rescaler.c index 0f5450235..f70e6beef 100644 --- a/media/libwebp/dsp/rescaler.c +++ b/media/libwebp/dsp/rescaler.c @@ -13,7 +13,7 @@ #include <assert.h> -#include "./dsp.h" +#include "../dsp/dsp.h" #include "../utils/rescaler_utils.h" //------------------------------------------------------------------------------ @@ -25,7 +25,8 @@ //------------------------------------------------------------------------------ // Row import -void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) { +void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk, + const uint8_t* src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; int channel; @@ -56,7 +57,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) { } } -void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) { +void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk, + const uint8_t* src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; int channel; @@ -92,7 +94,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) { //------------------------------------------------------------------------------ // Row export -void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) { +void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -123,7 +125,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) { } } -void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) { +void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -202,16 +204,15 @@ extern void WebPRescalerDspInitMIPSdspR2(void); extern void WebPRescalerDspInitMSA(void); extern void WebPRescalerDspInitNEON(void); -static volatile VP8CPUInfo rescaler_last_cpuinfo_used = - (VP8CPUInfo)&rescaler_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) { - if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return; +WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) { +#if !defined(WEBP_REDUCE_SIZE) +#if !WEBP_NEON_OMIT_C_CODE + WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C; + WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C; +#endif - WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC; - WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC; - WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC; - WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC; + WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C; + WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C; if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -219,11 +220,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) { WebPRescalerDspInitSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - WebPRescalerDspInitNEON(); - } -#endif 
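Across these init functions the NEON probe moves out of the VP8GetCPUInfo != NULL block: on builds where WEBP_NEON_OMIT_C_CODE is set, no C fallback rows are compiled at all, so the NEON entry points must be installed even when no CPU-detection hook was provided. The trailing asserts then verify that every function pointer ended up non-NULL regardless of which path ran.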
#if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { WebPRescalerDspInitMIPS32(); @@ -240,5 +236,17 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) { } #endif } - rescaler_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + WebPRescalerDspInitNEON(); + } +#endif + + assert(WebPRescalerExportRowExpand != NULL); + assert(WebPRescalerExportRowShrink != NULL); + assert(WebPRescalerImportRowExpand != NULL); + assert(WebPRescalerImportRowShrink != NULL); +#endif // WEBP_REDUCE_SIZE } diff --git a/media/libwebp/dsp/rescaler_neon.c b/media/libwebp/dsp/rescaler_neon.c index b2dd8f30c..835e646c1 100644 --- a/media/libwebp/dsp/rescaler_neon.c +++ b/media/libwebp/dsp/rescaler_neon.c @@ -11,13 +11,13 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "../dsp/dsp.h" -#if defined(WEBP_USE_NEON) +#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE) #include <arm_neon.h> #include <assert.h> -#include "./neon.h" +#include "../dsp/neon.h" #include "../utils/rescaler_utils.h" #define ROUNDER (WEBP_RESCALER_ONE >> 1) @@ -41,9 +41,9 @@ #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work" #endif -static uint32x4_t Interpolate(const rescaler_t* const frow, - const rescaler_t* const irow, - uint32_t A, uint32_t B) { +static uint32x4_t Interpolate_NEON(const rescaler_t* const frow, + const rescaler_t* const irow, + uint32_t A, uint32_t B) { LOAD_32x4(frow, A0); LOAD_32x4(irow, B0); const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A); @@ -56,7 +56,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow, return E; } -static void RescalerExportRowExpand(WebPRescaler* const wrk) { +static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -91,9 +91,9 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) { const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); for (x_out = 0; x_out < max_span; x_out += 8) { const uint32x4_t C0 = - Interpolate(frow + x_out + 0, irow + x_out + 0, A, B); + Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B); const uint32x4_t C1 = - Interpolate(frow + x_out + 4, irow + x_out + 4, A, B); + Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B); const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half); const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half); const uint16x4_t E0 = vmovn_u32(D0); @@ -112,7 +112,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) { } } -static void RescalerExportRowShrink(WebPRescaler* const wrk) { +static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -175,8 +175,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) { extern void WebPRescalerDspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) { - WebPRescalerExportRowExpand = RescalerExportRowExpand; - WebPRescalerExportRowShrink = RescalerExportRowShrink; + WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON; + WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON; } #else // !WEBP_USE_NEON diff --git a/media/libwebp/dsp/rescaler_sse2.c b/media/libwebp/dsp/rescaler_sse2.c index 8271c22e0..1306f8457 100644 --- a/media/libwebp/dsp/rescaler_sse2.c +++ b/media/libwebp/dsp/rescaler_sse2.c @@ -11,9 +11,9 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "../dsp/dsp.h" 
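Both the NEON and the SSE2 rescalers below compute in WEBP_RESCALER_RFIX-bit fixed point and round to nearest via ROUNDER. A standalone sketch of the MULT_FIX arithmetic (names assumed; RFIX is 32 upstream):

#include <stdint.h>

#define RFIX 32                            /* WEBP_RESCALER_RFIX upstream */
#define ROUND ((uint64_t)1 << (RFIX - 1))  /* == WEBP_RESCALER_ONE >> 1 */

static uint32_t MultFix(uint32_t x, uint32_t y) {
  return (uint32_t)(((uint64_t)x * y + ROUND) >> RFIX);  /* round to nearest */
}
/* e.g. y = 1u << 31 encodes the factor 0.5: MultFix(101, 1u << 31) == 51. */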
-#if defined(WEBP_USE_SSE2) +#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE) #include <emmintrin.h> #include <assert.h> @@ -27,7 +27,7 @@ #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) // input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0 -static void LoadTwoPixels(const uint8_t* const src, __m128i* out) { +static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) { const __m128i zero = _mm_setzero_si128(); const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0 @@ -36,28 +36,30 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) { } // input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0 -static void LoadHeightPixels(const uint8_t* const src, __m128i* out) { +static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) { const __m128i zero = _mm_setzero_si128(); const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH *out = _mm_unpacklo_epi8(A, zero); } -static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, - const uint8_t* src) { +static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, + const uint8_t* src) { rescaler_t* frow = wrk->frow; const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels; const int x_add = wrk->x_add; int accum = x_add; __m128i cur_pixels; + // SSE2 implementation only works with 16b signed arithmetic at max. + if (wrk->src_width < 8 || accum >= (1 << 15)) { + WebPRescalerImportRowExpand_C(wrk, src); + return; + } + assert(!WebPRescalerInputDone(wrk)); assert(wrk->x_expand); if (wrk->num_channels == 4) { - if (wrk->src_width < 2) { - WebPRescalerImportRowExpandC(wrk, src); - return; - } - LoadTwoPixels(src, &cur_pixels); + LoadTwoPixels_SSE2(src, &cur_pixels); src += 4; while (1) { const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum); @@ -67,7 +69,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, if (frow >= frow_end) break; accum -= wrk->x_sub; if (accum < 0) { - LoadTwoPixels(src, &cur_pixels); + LoadTwoPixels_SSE2(src, &cur_pixels); src += 4; accum += x_add; } @@ -75,11 +77,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, } else { int left; const uint8_t* const src_limit = src + wrk->src_width - 8; - if (wrk->src_width < 8) { - WebPRescalerImportRowExpandC(wrk, src); - return; - } - LoadHeightPixels(src, &cur_pixels); + LoadEightPixels_SSE2(src, &cur_pixels); src += 7; left = 7; while (1) { @@ -94,7 +92,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, if (--left) { cur_pixels = _mm_srli_si128(cur_pixels, 2); } else if (src <= src_limit) { - LoadHeightPixels(src, &cur_pixels); + LoadEightPixels_SSE2(src, &cur_pixels); src += 7; left = 7; } else { // tail @@ -110,8 +108,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, assert(accum == 0); } -static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, - const uint8_t* src) { +static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, + const uint8_t* src) { const int x_sub = wrk->x_sub; int accum = 0; const __m128i zero = _mm_setzero_si128(); @@ -123,7 +121,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width; if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) { - WebPRescalerImportRowShrinkC(wrk, src); + WebPRescalerImportRowShrink_C(wrk, src); return; } assert(!WebPRescalerInputDone(wrk)); @@ -169,12 
+167,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, // Row export // load *src as epi64, multiply by mult and store result in [out0 ... out3] -static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src, - const __m128i* const mult, - __m128i* const out0, - __m128i* const out1, - __m128i* const out2, - __m128i* const out3) { +static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src, + const __m128i* const mult, + __m128i* const out0, + __m128i* const out1, + __m128i* const out2, + __m128i* const out3) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4)); const __m128i A2 = _mm_srli_epi64(A0, 32); @@ -192,12 +190,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src, } } -static WEBP_INLINE void ProcessRow(const __m128i* const A0, - const __m128i* const A1, - const __m128i* const A2, - const __m128i* const A3, - const __m128i* const mult, - uint8_t* const dst) { +static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, + const __m128i* const A1, + const __m128i* const A2, + const __m128i* const A3, + const __m128i* const mult, + uint8_t* const dst) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); const __m128i B0 = _mm_mul_epu32(*A0, *mult); @@ -210,7 +208,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0, const __m128i C3 = _mm_add_epi64(B3, rounder); const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX); const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX); -#if (WEBP_RESCALER_FIX < 32) +#if (WEBP_RESCALER_RFIX < 32) const __m128i D2 = _mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask); const __m128i D3 = @@ -226,7 +224,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0, _mm_storel_epi64((__m128i*)dst, G); } -static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { +static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -240,8 +238,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { if (wrk->y_accum == 0) { for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3; - LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3); - ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out); + LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3); + ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out); } for (; x_out < x_out_max; ++x_out) { const uint32_t J = frow[x_out]; @@ -257,8 +255,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3, B0, B1, B2, B3; - LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3); - LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3); + LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3); { const __m128i C0 = _mm_add_epi64(A0, B0); const __m128i C1 = _mm_add_epi64(A1, B1); @@ -272,7 +270,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX); const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX); const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX); - ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + 
x_out); + ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { @@ -286,7 +284,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { } } -static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { +static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -303,8 +301,8 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3, B0, B1, B2, B3; - LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3); - LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3); + LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3); { const __m128i C0 = _mm_add_epi64(B0, rounder); const __m128i C1 = _mm_add_epi64(B1, rounder); @@ -324,7 +322,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { const __m128i G1 = _mm_or_si128(D1, F3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0); _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1); - ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); + ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { @@ -340,10 +338,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { const __m128i zero = _mm_setzero_si128(); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3; - LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), zero); _mm_storeu_si128((__m128i*)(irow + x_out + 4), zero); - ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out); + ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out); } for (; x_out < x_out_max; ++x_out) { const int v = (int)MULT_FIX(irow[x_out], scale); @@ -362,10 +360,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { extern void WebPRescalerDspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) { - WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2; - WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2; - WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2; - WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2; + WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2; + WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2; + WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2; + WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/media/libwebp/dsp/upsampling.c b/media/libwebp/dsp/upsampling.c index 265e722c1..b76483a3a 100644 --- a/media/libwebp/dsp/upsampling.c +++ b/media/libwebp/dsp/upsampling.c @@ -11,8 +11,8 @@ // // Author: somnath@google.com (Somnath Banerjee) -#include "./dsp.h" -#include "./yuv.h" +#include "../dsp/dsp.h" +#include "../dsp/yuv.h" #include <assert.h> @@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \ const uint32_t uv1 = (diag_03 + t_uv) >> 1; \ FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (2 * x - 1) * XSTEP); \ + top_dst + (2 * x - 1) * (XSTEP)); \ FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \ - top_dst + (2 * x - 0) * XSTEP); \ + top_dst + (2 * x 
- 0) * (XSTEP)); \ } \ if (bottom_y != NULL) { \ const uint32_t uv0 = (diag_03 + l_uv) >> 1; \ const uint32_t uv1 = (diag_12 + uv) >> 1; \ FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (2 * x - 1) * XSTEP); \ + bottom_dst + (2 * x - 1) * (XSTEP)); \ FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \ - bottom_dst + (2 * x + 0) * XSTEP); \ + bottom_dst + (2 * x + 0) * (XSTEP)); \ } \ tl_uv = t_uv; \ l_uv = uv; \ @@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ { \ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (len - 1) * XSTEP); \ + top_dst + (len - 1) * (XSTEP)); \ } \ if (bottom_y != NULL) { \ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (len - 1) * XSTEP); \ + bottom_dst + (len - 1) * (XSTEP)); \ } \ } \ } // All variants implemented. -UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3) -UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3) -UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4) -UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4) -UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4) -UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2) -UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2) +#if !WEBP_NEON_OMIT_C_CODE +UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4) +UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4) +#if !defined(WEBP_REDUCE_CSP) +UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4) +UPSAMPLE_FUNC(UpsampleRgbLinePair_C, VP8YuvToRgb, 3) +UPSAMPLE_FUNC(UpsampleBgrLinePair_C, VP8YuvToBgr, 3) +UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2) +UPSAMPLE_FUNC(UpsampleRgb565LinePair_C, VP8YuvToRgb565, 2) +#else +static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y, + const uint8_t* top_u, const uint8_t* top_v, + const uint8_t* cur_u, const uint8_t* cur_v, + uint8_t* top_dst, uint8_t* bottom_dst, int len) { + (void)top_y; + (void)bottom_y; + (void)top_u; + (void)top_v; + (void)cur_u; + (void)cur_v; + (void)top_dst; + (void)bottom_dst; + (void)len; + assert(0); // COLORSPACE SUPPORT NOT COMPILED +} +#define UpsampleArgbLinePair_C EmptyUpsampleFunc +#define UpsampleRgbLinePair_C EmptyUpsampleFunc +#define UpsampleBgrLinePair_C EmptyUpsampleFunc +#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc +#define UpsampleRgb565LinePair_C EmptyUpsampleFunc +#endif // WEBP_REDUCE_CSP + +#endif #undef LOAD_UV #undef UPSAMPLE_FUNC @@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb) WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) { WebPInitUpsamplers(); - VP8YUVInit(); #ifdef FANCY_UPSAMPLING return WebPUpsamplers[alpha_is_last ? 
MODE_BGRA : MODE_ARGB]; #else @@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ uint8_t* dst, int len) { \ int i; \ - for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ + for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \ } -YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3) -YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3) -YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4) -YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4) -YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4) -YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2) -YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2) +YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4) +YUV444_FUNC(WebPYuv444ToBgra_C, VP8YuvToBgra, 4) +#if !defined(WEBP_REDUCE_CSP) +YUV444_FUNC(WebPYuv444ToRgb_C, VP8YuvToRgb, 3) +YUV444_FUNC(WebPYuv444ToBgr_C, VP8YuvToBgr, 3) +YUV444_FUNC(WebPYuv444ToArgb_C, VP8YuvToArgb, 4) +YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2) +YUV444_FUNC(WebPYuv444ToRgb565_C, VP8YuvToRgb565, 2) +#else +static void EmptyYuv444Func(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { + (void)y; + (void)u; + (void)v; + (void)dst; + (void)len; +} +#define WebPYuv444ToRgb_C EmptyYuv444Func +#define WebPYuv444ToBgr_C EmptyYuv444Func +#define WebPYuv444ToArgb_C EmptyYuv444Func +#define WebPYuv444ToRgba4444_C EmptyYuv444Func +#define WebPYuv444ToRgb565_C EmptyYuv444Func +#endif // WEBP_REDUCE_CSP #undef YUV444_FUNC @@ -175,24 +217,20 @@ WebPYUV444Converter WebPYUV444Converters[MODE_LAST]; extern void WebPInitYUV444ConvertersMIPSdspR2(void); extern void WebPInitYUV444ConvertersSSE2(void); - -static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 = - (VP8CPUInfo)&upsampling_last_cpuinfo_used1; - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) { - if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return; - - WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC; - WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC; - WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC; - WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC; - WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC; - WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C; - WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C; - WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC; - WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC; - WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC; - WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C; +extern void WebPInitYUV444ConvertersSSE41(void); + +WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) { + WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C; + WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C; + WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C; + WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgr_C; + WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgb_C; + WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C; + WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565_C; + WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgba_C; + WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgra_C; + WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgb_C; + WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C; if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -200,41 +238,43 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) { WebPInitYUV444ConvertersSSE2(); } #endif +#if 
defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitYUV444ConvertersSSE41(); + } +#endif #if defined(WEBP_USE_MIPS_DSP_R2) if (VP8GetCPUInfo(kMIPSdspR2)) { WebPInitYUV444ConvertersMIPSdspR2(); } #endif } - upsampling_last_cpuinfo_used1 = VP8GetCPUInfo; } //------------------------------------------------------------------------------ // Main calls extern void WebPInitUpsamplersSSE2(void); +extern void WebPInitUpsamplersSSE41(void); extern void WebPInitUpsamplersNEON(void); extern void WebPInitUpsamplersMIPSdspR2(void); extern void WebPInitUpsamplersMSA(void); -static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 = - (VP8CPUInfo)&upsampling_last_cpuinfo_used2; - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) { - if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) { #ifdef FANCY_UPSAMPLING - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; - WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; - WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; - WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; - WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; +#if !WEBP_NEON_OMIT_C_CODE + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C; + WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_C; + WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_C; + WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_C; + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_C; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_C; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_C; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_C; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_C; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C; +#endif // If defined, use CPUInfo() to overwrite some pointers with faster versions. 
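The WEBP_DSP_INIT_FUNC change above replaces the per-file volatile last_cpuinfo_used guards with a shared once-only init macro: install the portable C entries first, then let the CPU probe overwrite them with faster versions. A minimal sketch of that idiom, with hypothetical names (RowFunc, InitRowFuncs, HasSIMD are ours, not libwebp's):

typedef void (*RowFunc)(const unsigned char* src, unsigned char* dst, int len);

static void Row_C(const unsigned char* src, unsigned char* dst, int len) {
  int i;
  for (i = 0; i < len; ++i) dst[i] = src[i];  // portable fallback
}

static void Row_SIMD(const unsigned char* src, unsigned char* dst, int len) {
  Row_C(src, dst, len);                       // stand-in for a SIMD body
}

static int HasSIMD(void) { return 1; }        // stand-in for VP8GetCPUInfo
static RowFunc g_row;

void InitRowFuncs(void) {
  static int init_done = 0;                   // run the table setup only once
  if (init_done) return;
  g_row = Row_C;                              // always-valid default
  if (HasSIMD()) g_row = Row_SIMD;            // overwrite with faster version
  init_done = 1;
}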
if (VP8GetCPUInfo != NULL) { @@ -243,9 +283,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) { WebPInitUpsamplersSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - WebPInitUpsamplersNEON(); +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitUpsamplersSSE41(); } #endif #if defined(WEBP_USE_MIPS_DSP_R2) @@ -259,8 +299,29 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) { } #endif } + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + WebPInitUpsamplersNEON(); + } +#endif + + assert(WebPUpsamplers[MODE_RGBA] != NULL); + assert(WebPUpsamplers[MODE_BGRA] != NULL); + assert(WebPUpsamplers[MODE_rgbA] != NULL); + assert(WebPUpsamplers[MODE_bgrA] != NULL); +#if !defined(WEBP_REDUCE_CSP) || !WEBP_NEON_OMIT_C_CODE + assert(WebPUpsamplers[MODE_RGB] != NULL); + assert(WebPUpsamplers[MODE_BGR] != NULL); + assert(WebPUpsamplers[MODE_ARGB] != NULL); + assert(WebPUpsamplers[MODE_RGBA_4444] != NULL); + assert(WebPUpsamplers[MODE_RGB_565] != NULL); + assert(WebPUpsamplers[MODE_Argb] != NULL); + assert(WebPUpsamplers[MODE_rgbA_4444] != NULL); +#endif + #endif // FANCY_UPSAMPLING - upsampling_last_cpuinfo_used2 = VP8GetCPUInfo; } //------------------------------------------------------------------------------ diff --git a/media/libwebp/dsp/upsampling_neon.c b/media/libwebp/dsp/upsampling_neon.c index d371a834f..c847d70d4 100644 --- a/media/libwebp/dsp/upsampling_neon.c +++ b/media/libwebp/dsp/upsampling_neon.c @@ -12,15 +12,15 @@ // Author: mans@mansr.com (Mans Rullgard) // Based on SSE code by: somnath@google.com (Somnath Banerjee) -#include "./dsp.h" +#include "../dsp/dsp.h" #if defined(WEBP_USE_NEON) #include <assert.h> #include <arm_neon.h> #include <string.h> -#include "./neon.h" -#include "./yuv.h" +#include "../dsp/neon.h" +#include "../dsp/yuv.h" #ifdef FANCY_UPSAMPLING @@ -58,8 +58,8 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, - uint8_t *out) { +static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2, + uint8_t *out) { UPSAMPLE_16PIXELS(r1, r2, out); } @@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, /* replicate last byte */ \ memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \ memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \ - Upsample16Pixels(r1, r2, out); \ + Upsample16Pixels_NEON(r1, r2, out); \ } //----------------------------------------------------------------------------- @@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ } // NEON variants of the fancy upsampler. 
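WEBP_NEON_OMIT_C_CODE compiles the C rows out of builds where NEON is architecturally guaranteed, which is why the init function above must take the NEON branch even when VP8GetCPUInfo is NULL, and why the trailing asserts verify that every mode slot ended up filled. A compile-time sketch of the same pattern (OMIT_C_CODE, the table, and the stub functions are illustrative only):

#include <assert.h>
#include <stddef.h>

#if defined(__aarch64__)
#define OMIT_C_CODE 1            // NEON is always present on this target
#else
#define OMIT_C_CODE 0
#endif

typedef void (*Upsampler)(void);
static void Up_C(void) {}        // stands in for a C row function
static void Up_NEON(void) {}     // stands in for a NEON row function
static int CpuHasNeon(void) { return OMIT_C_CODE; }  // stand-in CPU probe

static Upsampler table[4];

void InitUpsamplers(void) {
  int i;
#if !OMIT_C_CODE
  for (i = 0; i < 4; ++i) table[i] = Up_C;       // portable defaults
#endif
  if (OMIT_C_CODE || CpuHasNeon()) {
    for (i = 0; i < 4; ++i) table[i] = Up_NEON;  // mandatory on this target
  }
  for (i = 0; i < 4; ++i) assert(table[i] != NULL);  // mirror the asserts above
}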
-NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3) -NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3) -NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4) -NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4) -NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4) -NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2) -NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2) +NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4) +NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4) +#if !defined(WEBP_REDUCE_CSP) +NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON, Rgb, 3) +NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON, Bgr, 3) +NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4) +NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2) +NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2) +#endif // WEBP_REDUCE_CSP //------------------------------------------------------------------------------ // Entry point @@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; extern void WebPInitUpsamplersNEON(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) { - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; - WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; - WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; - WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; - WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON; + WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON; + WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON; + WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON; +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_NEON; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_NEON; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON; +#endif // WEBP_REDUCE_CSP } #endif // FANCY_UPSAMPLING diff --git a/media/libwebp/dsp/upsampling_sse2.c b/media/libwebp/dsp/upsampling_sse2.c index b5b668900..fd23681ca 100644 --- a/media/libwebp/dsp/upsampling_sse2.c +++ b/media/libwebp/dsp/upsampling_sse2.c @@ -11,14 +11,14 @@ // // Author: somnath@google.com (Somnath Banerjee) -#include "./dsp.h" +#include "../dsp/dsp.h" #if defined(WEBP_USE_SSE2) #include <assert.h> #include <emmintrin.h> #include <string.h> -#include "./yuv.h" +#include "../dsp/yuv.h" #ifdef FANCY_UPSAMPLING @@ -83,13 +83,13 @@ GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \ \ /* pack the alternate pixels */ \ - PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \ - PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \ + PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \ + PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \ } // Turn the macro into a function for reducing code-size when non-critical -static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[], - uint8_t* const out) { +static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t 
r2[], + uint8_t* const out) { UPSAMPLE_32PIXELS(r1, r2, out); } @@ -101,30 +101,15 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[], memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \ memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \ /* using the shared function instead of the macro saves ~3k code size */ \ - Upsample32Pixels(r1, r2, out); \ -} - -#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \ - top_dst, bottom_dst, cur_x, num_pixels) { \ - int n; \ - for (n = 0; n < (num_pixels); ++n) { \ - FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \ - top_dst + ((cur_x) + n) * XSTEP); \ - } \ - if (bottom_y != NULL) { \ - for (n = 0; n < (num_pixels); ++n) { \ - FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \ - bottom_dst + ((cur_x) + n) * XSTEP); \ - } \ - } \ + Upsample32Pixels_SSE2(r1, r2, out); \ } #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \ top_dst, bottom_dst, cur_x) do { \ - FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \ - if (bottom_y != NULL) { \ - FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \ - bottom_dst + (cur_x) * XSTEP); \ + FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \ + if ((bottom_y) != NULL) { \ + FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64, \ + (bottom_dst) + (cur_x) * (XSTEP)); \ } \ } while (0) @@ -135,7 +120,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ int uv_pos, pos; \ /* 16byte-aligned array to cache reconstructed u and v */ \ - uint8_t uv_buf[4 * 32 + 15]; \ + uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ uint8_t* const r_v = r_u + 32; \ \ @@ -160,22 +145,36 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ } \ if (len > 1) { \ const int left_over = ((len + 1) >> 1) - (pos >> 1); \ + uint8_t* const tmp_top_dst = r_u + 4 * 32; \ + uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \ + uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \ + uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \ assert(left_over > 0); \ UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \ UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \ - CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \ - pos, len - pos); \ + memcpy(tmp_top, top_y + pos, len - pos); \ + if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \ + CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \ + tmp_bottom_dst, 0); \ + memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \ + if (bottom_y != NULL) { \ + memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \ + (len - pos) * (XSTEP)); \ + } \ } \ } // SSE2 variants of the fancy upsampler. 
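The functional change in the hunk above: the old CONVERT2RGB scalar loop over the last few pixels is gone. Instead, the row tail is copied into scratch space carved out of the enlarged uv_buf, converted with the same 32-pixel SIMD kernel, and only the valid prefix is copied back out. A standalone scalar sketch of that tail strategy (BLOCK, ConvertBlock, ConvertRow are illustrative names, not the library's):

#include <string.h>

#define BLOCK 32

static void ConvertBlock(const unsigned char* in, unsigned char* out) {
  int i;                                   // stand-in for the 32-pixel SIMD kernel
  for (i = 0; i < BLOCK; ++i) out[i] = in[i];
}

static void ConvertRow(const unsigned char* in, unsigned char* out, int len) {
  unsigned char tmp_in[BLOCK] = { 0 }, tmp_out[BLOCK];
  int pos = 0;
  for (; pos + BLOCK <= len; pos += BLOCK) ConvertBlock(in + pos, out + pos);
  if (pos < len) {                         // tail: run a full block over scratch
    memcpy(tmp_in, in + pos, (size_t)(len - pos));
    ConvertBlock(tmp_in, tmp_out);
    memcpy(out + pos, tmp_out, (size_t)(len - pos));
  }
}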
-SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3) -SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3) -SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4) -SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4) -SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4) -SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2) -SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2) +SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4) +SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4) + +#if !defined(WEBP_REDUCE_CSP) +SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2, VP8YuvToRgb, 3) +SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2, VP8YuvToBgr, 3) +SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4) +SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2) +SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2) +#endif // WEBP_REDUCE_CSP #undef GET_M #undef PACK_AND_STORE @@ -193,17 +192,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; extern void WebPInitUpsamplersSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) { - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; - WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; - WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; - WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; - WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2; + WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2; + WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2; + WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2; +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE2; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE2; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2; +#endif // WEBP_REDUCE_CSP } #endif // FANCY_UPSAMPLING @@ -213,29 +214,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) { extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; extern void WebPInitYUV444ConvertersSSE2(void); -#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \ -extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \ - const uint8_t* v, uint8_t* dst, int len); \ +#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ +extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len); \ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ uint8_t* dst, int len) { \ int i; \ const int max_len = len & ~31; \ - for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\ + for (i = 0; i < max_len; i += 32) { \ + CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \ + } \ if (i < len) { /* C-fallback */ \ - WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \ + CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \ } \ } -YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4); 
-YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4); -YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3); -YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3); +YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4); +YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4); +#if !defined(WEBP_REDUCE_CSP) +YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3); +YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3); +YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4) +YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \ + WebPYuv444ToRgba4444_C, 2) +YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2) +#endif // WEBP_REDUCE_CSP WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) { - WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba; - WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra; - WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb; - WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr; + WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba_SSE2; + WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra_SSE2; + WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba_SSE2; + WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra_SSE2; +#if !defined(WEBP_REDUCE_CSP) + WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE2; + WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE2; + WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb_SSE2; + WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2; + WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565_SSE2; + WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb_SSE2; + WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2; +#endif // WEBP_REDUCE_CSP } #else diff --git a/media/libwebp/dsp/upsampling_sse41.c b/media/libwebp/dsp/upsampling_sse41.c new file mode 100644 index 000000000..65d175e87 --- /dev/null +++ b/media/libwebp/dsp/upsampling_sse41.c @@ -0,0 +1,239 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE41 version of YUV to RGB upsampling functions. +// +// Author: somnath@google.com (Somnath Banerjee) + +#include "../dsp/dsp.h" + +#if defined(WEBP_USE_SSE41) + +#include <assert.h> +#include <smmintrin.h> +#include <string.h> +#include "../dsp/yuv.h" + +#ifdef FANCY_UPSAMPLING + +#if !defined(WEBP_REDUCE_CSP) + +// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows +// u = (9*a + 3*b + 3*c + d + 8) / 16 +// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2 +// = (a + m + 1) / 2 +// where m = (a + 3*b + 3*c + d) / 8 +// = ((a + b + c + d) / 2 + b + c) / 4 +// +// Let's say k = (a + b + c + d) / 4. 
+// We can compute k as +// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1 +// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2 +// +// Then m can be written as +// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 + +// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 +#define GET_M(ij, in, out) do { \ + const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \ + const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \ + const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \ + const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\ + const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \ + (out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \ +} while (0) + +// pack and store two alternating pixel rows +#define PACK_AND_STORE(a, b, da, db, out) do { \ + const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \ + const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \ + const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \ + const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \ + _mm_store_si128(((__m128i*)(out)) + 0, t_1); \ + _mm_store_si128(((__m128i*)(out)) + 1, t_2); \ +} while (0) + +// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. +#define UPSAMPLE_32PIXELS(r1, r2, out) { \ + const __m128i one = _mm_set1_epi8(1); \ + const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \ + const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \ + const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \ + const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \ + \ + const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \ + const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \ + const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \ + \ + const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \ + const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \ + \ + const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \ + const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \ + const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \ + const __m128i t4 = _mm_avg_epu8(s, t); \ + const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \ + __m128i diag1, diag2; \ + \ + GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \ + GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \ + \ + /* pack the alternate pixels */ \ + PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \ + PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \ +} + +// Turn the macro into a function for reducing code-size when non-critical +static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], + uint8_t* const out) { + UPSAMPLE_32PIXELS(r1, r2, out); +} + +#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \ + uint8_t r1[17], r2[17]; \ + memcpy(r1, (tb), (num_pixels)); \ + memcpy(r2, (bb), (num_pixels)); \ + /* replicate last byte */ \ + memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \ + memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \ + /* using the shared function instead of the macro saves ~3k code size */ \ + Upsample32Pixels_SSE41(r1, r2, out); \ +} + +#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \ + top_dst, bottom_dst, cur_x) do { \ + FUNC##32_SSE41((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \ + if ((bottom_y) != NULL) { \ + 
FUNC##32_SSE41((bottom_y) + (cur_x), r_u + 64, r_v + 64, \ + (bottom_dst) + (cur_x) * (XSTEP)); \ + } \ +} while (0) + +#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ + const uint8_t* top_u, const uint8_t* top_v, \ + const uint8_t* cur_u, const uint8_t* cur_v, \ + uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ + int uv_pos, pos; \ + /* 16byte-aligned array to cache reconstructed u and v */ \ + uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ + uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + uint8_t* const r_v = r_u + 32; \ + \ + assert(top_y != NULL); \ + { /* Treat the first pixel in regular way */ \ + const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ + const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ + const int u0_t = (top_u[0] + u_diag) >> 1; \ + const int v0_t = (top_v[0] + v_diag) >> 1; \ + FUNC(top_y[0], u0_t, v0_t, top_dst); \ + if (bottom_y != NULL) { \ + const int u0_b = (cur_u[0] + u_diag) >> 1; \ + const int v0_b = (cur_v[0] + v_diag) >> 1; \ + FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \ + } \ + } \ + /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \ + for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \ + UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \ + UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \ + CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \ + } \ + if (len > 1) { \ + const int left_over = ((len + 1) >> 1) - (pos >> 1); \ + uint8_t* const tmp_top_dst = r_u + 4 * 32; \ + uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \ + uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \ + uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \ + assert(left_over > 0); \ + UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \ + UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \ + memcpy(tmp_top, top_y + pos, len - pos); \ + if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \ + CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \ + tmp_bottom_dst, 0); \ + memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \ + if (bottom_y != NULL) { \ + memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \ + (len - pos) * (XSTEP)); \ + } \ + } \ +} + +// SSE4 variants of the fancy upsampler. 
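The derivation in the comment block above reduces the 4-tap average to saturating byte averages plus a one-bit correction, which is exactly what UPSAMPLE_32PIXELS and GET_M evaluate with _mm_avg_epu8 and logical operations. The identity for k can be checked in scalar code; this standalone sketch (names ours) sweeps a and d exhaustively and samples b and c:

#include <assert.h>

static unsigned Avg(unsigned x, unsigned y) { return (x + y + 1) >> 1; }

static unsigned KViaAvg(unsigned a, unsigned b, unsigned c, unsigned d) {
  const unsigned s = Avg(a, d), t = Avg(b, c);   // rounded pair averages
  return Avg(s, t) - (((a ^ d) | (b ^ c) | (s ^ t)) & 1);  // LSB correction
}

int main(void) {
  unsigned a, b, c, d;
  for (a = 0; a < 256; ++a)
    for (b = 0; b < 256; b += 17)
      for (c = 0; c < 256; c += 13)
        for (d = 0; d < 256; ++d)
          assert(KViaAvg(a, b, c, d) == (a + b + c + d) / 4);
  return 0;
}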
+SSE4_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE41, VP8YuvToRgb, 3) +SSE4_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE41, VP8YuvToBgr, 3) + +#undef GET_M +#undef PACK_AND_STORE +#undef UPSAMPLE_32PIXELS +#undef UPSAMPLE_LAST_BLOCK +#undef CONVERT2RGB +#undef CONVERT2RGB_32 +#undef SSE4_UPSAMPLE_FUNC + +#endif // WEBP_REDUCE_CSP + +//------------------------------------------------------------------------------ +// Entry point + +extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; + +extern void WebPInitUpsamplersSSE41(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE41(void) { +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE41; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE41; +#endif // WEBP_REDUCE_CSP +} + +#endif // FANCY_UPSAMPLING + +//------------------------------------------------------------------------------ + +extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; +extern void WebPInitYUV444ConvertersSSE41(void); + +#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ +extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len); \ +static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len) { \ + int i; \ + const int max_len = len & ~31; \ + for (i = 0; i < max_len; i += 32) { \ + CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \ + } \ + if (i < len) { /* C-fallback */ \ + CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \ + } \ +} + +#if !defined(WEBP_REDUCE_CSP) +YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3); +YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3); +#endif // WEBP_REDUCE_CSP + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) { +#if !defined(WEBP_REDUCE_CSP) + WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE41; + WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE41; +#endif // WEBP_REDUCE_CSP +} + +#else + +WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE41) + +#endif // WEBP_USE_SSE41 + +#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE41)) +WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE41) +#endif diff --git a/media/libwebp/dsp/yuv.c b/media/libwebp/dsp/yuv.c index dd7d9dedf..12c04ca42 100644 --- a/media/libwebp/dsp/yuv.c +++ b/media/libwebp/dsp/yuv.c @@ -11,63 +11,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./yuv.h" +#include "../dsp/yuv.h" +#include <assert.h> #include <stdlib.h> -#if defined(WEBP_YUV_USE_TABLE) - -static int done = 0; - -static WEBP_INLINE uint8_t clip(int v, int max_value) { - return v < 0 ? 0 : v > max_value ? 
max_value : v; -} - -int16_t VP8kVToR[256], VP8kUToB[256]; -int32_t VP8kVToG[256], VP8kUToG[256]; -uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; -uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN]; - -WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) { - int i; - if (done) { - return; - } -#ifndef USE_YUVj - for (i = 0; i < 256; ++i) { - VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX; - VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF; - VP8kVToG[i] = -45773 * (i - 128); - VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX; - } - for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) { - const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX; - VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255); - VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15); - } -#else - for (i = 0; i < 256; ++i) { - VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX; - VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF; - VP8kVToG[i] = -46802 * (i - 128); - VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX; - } - for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) { - const int k = i; - VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255); - VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15); - } -#endif - - done = 1; -} - -#else - -WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {} - -#endif // WEBP_YUV_USE_TABLE - //----------------------------------------------------------------------------- // Plain-C version @@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {} static void FUNC_NAME(const uint8_t* y, \ const uint8_t* u, const uint8_t* v, \ uint8_t* dst, int len) { \ - const uint8_t* const end = dst + (len & ~1) * XSTEP; \ + const uint8_t* const end = dst + (len & ~1) * (XSTEP); \ while (dst != end) { \ FUNC(y[0], u[0], v[0], dst); \ - FUNC(y[1], u[0], v[0], dst + XSTEP); \ + FUNC(y[1], u[0], v[0], dst + (XSTEP)); \ y += 2; \ ++u; \ ++v; \ - dst += 2 * XSTEP; \ + dst += 2 * (XSTEP); \ } \ if (len & 1) { \ FUNC(y[0], u[0], v[0], dst); \ @@ -123,15 +71,11 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, WebPSamplerRowFunc WebPSamplers[MODE_LAST]; extern void WebPInitSamplersSSE2(void); +extern void WebPInitSamplersSSE41(void); extern void WebPInitSamplersMIPS32(void); extern void WebPInitSamplersMIPSdspR2(void); -static volatile VP8CPUInfo yuv_last_cpuinfo_used = - (VP8CPUInfo)&yuv_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) { - if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(WebPInitSamplers) { WebPSamplers[MODE_RGB] = YuvToRgbRow; WebPSamplers[MODE_RGBA] = YuvToRgbaRow; WebPSamplers[MODE_BGR] = YuvToBgrRow; @@ -151,6 +95,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) { WebPInitSamplersSSE2(); } #endif // WEBP_USE_SSE2 +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitSamplersSSE41(); + } +#endif // WEBP_USE_SSE41 #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { WebPInitSamplersMIPS32(); @@ -162,13 +111,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) { } #endif // WEBP_USE_MIPS_DSP_R2 } - yuv_last_cpuinfo_used = VP8GetCPUInfo; } //----------------------------------------------------------------------------- // ARGB -> YUV converters -static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) { int i; for (i = 0; i < width; ++i) { const uint32_t p = argb[i]; @@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, 
//----------------------------------------------------------------------------- -static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) { int i; for (i = 0; i < width; ++i, rgb += 3) { y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) { int i; for (i = 0; i < width; ++i, bgr += 3) { y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); @@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, //----------------------------------------------------------------------------- +#if !WEBP_NEON_OMIT_C_CODE #define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic static uint16_t clip_y(int v) { return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; @@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len, out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); } } +#endif // !WEBP_NEON_OMIT_C_CODE #undef MAX_Y @@ -304,26 +254,26 @@ void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src, void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out); -static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used = - (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used; - extern void WebPInitConvertARGBToYUVSSE2(void); +extern void WebPInitConvertARGBToYUVSSE41(void); +extern void WebPInitConvertARGBToYUVNEON(void); extern void WebPInitSharpYUVSSE2(void); +extern void WebPInitSharpYUVNEON(void); -WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) { - if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return; - - WebPConvertARGBToY = ConvertARGBToY; +WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { + WebPConvertARGBToY = ConvertARGBToY_C; WebPConvertARGBToUV = WebPConvertARGBToUV_C; - WebPConvertRGB24ToY = ConvertRGB24ToY; - WebPConvertBGR24ToY = ConvertBGR24ToY; + WebPConvertRGB24ToY = ConvertRGB24ToY_C; + WebPConvertBGR24ToY = ConvertBGR24ToY_C; WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C; +#if !WEBP_NEON_OMIT_C_CODE WebPSharpYUVUpdateY = SharpYUVUpdateY_C; WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C; WebPSharpYUVFilterRow = SharpYUVFilterRow_C; +#endif if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -332,6 +282,27 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) { WebPInitSharpYUVSSE2(); } #endif // WEBP_USE_SSE2 +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitConvertARGBToYUVSSE41(); + } +#endif // WEBP_USE_SSE41 + } + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + WebPInitConvertARGBToYUVNEON(); + WebPInitSharpYUVNEON(); } - rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo; +#endif // WEBP_USE_NEON + + assert(WebPConvertARGBToY != NULL); + assert(WebPConvertARGBToUV != NULL); + assert(WebPConvertRGB24ToY != NULL); + assert(WebPConvertBGR24ToY != NULL); + assert(WebPConvertRGBA32ToUV != NULL); + assert(WebPSharpYUVUpdateY != NULL); + assert(WebPSharpYUVUpdateRGB != NULL); + assert(WebPSharpYUVFilterRow != NULL); } diff --git a/media/libwebp/dsp/yuv.h b/media/libwebp/dsp/yuv.h index 1d33b5863..b4c5d0b6c 100644 --- a/media/libwebp/dsp/yuv.h +++ b/media/libwebp/dsp/yuv.h @@ -35,19 +35,9 @@ #ifndef WEBP_DSP_YUV_H_ #define WEBP_DSP_YUV_H_ -#include "./dsp.h" +#include "../dsp/dsp.h" #include "../dec/vp8_dec.h" -#if 
defined(WEBP_EXPERIMENTAL_FEATURES) -// Do NOT activate this feature for real compression. This is only experimental! -// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace. -// This colorspace is close to Rec.601's Y'CbCr model with the notable -// difference of allowing larger range for luma/chroma. -// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its -// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion -// #define USE_YUVj -#endif - //------------------------------------------------------------------------------ // YUV -> RGB conversion @@ -58,12 +48,8 @@ extern "C" { enum { YUV_FIX = 16, // fixed-point precision for RGB->YUV YUV_HALF = 1 << (YUV_FIX - 1), - YUV_MASK = (256 << YUV_FIX) - 1, - YUV_RANGE_MIN = -227, // min value of r/g/b output - YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output YUV_FIX2 = 6, // fixed-point precision for YUV->RGB - YUV_HALF2 = 1 << YUV_FIX2 >> 1, YUV_MASK2 = (256 << YUV_FIX2) - 1 }; @@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v, const int b = VP8YUVToB(y, u); // 5 usable bits const int rg = (r & 0xf8) | (g >> 5); const int gb = ((g << 3) & 0xe0) | (b >> 3); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) rgb[0] = gb; rgb[1] = rg; #else @@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v, const int b = VP8YUVToB(y, u); // 4 usable bits const int rg = (r & 0xf0) | (g >> 4); const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) argb[0] = ba; argb[1] = rg; #else @@ -157,32 +143,42 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, rgba[3] = 0xff; } -// Must be called before everything, to initialize the tables. -void VP8YUVInit(void); - //----------------------------------------------------------------------------- // SSE2 extra functions (mostly for upsampling_sse2.c) #if defined(WEBP_USE_SSE2) // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. 
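VP8YuvToRgb565 above packs 8-bit R/G/B into two bytes: five bits of red plus the top three of green in one byte, the low three of green plus five of blue in the other, with WEBP_SWAP_16BIT_CSP selecting the byte order. A worked standalone example of that packing (PackRgb565 is our name):

#include <stdio.h>

static void PackRgb565(int r, int g, int b, unsigned char out[2], int swap) {
  const int rg = (r & 0xf8) | (g >> 5);          // rrrrrggg
  const int gb = ((g << 3) & 0xe0) | (b >> 3);   // gggbbbbb
  out[swap ? 1 : 0] = (unsigned char)rg;
  out[swap ? 0 : 1] = (unsigned char)gb;
}

int main(void) {
  unsigned char px[2];
  PackRgb565(255, 128, 0, px, 0);                // an orange-ish pixel
  printf("%02x %02x\n", px[0], px[1]);           // prints "fc 00", i.e. 0xfc00
  return 0;
}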
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v, +void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst); -void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); +void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, + const uint8_t* v, uint8_t* dst); +void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); #endif // WEBP_USE_SSE2 +//----------------------------------------------------------------------------- +// SSE41 extra functions (mostly for upsampling_sse41.c) + +#if defined(WEBP_USE_SSE41) + +// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. +void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); + +#endif // WEBP_USE_SSE41 + //------------------------------------------------------------------------------ // RGB -> YUV conversion @@ -192,8 +188,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) { return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255; } -#ifndef USE_YUVj - static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) { const int luma = 16839 * r + 33059 * g + 6420 * b; return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip @@ -209,28 +203,6 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) { return VP8ClipUV(v, rounding); } -#else - -// This JPEG-YUV colorspace, only for comparison! -// These are also 16bit precision coefficients from Rec.601, but with full -// [0..255] output range. 
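For the Rec.601 path that remains after this removal (coefficients 16839, 33059, 6420 in 16-bit fixed point, plus the 16 << YUV_FIX video-range offset), the arithmetic is easy to verify by hand: white maps to 235 and black to 16, the studio-swing limits. A standalone check, reproducing VP8RGBToY with rounding = YUV_HALF:

#include <assert.h>

static int RGBToY(int r, int g, int b) {
  const int luma = 16839 * r + 33059 * g + 6420 * b;
  return (luma + (1 << 15) + (16 << 16)) >> 16;  // no clipping needed
}

int main(void) {
  assert(RGBToY(255, 255, 255) == 235);  // video-range white
  assert(RGBToY(0, 0, 0) == 16);         // video-range black
  return 0;
}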
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) { - const int luma = 19595 * r + 38470 * g + 7471 * b; - return (luma + rounding) >> YUV_FIX; // no need to clip -} - -static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) { - const int u = -11058 * r - 21710 * g + 32768 * b; - return VP8ClipUV(u, rounding); -} - -static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) { - const int v = 32768 * r - 27439 * g - 5329 * b; - return VP8ClipUV(v, rounding); -} - -#endif // USE_YUVj - #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libwebp/dsp/yuv_sse2.c b/media/libwebp/dsp/yuv_sse2.c index e33c2bbaf..755662a05 100644 --- a/media/libwebp/dsp/yuv_sse2.c +++ b/media/libwebp/dsp/yuv_sse2.c @@ -11,11 +11,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./yuv.h" +#include "../dsp/yuv.h" #if defined(WEBP_USE_SSE2) -#include "./common_sse2.h" +#include "../dsp/common_sse2.h" #include <stdlib.h> #include <emmintrin.h> @@ -26,12 +26,12 @@ // R = (19077 * y + 26149 * v - 14234) >> 6 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 // B = (19077 * y + 33050 * u - 17685) >> 6 -static void ConvertYUV444ToRGB(const __m128i* const Y0, - const __m128i* const U0, - const __m128i* const V0, - __m128i* const R, - __m128i* const G, - __m128i* const B) { +static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0, + const __m128i* const U0, + const __m128i* const V0, + __m128i* const R, + __m128i* const G, + __m128i* const B) { const __m128i k19077 = _mm_set1_epi16(19077); const __m128i k26149 = _mm_set1_epi16(26149); const __m128i k14234 = _mm_set1_epi16(14234); @@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0, } // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
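Load_HI_16 stores each byte as x << 8 so that an unsigned high multiply (_mm_mulhi_epu16) directly yields (x * c) >> 16 of the shifted value, i.e. (x * c) >> 8: an 8.8 fixed-point product with no separate shift. A scalar check of that equivalence (MulHiU16 is our scalar model of one SSE2 lane):

#include <assert.h>
#include <stdint.h>

static uint16_t MulHiU16(uint16_t a, uint16_t b) {  // scalar _mm_mulhi_epu16
  return (uint16_t)(((uint32_t)a * b) >> 16);
}

int main(void) {
  uint32_t x, c;
  for (x = 0; x < 256; ++x) {
    for (c = 0; c < 65536; c += 251) {              // sampled multiplier values
      assert(MulHiU16((uint16_t)(x << 8), (uint16_t)c) == ((x * c) >> 8));
    }
  }
  return 0;
}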
-static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) { +static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src)); } // Load and replicate the U/V samples -static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) { +static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); @@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) { } // Convert 32 samples of YUV444 to R/G/B -static void YUV444ToRGB(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, - __m128i* const R, __m128i* const G, __m128i* const B) { - const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v); - ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); +static void YUV444ToRGB_SSE2(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, + __m128i* const B) { + const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u), + V0 = Load_HI_16_SSE2(v); + ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B); } // Convert 32 samples of YUV420 to R/G/B -static void YUV420ToRGB(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, - __m128i* const R, __m128i* const G, __m128i* const B) { - const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v); - ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); +static void YUV420ToRGB_SSE2(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, + __m128i* const B) { + const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u), + V0 = Load_UV_HI_8_SSE2(v); + ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B); } // Pack R/G/B/A results into 32b output. -static WEBP_INLINE void PackAndStore4(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - const __m128i* const A, - uint8_t* const dst) { +static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + const __m128i* const A, + uint8_t* const dst) { const __m128i rb = _mm_packus_epi16(*R, *B); const __m128i ga = _mm_packus_epi16(*G, *A); const __m128i rg = _mm_unpacklo_epi8(rb, ga); @@ -114,12 +118,12 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R, } // Pack R/G/B/A results into 16b output. -static WEBP_INLINE void PackAndStore4444(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - const __m128i* const A, - uint8_t* const dst) { -#if !defined(WEBP_SWAP_16BIT_CSP) +static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + const __m128i* const A, + uint8_t* const dst) { +#if (WEBP_SWAP_16BIT_CSP == 0) const __m128i rg0 = _mm_packus_epi16(*R, *G); const __m128i ba0 = _mm_packus_epi16(*B, *A); #else @@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R, } // Pack R/G/B results into 16b output. 
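PackAndStore4 above assembles RGBA pixels with two rounds of interleaving: packus produces rb = R0..R7,B0..B7 and ga = G0..G7,A0..A7, the byte unpacks then give R,G pairs and B,A pairs, and a final 16-bit unpack splices the pairs into R,G,B,A quads. The same permutation in scalar form, over 8 model pixels (all names ours):

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint8_t R[8], G[8], B[8], A[8], rb[16], ga[16], rg[16], ba[16], out[32];
  int i;
  for (i = 0; i < 8; ++i) { R[i] = i; G[i] = 64 + i; B[i] = 128 + i; A[i] = 255; }
  for (i = 0; i < 8; ++i) { rb[i] = R[i]; rb[8 + i] = B[i]; }   // packus(R, B)
  for (i = 0; i < 8; ++i) { ga[i] = G[i]; ga[8 + i] = A[i]; }   // packus(G, A)
  for (i = 0; i < 8; ++i) { rg[2 * i] = rb[i]; rg[2 * i + 1] = ga[i]; }          // unpacklo_epi8
  for (i = 0; i < 8; ++i) { ba[2 * i] = rb[8 + i]; ba[2 * i + 1] = ga[8 + i]; }  // unpackhi_epi8
  for (i = 0; i < 8; ++i) {          // unpack 16-bit pairs: splice (R,G) with (B,A)
    out[4 * i + 0] = rg[2 * i];
    out[4 * i + 1] = rg[2 * i + 1];
    out[4 * i + 2] = ba[2 * i];
    out[4 * i + 3] = ba[2 * i + 1];
  }
  for (i = 0; i < 8; ++i) {
    assert(out[4 * i + 0] == R[i] && out[4 * i + 1] == G[i]);
    assert(out[4 * i + 2] == B[i] && out[4 * i + 3] == A[i]);
  }
  return 0;
}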
-static WEBP_INLINE void PackAndStore565(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - uint8_t* const dst) { +static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + uint8_t* const dst) { const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i b0 = _mm_packus_epi16(*B, *B); @@ -149,7 +153,7 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R, const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); const __m128i rg = _mm_or_si128(r1, g1); const __m128i gb = _mm_or_si128(g2, b1); -#if !defined(WEBP_SWAP_16BIT_CSP) +#if (WEBP_SWAP_16BIT_CSP == 0) const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); #else const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); @@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R, // Pack the planar buffers // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... -static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, - __m128i* const in2, __m128i* const in3, - __m128i* const in4, __m128i* const in5, - uint8_t* const rgb) { +static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1, + __m128i* const in2, __m128i* const in3, + __m128i* const in4, __m128i* const in5, + uint8_t* const rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. // To pack, we will keep taking one every two 8b integer and move it @@ -176,7 +180,7 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, // Repeat the same permutations twice more: // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 - VP8PlanarTo24b(in0, in1, in2, in3, in4, in5); + VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5); _mm_storeu_si128((__m128i*)(rgb + 0), *in0); _mm_storeu_si128((__m128i*)(rgb + 16), *in1); @@ -186,69 +190,69 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, _mm_storeu_si128((__m128i*)(rgb + 80), *in5); } -void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore4(&R, &G, &B, &kAlpha, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst); } } -void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore4(&B, &G, &R, &kAlpha, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst); } } -void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { __m128i R, G, B; - YUV444ToRGB(y 
+ n, u + n, v + n, &R, &G, &B); - PackAndStore4(&kAlpha, &R, &G, &B, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst); } } -void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, + const uint8_t* v, uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 16) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore4444(&R, &G, &B, &kAlpha, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst); } } -void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { int n; for (n = 0; n < 32; n += 8, dst += 16) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore565(&R, &G, &B, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore565_SSE2(&R, &G, &B, dst); } } -void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; - YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); - YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); - YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); + YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3); // Cast to 8b and store as RRRRGGGGBBBB. rgb0 = _mm_packus_epi16(R0, R1); @@ -259,18 +263,18 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, rgb5 = _mm_packus_epi16(B2, B3); // Pack as RGBRGBRGBRGB. - PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); + PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); } -void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; - YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); - YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); - YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); + YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3); // Cast to 8b and store as BBBBGGGGRRRR. bgr0 = _mm_packus_epi16(B0, B1); @@ -281,20 +285,21 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, bgr5= _mm_packus_epi16(R2, R3); // Pack as BGRBGRBGRBGR. 
- PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); + PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); } //----------------------------------------------------------------------------- // Arbitrary-length row conversion functions -static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbaRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { __m128i R, G, B; - YUV420ToRGB(y, u, v, &R, &G, &B); - PackAndStore4(&R, &G, &B, &kAlpha, dst); + YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); + PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst); y += 8; u += 4; v += 4; @@ -308,14 +313,15 @@ static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgraRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { __m128i R, G, B; - YUV420ToRGB(y, u, v, &R, &G, &B); - PackAndStore4(&B, &G, &R, &kAlpha, dst); + YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); + PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst); y += 8; u += 4; v += 4; @@ -329,14 +335,15 @@ static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToArgbRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { __m128i R, G, B; - YUV420ToRGB(y, u, v, &R, &G, &B); - PackAndStore4(&kAlpha, &R, &G, &B, dst); + YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); + PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst); y += 8; u += 4; v += 4; @@ -350,17 +357,18 @@ static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; - YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); - YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); - YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); + YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3); // Cast to 8b and store as RRRRGGGGBBBB. rgb0 = _mm_packus_epi16(R0, R1); @@ -371,7 +379,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, rgb5 = _mm_packus_epi16(B2, B3); // Pack as RGBRGBRGBRGB. 
- PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); + PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); y += 32; u += 16; @@ -386,17 +394,18 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgrRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; - YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); - YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); - YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); + YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3); // Cast to 8b and store as BBBBGGGGRRRR. bgr0 = _mm_packus_epi16(B0, B1); @@ -407,7 +416,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, bgr5 = _mm_packus_epi16(R2, R3); // Pack as BGRBGRBGRBGR. - PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); + PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); y += 32; u += 16; @@ -428,11 +437,11 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, extern void WebPInitSamplersSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) { - WebPSamplers[MODE_RGB] = YuvToRgbRow; - WebPSamplers[MODE_RGBA] = YuvToRgbaRow; - WebPSamplers[MODE_BGR] = YuvToBgrRow; - WebPSamplers[MODE_BGRA] = YuvToBgraRow; - WebPSamplers[MODE_ARGB] = YuvToArgbRow; + WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2; + WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2; + WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2; + WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2; + WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2; } //------------------------------------------------------------------------------ @@ -445,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) { // Function that inserts a value of the second half of the in buffer in between // every two char of the first half. -static WEBP_INLINE void RGB24PackedToPlanarHelper( +static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2( const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) { out[0] = _mm_unpacklo_epi8(in[0], in[3]); out[1] = _mm_unpackhi_epi8(in[0], in[3]); @@ -458,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper( // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // Similar to PlanarTo24bHelper(), but in reverse order. 
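Viewed across all six registers at once, each RGB24PackedToPlanarHelper call above is one perfect riffle of 96 bytes: the first three registers are interleaved byte-by-byte with the last three. Five riffles in a row convert packed rgbrgb... into planar rr..gg..bb.., which this scalar model (names ours) verifies:

#include <assert.h>
#include <string.h>

static void Riffle(unsigned char a[96]) {
  unsigned char b[96];
  int i;
  for (i = 0; i < 48; ++i) {
    b[2 * i + 0] = a[i];        // low half -> even slots
    b[2 * i + 1] = a[48 + i];   // high half -> odd slots
  }
  memcpy(a, b, 96);
}

int main(void) {
  unsigned char buf[96];
  int j, c, k;
  for (j = 0; j < 32; ++j)      // packed layout: r0 g0 b0 r1 g1 b1 ...
    for (c = 0; c < 3; ++c) buf[3 * j + c] = (unsigned char)(32 * c + j);
  for (k = 0; k < 5; ++k) Riffle(buf);
  for (k = 0; k < 96; ++k) assert(buf[k] == k);  // planar: r0..r31 g0..g31 b0..b31
  return 0;
}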
-static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, - __m128i* const out /*out[6]*/) { +static WEBP_INLINE void RGB24PackedToPlanar_SSE2( + const uint8_t* const rgb, __m128i* const out /*out[6]*/) { __m128i tmp[6]; tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); @@ -468,22 +477,22 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64)); tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80)); - RGB24PackedToPlanarHelper(tmp, out); - RGB24PackedToPlanarHelper(out, tmp); - RGB24PackedToPlanarHelper(tmp, out); - RGB24PackedToPlanarHelper(out, tmp); - RGB24PackedToPlanarHelper(tmp, out); + RGB24PackedToPlanarHelper_SSE2(tmp, out); + RGB24PackedToPlanarHelper_SSE2(out, tmp); + RGB24PackedToPlanarHelper_SSE2(tmp, out); + RGB24PackedToPlanarHelper_SSE2(out, tmp); + RGB24PackedToPlanarHelper_SSE2(tmp, out); } // Convert 8 packed ARGB to r[], g[], b[] -static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, - __m128i* const rgb /*in[6]*/) { +static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb, + __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); __m128i a0 = LOAD_16(argb + 0); __m128i a1 = LOAD_16(argb + 4); __m128i a2 = LOAD_16(argb + 8); __m128i a3 = LOAD_16(argb + 12); - VP8L32bToPlanar(&a0, &a1, &a2, &a3); + VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3); rgb[0] = _mm_unpacklo_epi8(a1, zero); rgb[1] = _mm_unpackhi_epi8(a1, zero); rgb[2] = _mm_unpacklo_epi8(a2, zero); @@ -511,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, } while (0) #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) -static WEBP_INLINE void ConvertRGBToY(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - __m128i* const Y) { +static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const Y) { const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); const __m128i kGB_y = MK_CST_16(16384, 6420); const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); @@ -526,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R, TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y); } -static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - __m128i* const U, __m128i* const V) { +static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const U, + __m128i* const V) { const __m128i kRG_u = MK_CST_16(-9719, -19081); const __m128i kGB_u = MK_CST_16(0, 28800); const __m128i kRG_v = MK_CST_16(28800, 0); @@ -549,14 +559,14 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; rgb += 3 * 16 * 2) { __m128i rgb_plane[6]; int j; - RGB24PackedToPlanar(rgb, rgb_plane); + RGB24PackedToPlanar_SSE2(rgb, rgb_plane); for (j = 0; j < 2; ++j, i += 16) { const __m128i zero = _mm_setzero_si128(); @@ -566,13 +576,13 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); g = 
      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y0);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);

      // Convert to 16-bit Y.
      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y1);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);

      // Cast to 8-bit and store.
      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -583,14 +593,14 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
  }
}

-static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
  const int max_width = width & ~31;
  int i;
  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
    __m128i bgr_plane[6];
    int j;

-    RGB24PackedToPlanar(bgr, bgr_plane);
+    RGB24PackedToPlanar_SSE2(bgr, bgr_plane);

    for (j = 0; j < 2; ++j, i += 16) {
      const __m128i zero = _mm_setzero_si128();
@@ -600,13 +610,13 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y0);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);

      // Convert to 16-bit Y.
      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y1);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);

      // Cast to 8-bit and store.
      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -617,14 +627,14 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
  }
}

-static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
  const int max_width = width & ~15;
  int i;
  for (i = 0; i < max_width; i += 16) {
    __m128i Y0, Y1, rgb[6];
-    RGB32PackedToPlanar(&argb[i], rgb);
-    ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
-    ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
+    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+    ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
+    ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
    STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
  }
  for (; i < width; ++i) {  // left-over
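All of these converters share one shape: `width & ~31` (or `& ~15`) rounds the width down to a multiple of the vector batch so the SIMD loop never over-reads, and a scalar "left-over" loop finishes the row. A quick self-check of the masking idiom (assuming non-negative widths):

#include <assert.h>

int main(void) {
  int width;
  for (width = 0; width < 4096; ++width) {
    /* width & ~31 is the largest multiple of 32 not exceeding width. */
    assert((width & ~31) == 32 * (width / 32));
    assert((width & ~15) == 16 * (width / 16));
  }
  return 0;
}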
@@ -636,31 +646,33 @@
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
-static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
-                              __m128i* const out) {
+static void HorizontalAddPack_SSE2(const __m128i* const A,
+                                   const __m128i* const B,
+                                   __m128i* const out) {
  const __m128i k2 = _mm_set1_epi16(2);
  const __m128i C = _mm_madd_epi16(*A, k2);
  const __m128i D = _mm_madd_epi16(*B, k2);
  *out = _mm_packs_epi32(C, D);
}

-static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
-                            int src_width, int do_store) {
+static void ConvertARGBToUV_SSE2(const uint32_t* argb,
+                                 uint8_t* u, uint8_t* v,
+                                 int src_width, int do_store) {
  const int max_width = src_width & ~31;
  int i;
  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
    __m128i rgb[6], U0, V0, U1, V1;
-    RGB32PackedToPlanar(&argb[i], rgb);
-    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
-    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
-    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
-    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
-
-    RGB32PackedToPlanar(&argb[i + 16], rgb);
-    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
-    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
-    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
-    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
+    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+    RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
+    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);

    U0 = _mm_packus_epi16(U0, U1);
    V0 = _mm_packus_epi16(V0, V1);
@@ -679,10 +691,9 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
}

// Convert 16 packed ARGB 16b-values to r[], g[], b[]
-static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
-                                                 __m128i* const r,
-                                                 __m128i* const g,
-                                                 __m128i* const b) {
+static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
+    const uint16_t* const rgbx,
+    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx + 0);   // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx + 8);   // r2 | g2 | b2 |x| r3 | g3 | b3 |x
  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
@@ -701,16 +712,16 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
  *b = _mm_unpacklo_epi64(B1, B3);
}

-static void ConvertRGBA32ToUV(const uint16_t* rgb,
-                              uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
+                                   uint8_t* u, uint8_t* v, int width) {
  const int max_width = width & ~15;
  const uint16_t* const last_rgb = rgb + 4 * max_width;
  while (rgb < last_rgb) {
    __m128i r, g, b, U0, V0, U1, V1;
-    RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b);
-    ConvertRGBToUV(&r, &g, &b, &U0, &V0);
-    RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
-    ConvertRGBToUV(&r, &g, &b, &U1, &V1);
+    RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
+    ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
+    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
+    ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
    STORE_16(_mm_packus_epi16(U0, U1), u);
    STORE_16(_mm_packus_epi16(V0, V1), v);
    u += 16;
@@ -727,13 +738,13 @@ static void ConvertRGBA32ToUV(const uint16_t* rgb,
extern void WebPInitConvertARGBToYUVSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
-  WebPConvertARGBToY = ConvertARGBToY;
-  WebPConvertARGBToUV = ConvertARGBToUV;
+  WebPConvertARGBToY = ConvertARGBToY_SSE2;
+  WebPConvertARGBToUV = ConvertARGBToUV_SSE2;

-  WebPConvertRGB24ToY = ConvertRGB24ToY;
-  WebPConvertBGR24ToY = ConvertBGR24ToY;
+  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;

-  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
}

//------------------------------------------------------------------------------
diff --git a/media/libwebp/dsp/yuv_sse41.c b/media/libwebp/dsp/yuv_sse41.c
new file mode 100644
index 000000000..95ab7ad14
--- /dev/null
+++ b/media/libwebp/dsp/yuv_sse41.c
@@ -0,0 +1,613 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/yuv.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "../dsp/common_sse41.h"
+#include <stdlib.h>
+#include <smmintrin.h>
+
+//-----------------------------------------------------------------------------
+// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
+
+// These constants are a 14b fixed-point version of the ITU-R BT.601 constants.
+// R = (19077 * y + 26149 * v - 14234) >> 6
+// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
+// B = (19077 * y + 33050 * u - 17685) >> 6
+static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
+                                     const __m128i* const U0,
+                                     const __m128i* const V0,
+                                     __m128i* const R,
+                                     __m128i* const G,
+                                     __m128i* const B) {
+  const __m128i k19077 = _mm_set1_epi16(19077);
+  const __m128i k26149 = _mm_set1_epi16(26149);
+  const __m128i k14234 = _mm_set1_epi16(14234);
+  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
+  const __m128i k33050 = _mm_set1_epi16((short)33050);
+  const __m128i k17685 = _mm_set1_epi16(17685);
+  const __m128i k6419 = _mm_set1_epi16(6419);
+  const __m128i k13320 = _mm_set1_epi16(13320);
+  const __m128i k8708 = _mm_set1_epi16(8708);
+
+  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
+
+  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
+  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
+  const __m128i R2 = _mm_add_epi16(R1, R0);
+
+  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
+  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
+  const __m128i G2 = _mm_add_epi16(Y1, k8708);
+  const __m128i G3 = _mm_add_epi16(G0, G1);
+  const __m128i G4 = _mm_sub_epi16(G2, G3);
+
+  // be careful with the saturated *unsigned* arithmetic here!
+  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
+  const __m128i B1 = _mm_adds_epu16(B0, Y1);
+  const __m128i B2 = _mm_subs_epu16(B1, k17685);
+
+  // use logical shift for B2, which can be larger than 32767
+  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
+  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
+  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
+}
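For reference, the same 14-bit fixed-point math on plain scalars, handy for cross-checking one lane of the SSE4.1 output. This is a sketch: the `>> 8` terms stand in for the `<< 8` input positioning (Load_HI_16) combined with `_mm_mulhi_epu16`'s implicit `>> 16`:

#include <stdint.h>

static uint8_t Clip8(int v) {
  return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
}

static void YuvToRgbScalar(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* r, uint8_t* g, uint8_t* b) {
  const int luma = (19077 * y) >> 8;  /* ~1.164 * y, in 6-bit fixed point */
  *r = Clip8((luma + ((26149 * v) >> 8) - 14234) >> 6);
  *g = Clip8((luma - ((6419 * u) >> 8) - ((13320 * v) >> 8) + 8708) >> 6);
  *b = Clip8((luma + ((33050 * u) >> 8) - 17685) >> 6);
}

For example, y = u = v = 128 gives r ~= 130, matching 1.164 * (128 - 16) from the usual BT.601 full-range expansion.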
+
+// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
+static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
+}
+
+// Load and replicate the U/V samples
+static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
+  const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
+  return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
+}
+
+// Convert 32 samples of YUV444 to R/G/B
+static void YUV444ToRGB_SSE41(const uint8_t* const y,
+                              const uint8_t* const u,
+                              const uint8_t* const v,
+                              __m128i* const R, __m128i* const G,
+                              __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
+                V0 = Load_HI_16_SSE41(v);
+  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
+}
+
+// Convert 32 samples of YUV420 to R/G/B
+static void YUV420ToRGB_SSE41(const uint8_t* const y,
+                              const uint8_t* const u,
+                              const uint8_t* const v,
+                              __m128i* const R, __m128i* const G,
+                              __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
+                V0 = Load_UV_HI_8_SSE41(v);
+  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
+}
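Load_UV_HI_8_SSE41 is what makes the 4:2:0 path work: each U/V byte is duplicated so the chroma lanes line up with pairs of luma samples. The scalar equivalent of that nearest-neighbor (non-fancy) upsampling is simply the following sketch:

#include <stdint.h>

/* Expand one half-resolution chroma row to full width by replication. */
static void ReplicateChromaRow(const uint8_t* uv_half, uint8_t* uv_full,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uv_full[x] = uv_half[x >> 1];  /* pixels 2k and 2k+1 share one sample */
  }
}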
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void PlanarTo24b_SSE41(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5,
+    uint8_t* const rgb) {
+  // The input is 6 registers of sixteen 8b but for the sake of explanation,
+  // let's take 6 registers of four 8b values.
+  // To pack, we will keep taking one every two 8b integer and move it
+  // around as follows:
+  // Input:
+  //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
+  // Split the 6 registers in two sets of 3 registers: the first set as the even
+  // 8b bytes, the second the odd ones:
+  //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
+  // Repeat the same permutations twice more:
+  //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
+  //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
+  VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
+
+  _mm_storeu_si128((__m128i*)(rgb + 0), *in0);
+  _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
+  _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
+  _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
+  _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
+  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
+}
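The permutation chain described above is just a register-resident interleave; a scalar reference for the same packing is useful as a test oracle against the 96 bytes PlanarTo24b_SSE41 writes (a sketch):

#include <stdint.h>

/* Interleave three n-byte planes into n rgb triplets. */
static void PlanarTo24bScalar(const uint8_t* r, const uint8_t* g,
                              const uint8_t* b, uint8_t* rgb, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    rgb[3 * i + 0] = r[i];
    rgb[3 * i + 1] = g[i];
    rgb[3 * i + 2] = b[i];
  }
}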
+
+void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
+  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
+
+  YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+
+  // Cast to 8b and store as RRRRGGGGBBBB.
+  rgb0 = _mm_packus_epi16(R0, R1);
+  rgb1 = _mm_packus_epi16(R2, R3);
+  rgb2 = _mm_packus_epi16(G0, G1);
+  rgb3 = _mm_packus_epi16(G2, G3);
+  rgb4 = _mm_packus_epi16(B0, B1);
+  rgb5 = _mm_packus_epi16(B2, B3);
+
+  // Pack as RGBRGBRGBRGB.
+  PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+}
+
+void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
+  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
+
+  YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+
+  // Cast to 8b and store as BBBBGGGGRRRR.
+  bgr0 = _mm_packus_epi16(B0, B1);
+  bgr1 = _mm_packus_epi16(B2, B3);
+  bgr2 = _mm_packus_epi16(G0, G1);
+  bgr3 = _mm_packus_epi16(G2, G3);
+  bgr4 = _mm_packus_epi16(R0, R1);
+  bgr5 = _mm_packus_epi16(R2, R3);
+
+  // Pack as BGRBGRBGRBGR.
+  PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+}
+
+//-----------------------------------------------------------------------------
+// Arbitrary-length row conversion functions
+
+static void YuvToRgbRow_SSE41(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
+  int n;
+  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
+    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
+
+    YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+    // Cast to 8b and store as RRRRGGGGBBBB.
+    rgb0 = _mm_packus_epi16(R0, R1);
+    rgb1 = _mm_packus_epi16(R2, R3);
+    rgb2 = _mm_packus_epi16(G0, G1);
+    rgb3 = _mm_packus_epi16(G2, G3);
+    rgb4 = _mm_packus_epi16(B0, B1);
+    rgb5 = _mm_packus_epi16(B2, B3);
+
+    // Pack as RGBRGBRGBRGB.
+    PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+
+    y += 32;
+    u += 16;
+    v += 16;
+  }
+  for (; n < len; ++n) {   // Finish off
+    VP8YuvToRgb(y[0], u[0], v[0], dst);
+    dst += 3;
+    y += 1;
+    u += (n & 1);
+    v += (n & 1);
+  }
+}
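In the scalar tail, `u += (n & 1)` advances the chroma pointers only on odd n, which is exactly `u_row[n / 2]` indexing for the 4:2:0 layout. A tiny self-check of that stepping trick:

#include <assert.h>

int main(void) {
  int n, chroma_index = 0;
  for (n = 0; n < 1000; ++n) {
    assert(chroma_index == n / 2);  /* pointer tracks n/2 exactly */
    chroma_index += (n & 1);        /* the "u += (n & 1)" stepping */
  }
  return 0;
}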
+
+static void YuvToBgrRow_SSE41(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
+  int n;
+  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
+    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
+
+    YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+    // Cast to 8b and store as BBBBGGGGRRRR.
+    bgr0 = _mm_packus_epi16(B0, B1);
+    bgr1 = _mm_packus_epi16(B2, B3);
+    bgr2 = _mm_packus_epi16(G0, G1);
+    bgr3 = _mm_packus_epi16(G2, G3);
+    bgr4 = _mm_packus_epi16(R0, R1);
+    bgr5 = _mm_packus_epi16(R2, R3);
+
+    // Pack as BGRBGRBGRBGR.
+    PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+
+    y += 32;
+    u += 16;
+    v += 16;
+  }
+  for (; n < len; ++n) {   // Finish off
+    VP8YuvToBgr(y[0], u[0], v[0], dst);
+    dst += 3;
+    y += 1;
+    u += (n & 1);
+    v += (n & 1);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
+  WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE41;
+  WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE41;
+}
+
+//------------------------------------------------------------------------------
+// RGB24/32 -> YUV converters
+
+// Load eight 16b-words from *src.
+#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
+// Store eight 16b-words into *dst
+#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
+
+#define WEBP_SSE41_SHUFF(OUT) do {                    \
+  const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0);  \
+  const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1);  \
+  const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2);  \
+  const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0);  \
+  const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1);  \
+  const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2);  \
+                                                      \
+  /* OR everything to get one channel */              \
+  const __m128i tmp6 = _mm_or_si128(tmp0, tmp1);      \
+  const __m128i tmp7 = _mm_or_si128(tmp3, tmp4);      \
+  out[OUT + 0] = _mm_or_si128(tmp6, tmp2);            \
+  out[OUT + 1] = _mm_or_si128(tmp7, tmp5);            \
+} while (0);
+
+// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// Similar to PlanarTo24bHelper(), but in reverse order.
+static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
+    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+  const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
+  const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
+  const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
+  const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48));
+  const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64));
+  const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80));
+
+  // Compute RR.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(0)
+  }
+  // Compute GG.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(2)
+  }
+  // Compute BB.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(4)
+  }
+}
+
+#undef WEBP_SSE41_SHUFF
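The -1 entries in those masks rely on `_mm_shuffle_epi8` (PSHUFB) zeroing any destination byte whose control byte has its high bit set; three partial shuffles are then OR-ed into one full channel register. A standalone demo of the zeroing behavior (requires SSSE3):

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

int main(void) {
  const __m128i src = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                   7, 6, 5, 4, 3, 2, 1, 0);
  /* Gather bytes 0,3,6,9,12,15; the -1 lanes come out as zero. */
  const __m128i ctrl = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, 15, 12, 9, 6, 3, 0);
  uint8_t out[16];
  int i;
  _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(src, ctrl));
  for (i = 0; i < 16; ++i) printf("%d ", out[i]);  /* 0 3 6 9 12 15 0 ... */
  printf("\n");
  return 0;
}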
+
+// Convert 8 packed ARGB to r[], g[], b[]
+static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
+    const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i a0 = LOAD_16(argb + 0);
+  __m128i a1 = LOAD_16(argb + 4);
+  __m128i a2 = LOAD_16(argb + 8);
+  __m128i a3 = LOAD_16(argb + 12);
+  VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
+  rgb[0] = _mm_unpacklo_epi8(a1, zero);
+  rgb[1] = _mm_unpackhi_epi8(a1, zero);
+  rgb[2] = _mm_unpacklo_epi8(a2, zero);
+  rgb[3] = _mm_unpackhi_epi8(a2, zero);
+  rgb[4] = _mm_unpacklo_epi8(a3, zero);
+  rgb[5] = _mm_unpackhi_epi8(a3, zero);
+}
+
+// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
+// It's a macro and not a function because we need to use immediate values with
+// srai_epi32, e.g.
+#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,  \
+                  ROUNDER, DESCALE_FIX, OUT) do {                \
+  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);          \
+  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);          \
+  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);          \
+  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);          \
+  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);             \
+  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);             \
+  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);           \
+  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);           \
+  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);      \
+  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);      \
+  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                         \
+} while (0)
+
+#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
+
+static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
+                                            const __m128i* const G,
+                                            const __m128i* const B,
+                                            __m128i* const Y) {
+  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
+  const __m128i kGB_y = MK_CST_16(16384, 6420);
+  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
+
+  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
+  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
+  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
+}
+
+static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
+                                             const __m128i* const G,
+                                             const __m128i* const B,
+                                             __m128i* const U,
+                                             __m128i* const V) {
+  const __m128i kRG_u = MK_CST_16(-9719, -19081);
+  const __m128i kGB_u = MK_CST_16(0, 28800);
+  const __m128i kRG_v = MK_CST_16(28800, 0);
+  const __m128i kGB_v = MK_CST_16(-24116, -4684);
+  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
+
+  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
+  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
+  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
+            kHALF_UV, YUV_FIX + 2, *U);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
+            kHALF_UV, YUV_FIX + 2, *V);
+}
+
+#undef MK_CST_16
+#undef TRANSFORM
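Note the `33059 - 16384` split: 33059 exceeds INT16_MAX, so the green weight is spread across the two `_mm_madd_epi16` pairs and the missing 16384 reappears in kGB_y. Recombined on scalars, the luma transform is the usual BT.601 weighting, assuming YUV_FIX == 16 as in libwebp's yuv.h (a reference sketch):

#include <stdint.h>

static uint8_t RGBToYScalar(uint8_t r, uint8_t g, uint8_t b) {
  /* +16 black-level offset plus round-to-nearest, as in kHALF_Y.
     For 8-bit inputs the result stays in [16, 235], so no clip needed. */
  const int rounder = (16 << 16) + (1 << 15);
  return (uint8_t)((16839 * r + 33059 * g + 6420 * b + rounder) >> 16);
}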
+
+static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
+  const int max_width = width & ~31;
+  int i;
+  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
+    __m128i rgb_plane[6];
+    int j;
+
+    RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
+
+    for (j = 0; j < 2; ++j, i += 16) {
+      const __m128i zero = _mm_setzero_si128();
+      __m128i r, g, b, Y0, Y1;
+
+      // Convert to 16-bit Y.
+      r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
+      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
+      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
+
+      // Convert to 16-bit Y.
+      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
+      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
+      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
+
+      // Cast to 8-bit and store.
+      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+    }
+  }
+  for (; i < width; ++i, rgb += 3) {   // left-over
+    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+  }
+}
+
+static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
+  const int max_width = width & ~31;
+  int i;
+  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
+    __m128i bgr_plane[6];
+    int j;
+
+    RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
+
+    for (j = 0; j < 2; ++j, i += 16) {
+      const __m128i zero = _mm_setzero_si128();
+      __m128i r, g, b, Y0, Y1;
+
+      // Convert to 16-bit Y.
+      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
+      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
+      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
+
+      // Convert to 16-bit Y.
+      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
+      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
+      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
+
+      // Cast to 8-bit and store.
+      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+    }
+  }
+  for (; i < width; ++i, bgr += 3) {   // left-over
+    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+  }
+}
+
+static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
+  const int max_width = width & ~15;
+  int i;
+  for (i = 0; i < max_width; i += 16) {
+    __m128i Y0, Y1, rgb[6];
+    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
+    ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);
+    ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);
+    STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+  }
+  for (; i < width; ++i) {   // left-over
+    const uint32_t p = argb[i];
+    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
+                     YUV_HALF);
+  }
+}
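The UV path that follows downsamples 2x2 blocks in two stages: HorizontalAddPack yields 2*(a+b) per horizontal pair, ConvertRGBToUV descales by two extra bits (YUV_FIX + 2) so the doubled sum nets out to a two-pixel mean, and the second row is folded in with a rounding byte average when do_store == 0. A scalar model of both stages (a sketch):

#include <stdint.h>

/* Stage 1: mean of two horizontally adjacent samples.
   (2 * (a + b)) >> 2 == (a + b) / 2: the madd-by-2 doubling cancels
   against the two extra descale bits. */
static int HorizontalPairMean(int a, int b) {
  return (2 * (a + b)) >> 2;
}

/* Stage 2: fold in the previous row, rounding up like _mm_avg_epu8. */
static uint8_t AverageRows(uint8_t row0, uint8_t row1) {
  return (uint8_t)((row0 + row1 + 1) >> 1);
}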
+
+// Horizontal add (doubled) of two 16b values, result is 16b.
+// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
+static void HorizontalAddPack_SSE41(const __m128i* const A,
+                                    const __m128i* const B,
+                                    __m128i* const out) {
+  const __m128i k2 = _mm_set1_epi16(2);
+  const __m128i C = _mm_madd_epi16(*A, k2);
+  const __m128i D = _mm_madd_epi16(*B, k2);
+  *out = _mm_packs_epi32(C, D);
+}
+
+static void ConvertARGBToUV_SSE41(const uint32_t* argb,
+                                  uint8_t* u, uint8_t* v,
+                                  int src_width, int do_store) {
+  const int max_width = src_width & ~31;
+  int i;
+  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
+    __m128i rgb[6], U0, V0, U1, V1;
+    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
+    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+    RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
+    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
+
+    U0 = _mm_packus_epi16(U0, U1);
+    V0 = _mm_packus_epi16(V0, V1);
+    if (!do_store) {
+      const __m128i prev_u = LOAD_16(u);
+      const __m128i prev_v = LOAD_16(v);
+      U0 = _mm_avg_epu8(U0, prev_u);
+      V0 = _mm_avg_epu8(V0, prev_v);
+    }
+    STORE_16(U0, u);
+    STORE_16(V0, v);
+  }
+  if (i < src_width) {   // left-over
+    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+  }
+}
+
+// Convert 16 packed ARGB 16b-values to r[], g[], b[]
+static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
+    const uint16_t* const rgbx,
+    __m128i* const r, __m128i* const g, __m128i* const b) {
+  const __m128i in0 = LOAD_16(rgbx + 0);   // r0 | g0 | b0 |x| r1 | g1 | b1 |x
+  const __m128i in1 = LOAD_16(rgbx + 8);   // r2 | g2 | b2 |x| r3 | g3 | b3 |x
+  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
+  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
+  // aarrggbb as 16-bit.
+  const __m128i shuff0 =
+      _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+  const __m128i shuff1 =
+      _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
+  const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
+  const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
+  const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
+  const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
+  // R0R1G0G1
+  // B0B1****
+  // R2R3G2G3
+  // B2B3****
+  // (OR is used to free port 5 for the unpack)
+  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
+  const __m128i B1 = _mm_or_si128(A0, A1);
+  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
+  const __m128i B3 = _mm_or_si128(A2, A3);
+  // Gather the channels.
+  *r = _mm_unpacklo_epi64(B0, B2);
+  *g = _mm_unpackhi_epi64(B0, B2);
+  *b = _mm_unpackhi_epi64(B1, B3);
+}
+
+static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
+                                    uint8_t* u, uint8_t* v, int width) {
+  const int max_width = width & ~15;
+  const uint16_t* const last_rgb = rgb + 4 * max_width;
+  while (rgb < last_rgb) {
+    __m128i r, g, b, U0, V0, U1, V1;
+    RGBA32PackedToPlanar_16b_SSE41(rgb + 0, &r, &g, &b);
+    ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
+    RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);
+    ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
+    STORE_16(_mm_packus_epi16(U0, U1), u);
+    STORE_16(_mm_packus_epi16(V0, V1), v);
+    u += 16;
+    v += 16;
+    rgb += 2 * 32;
+  }
+  if (max_width < width) {   // left-over
+    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitConvertARGBToYUVSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
+  WebPConvertARGBToY = ConvertARGBToY_SSE41;
+  WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
+
+  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
+
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
+}
+
+//------------------------------------------------------------------------------
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
+
+#endif  // WEBP_USE_SSE41
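WEBP_DSP_INIT_STUB presumably expands to an empty function definition so callers can invoke WebPInitSamplersSSE41() unconditionally even in builds without SSE4.1; the C defaults then simply stay installed. A minimal model of that pattern (the macro body here is an assumption, not dsp.h's exact definition):

/* No-op fallback init: defining the same symbol keeps call sites
   unconditional when the SIMD translation unit is compiled out. */
#define DSP_INIT_STUB(func) \
  extern void func(void);   \
  void func(void) {}

DSP_INIT_STUB(MyInitSamplersSSE41)  /* hypothetical init symbol */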