Diffstat (limited to 'gfx/skia/skia/src/opts/SkChecksum_opts.h')
-rw-r--r--    gfx/skia/skia/src/opts/SkChecksum_opts.h    216
1 file changed, 216 insertions, 0 deletions
diff --git a/gfx/skia/skia/src/opts/SkChecksum_opts.h b/gfx/skia/skia/src/opts/SkChecksum_opts.h
new file mode 100644
index 000000000..3e1acf08d
--- /dev/null
+++ b/gfx/skia/skia/src/opts/SkChecksum_opts.h
@@ -0,0 +1,216 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkChecksum_opts_DEFINED
+#define SkChecksum_opts_DEFINED
+
+#include "SkChecksum.h"
+#include "SkTypes.h"
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+    #include <immintrin.h>
+#elif defined(SK_CPU_ARM64) && defined(SK_ARM_HAS_CRC32)
+    #include <arm_acle.h>
+#endif
+
+namespace SK_OPTS_NS {
+
+template <typename T>
+static inline T unaligned_load(const uint8_t* src) {
+    T val;
+    memcpy(&val, src, sizeof(val));
+    return val;
+}
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
+    // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast.
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
+        auto data = (const uint8_t*)vdata;
+
+        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
+        uint64_t hash = seed;
+        if (bytes >= 24) {
+            // We'll create 3 independent hashes, each using _mm_crc32_u64()
+            // to hash 8 bytes per step. Both 3 and independent are important:
+            // we can execute 3 of these instructions in parallel on a single core.
+            uint64_t a = hash,
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/24;
+            while (steps --> 0) {
+                a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
+                b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
+                c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
+                data += 24;
+            }
+            bytes %= 24;
+            hash = a^b^c;
+        }
+
+        SkASSERT(bytes < 24);
+        if (bytes >= 16) {
+            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            bytes -= 8;
+            data += 8;
+        }
+
+        SkASSERT(bytes < 16);
+        if (bytes & 8) {
+            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            data += 8;
+        }
+
+        // The remainder of these _mm_crc32_u*() operate on a 32-bit register.
+        // We don't lose anything here: only the bottom 32-bits were populated.
+        auto hash32 = (uint32_t)hash;
+
+        if (bytes & 4) {
+            hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
+        }
+        return hash32;
+    }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+    // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+        auto data = (const uint8_t*)vdata;
+
+        if (bytes >= 12) {
+            // We'll create 3 independent hashes, each using _mm_crc32_u32()
+            // to hash 4 bytes per step. Both 3 and independent are important:
+            // we can execute 3 of these instructions in parallel on a single core.
+            uint32_t a = hash,
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/12;
+            while (steps --> 0) {
+                a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
+                b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
+                c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
+                data += 12;
+            }
+            bytes %= 12;
+            hash = a^b^c;
+        }
+
+        SkASSERT(bytes < 12);
+        if (bytes >= 8) {
+            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            bytes -= 4;
+            data += 4;
+        }
+
+        SkASSERT(bytes < 8);
+        if (bytes & 4) {
+            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
+        }
+        return hash;
+    }
+
+#elif defined(SK_CPU_ARM64) && defined(SK_ARM_HAS_CRC32)
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+        auto data = (const uint8_t*)vdata;
+        if (bytes >= 24) {
+            uint32_t a = hash,
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/24;
+            while (steps --> 0) {
+                a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));
+                b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
+                c = __crc32d(c, unaligned_load<uint64_t>(data+16));
+                data += 24;
+            }
+            bytes %= 24;
+            hash = a^b^c;
+        }
+
+        SkASSERT(bytes < 24);
+        if (bytes >= 16) {
+            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+            bytes -= 8;
+            data += 8;
+        }
+
+        SkASSERT(bytes < 16);
+        if (bytes & 8) {
+            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+            data += 8;
+        }
+        if (bytes & 4) {
+            hash = __crc32w(hash, unaligned_load<uint32_t>(data));
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash = __crc32h(hash, unaligned_load<uint16_t>(data));
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash = __crc32b(hash, unaligned_load<uint8_t>(data));
+        }
+        return hash;
+    }
+
+#else
+    // This is Murmur3.
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+        auto data = (const uint8_t*)vdata;
+
+        size_t original_bytes = bytes;
+
+        // Handle 4 bytes at a time while possible.
+        while (bytes >= 4) {
+            uint32_t k = unaligned_load<uint32_t>(data);
+            k *= 0xcc9e2d51;
+            k = (k << 15) | (k >> 17);
+            k *= 0x1b873593;
+
+            hash ^= k;
+            hash = (hash << 13) | (hash >> 19);
+            hash *= 5;
+            hash += 0xe6546b64;
+
+            bytes -= 4;
+            data += 4;
+        }
+
+        // Handle last 0-3 bytes.
+        uint32_t k = 0;
+        switch (bytes & 3) {
+            case 3: k ^= data[2] << 16;
+            case 2: k ^= data[1] << 8;
+            case 1: k ^= data[0] << 0;
+                    k *= 0xcc9e2d51;
+                    k = (k << 15) | (k >> 17);
+                    k *= 0x1b873593;
+                    hash ^= k;
+        }
+
+        hash ^= original_bytes;
+        return SkChecksum::Mix(hash);
+    }
+#endif
+
+}  // namespace SK_OPTS_NS
+
+#endif//SkChecksum_opts_DEFINED
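
A note on the hot loops above: splitting the input into three independent CRC streams is a throughput trick, not a hashing requirement. On common x86 cores, _mm_crc32_u64 has a latency of roughly 3 cycles but a throughput of one instruction per cycle (assumed figures; exact numbers vary by microarchitecture). A single dependent chain must wait out the full latency of each step, while three independent chains can keep the CRC unit busy every cycle:

    one chain:     8 bytes every ~3 cycles  ~ 2.7 bytes/cycle
    three chains: 24 bytes every ~3 cycles  =  8  bytes/cycle

The XOR merge at the end (hash = a^b^c) is also why the comment insists this is "not a CRC32": the result is not the CRC of the buffer, just a fast, well-mixed hash built from CRC instructions.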
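
The tail handling leans on the binary decomposition of the remaining length: after the main loop, bytes is below the block size (24 or 12), so testing bytes & 8, bytes & 4, bytes & 2, and bytes & 1 visits each power of two at most once and consumes the whole remainder. A worked example for the 64-bit path:

    13 bytes left = 0b1101 = 8 + 4 + 1  ->  one u64 step, one u32 step, one u8 step

The extra "bytes >= 16" branch exists because two 8-byte chunks can remain after the loop, and the bytes & 8 test can only account for one of them.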
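
The portable fallback is the reference 32-bit Murmur3 layout: the 0xcc9e2d51 / 0x1b873593 block mix, a rotate-and-scramble of the running hash, the length XOR, and a final avalanche step delegated to SkChecksum::Mix. For reference, the Murmur3 finalizer ("fmix32") looks like the sketch below; SkChecksum::Mix is expected to be this function or something equivalent, but SkChecksum.h is the authority:

static inline uint32_t fmix32(uint32_t h) {
    // Murmur3 reference finalizer: forces every input bit to affect the output.
    h ^= h >> 16;
    h *= 0x85ebca6b;
    h ^= h >> 13;
    h *= 0xc2b2ae35;
    h ^= h >> 16;
    return h;
}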
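
Finally, everything in this header lives in SK_OPTS_NS, which lets Skia compile the same source once per instruction set and pick an implementation at runtime. The real wiring is elsewhere in Skia (the SkOpts machinery, not shown in this diff); the sketch below only illustrates the general dispatch pattern, with illustrative names and a GCC/Clang-only CPU probe:

#include <cstddef>
#include <cstdint>

// One definition of hash_fn per build flavor (hypothetical namespaces;
// definitions would come from separately compiled translation units).
namespace portable { uint32_t hash_fn(const void*, size_t, uint32_t); }  // Murmur3 path
namespace sse42    { uint32_t hash_fn(const void*, size_t, uint32_t); }  // CRC32 path

// Start on the portable fallback; upgrade once at startup if the CPU allows.
static uint32_t (*g_hash_fn)(const void*, size_t, uint32_t) = portable::hash_fn;

void init_hash_dispatch() {
#if defined(__GNUC__) || defined(__clang__)
    if (__builtin_cpu_supports("sse4.2")) {
        g_hash_fn = sse42::hash_fn;
    }
#endif
}

uint32_t good_hash(const void* data, size_t bytes, uint32_t seed) {
    return g_hash_fn(data, bytes, seed);
}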