/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #ifdef FREEBL_NO_DEPEND #include "stubs.h" #endif #include "gcm.h" #include "secerr.h" #include <wmmintrin.h> /* clmul */ #define WRITE64(x, bytes) \ (bytes)[0] = (x) >> 56; \ (bytes)[1] = (x) >> 48; \ (bytes)[2] = (x) >> 40; \ (bytes)[3] = (x) >> 32; \ (bytes)[4] = (x) >> 24; \ (bytes)[5] = (x) >> 16; \ (bytes)[6] = (x) >> 8; \ (bytes)[7] = (x); SECStatus gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf) { uint64_t tmp_out[2]; _mm_storeu_si128((__m128i *)tmp_out, ghash->x); /* maxout must be larger than 16 byte (checked by the caller). */ WRITE64(tmp_out[0], outbuf + 8); WRITE64(tmp_out[1], outbuf); return SECSuccess; } SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, unsigned int count) { size_t i; pre_align __m128i z_high post_align; pre_align __m128i z_low post_align; pre_align __m128i C post_align; pre_align __m128i D post_align; pre_align __m128i E post_align; pre_align __m128i F post_align; pre_align __m128i bin post_align; pre_align __m128i Ci post_align; pre_align __m128i tmp post_align; for (i = 0; i < count; i++, buf += 16) { bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], ((uint16_t)buf[2] << 8) | buf[3], ((uint16_t)buf[4] << 8) | buf[5], ((uint16_t)buf[6] << 8) | buf[7], ((uint16_t)buf[8] << 8) | buf[9], ((uint16_t)buf[10] << 8) | buf[11], ((uint16_t)buf[12] << 8) | buf[13], ((uint16_t)buf[14] << 8) | buf[15]); Ci = _mm_xor_si128(bin, ghash->x); /* Do binary mult ghash->X = Ci * ghash->H. */ C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); tmp = _mm_xor_si128(E, F); z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); z_high = _mm_unpackhi_epi64(z_high, D); z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); /* Shift one to the left (multiply by x) as gcm spec is stupid. */ C = _mm_slli_si128(z_low, 8); E = _mm_srli_epi64(C, 63); D = _mm_slli_si128(z_high, 8); F = _mm_srli_epi64(D, 63); /* Carry over */ C = _mm_srli_si128(z_low, 8); D = _mm_srli_epi64(C, 63); z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); /* Reduce */ C = _mm_slli_si128(z_low, 8); /* D = z_low << 127 */ D = _mm_slli_epi64(C, 63); /* E = z_low << 126 */ E = _mm_slli_epi64(C, 62); /* F = z_low << 121 */ F = _mm_slli_epi64(C, 57); /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); C = _mm_srli_si128(z_low, 8); /* D = z_low >> 1 */ D = _mm_slli_epi64(C, 63); D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); /* E = z_low >> 2 */ E = _mm_slli_epi64(C, 62); E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); /* F = z_low >> 7 */ F = _mm_slli_epi64(C, 57); F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ ghash->x = _mm_xor_si128(_mm_xor_si128( _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), F); } return SECSuccess; } SECStatus gcm_HashInit_hw(gcmHashContext *ghash) { ghash->ghash_mul = gcm_HashMult_hw; ghash->x = _mm_setzero_si128(); /* MSVC requires __m64 to load epi64. */ ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, ghash->h_low >> 32, (uint32_t)ghash->h_low); ghash->hw = PR_TRUE; return SECSuccess; } SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash) { ghash->x = _mm_setzero_si128(); return SECSuccess; }