diff options
Diffstat (limited to 'third_party/aom/av1/common/x86/selfguided_sse4.c')
-rw-r--r-- | third_party/aom/av1/common/x86/selfguided_sse4.c | 176 |
1 files changed, 95 insertions, 81 deletions
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index e2e4f51c3..4006b8518 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -3,6 +3,7 @@ #include "./aom_config.h" #include "./av1_rtcd.h" #include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" /* Calculate four consecutive entries of the intermediate A and B arrays (corresponding to the first loop in the C version of @@ -71,8 +72,8 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height, __m128i a, b, x, y, x2, y2; __m128i sum, sum_sq, tmp; - a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); + a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b)); tmp = _mm_unpacklo_epi16(a, b); @@ -81,7 +82,7 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[j], sum); _mm_store_si128((__m128i *)&A[j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -91,9 +92,9 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); y = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i + 2) * src_stride + j])); sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); @@ -106,7 +107,7 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -242,9 +243,9 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, __m128i a, b, c, c2, x, y, x2, y2; __m128i sum, sum_sq, tmp; - a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); - c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); + c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c)); // Important: Since c may be up to 2^8, the result on squaring may @@ -256,7 +257,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[j], sum); _mm_store_si128((__m128i *)&A[j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[3 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -264,7 +265,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[buf_stride + j], sum); _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -289,7 +290,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -298,7 +299,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -443,10 +444,10 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, __m128i a, b, c, d, x, y, x2, y2; __m128i sum, sum_sq, tmp, tmp2; - a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); - c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); - d = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); + a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); + c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); + d = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[3 * src_stride + j])); sum = _mm_cvtepi16_epi32( _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d))); @@ -458,7 +459,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[j], sum); _mm_store_si128((__m128i *)&A[j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -466,7 +467,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[buf_stride + j], sum); _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[5 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[5 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -474,7 +475,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum); _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[6 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[6 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -483,10 +484,8 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32( - _mm_cvtsi32_si128(*((int *)&src[(i - 3) * src_stride + j]))); - y = _mm_cvtepu8_epi32( - _mm_cvtsi32_si128(*((int *)&src[(i + 4) * src_stride + j]))); + x = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i - 3) * src_stride + j])); + y = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i + 4) * src_stride + j])); sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); @@ -499,7 +498,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 3) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -508,7 +507,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -517,7 +516,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -664,38 +663,48 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width, } void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, - int stride, int32_t *dst, int dst_stride, - int r, int eps, int32_t *tmpbuf) { - int32_t *A = tmpbuf; - int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int dgd_stride, int32_t *dst, + int dst_stride, int r, int eps) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; int i, j; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. - int buf_stride = ((width + 3) & ~3) + 16; + int buf_stride = ((width_ext + 3) & ~3) + 16; // Don't filter tiles with dimensions < 5 on any axis if ((width < 5) || (height < 5)) return; + uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ; if (r == 1) { - selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride); - selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8); + selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B, + buf_stride); + selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8); } else if (r == 2) { - selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride); - selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8); + selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B, + buf_stride); + selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8); } else if (r == 3) { - selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride); - selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8); + selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B, + buf_stride); + selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8); } else { assert(0); } + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; { i = 0; j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + @@ -707,7 +716,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + @@ -720,7 +729,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + @@ -735,7 +744,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -751,7 +760,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, // Vectorize the innermost loop for (j = 1; j < width - 1; j += 4) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; @@ -804,7 +813,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, // (typically have 2 such pixels, but may have anywhere between 0 and 3) for (; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; const int32_t a = @@ -826,7 +835,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -845,7 +854,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + @@ -857,7 +866,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + @@ -870,7 +879,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + @@ -1051,7 +1060,6 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -1059,12 +1067,10 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, sgr_params[eps].corner, sgr_params[eps].edge); #else av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width, - sgr_params[eps].r1, sgr_params[eps].e1, - tmpbuf2); + sgr_params[eps].r1, sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width, - sgr_params[eps].r2, sgr_params[eps].e2, - tmpbuf2); + sgr_params[eps].r2, sgr_params[eps].e2); decode_xq(xqd, xq); __m128i xq0 = _mm_set1_epi32(xq[0]); @@ -1364,43 +1370,52 @@ static void highbd_selfguided_restoration_3_v(uint16_t *src, int width, } void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, - int height, int stride, + int height, int dgd_stride, int32_t *dst, int dst_stride, - int bit_depth, int r, int eps, - int32_t *tmpbuf) { - int32_t *A = tmpbuf; - int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int bit_depth, int r, int eps) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; int i, j; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. - int buf_stride = ((width + 3) & ~3) + 16; + int buf_stride = ((width_ext + 3) & ~3) + 16; // Don't filter tiles with dimensions < 5 on any axis if ((width < 5) || (height < 5)) return; + uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ; if (r == 1) { - highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B, - buf_stride); - selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth); + highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, + A, B, buf_stride); + selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, + bit_depth); } else if (r == 2) { - highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B, - buf_stride); - selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth); + highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, + A, B, buf_stride); + selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, + bit_depth); } else if (r == 3) { - highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B, - buf_stride); - selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth); + highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, + A, B, buf_stride); + selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, + bit_depth); } else { assert(0); } + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; { i = 0; j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + @@ -1412,7 +1427,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + @@ -1425,7 +1440,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + @@ -1440,7 +1455,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -1456,7 +1471,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, // Vectorize the innermost loop for (j = 1; j < width - 1; j += 4) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; @@ -1509,7 +1524,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, // (typically have 2 such pixels, but may have anywhere between 0 and 3) for (; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; const int32_t a = @@ -1531,7 +1546,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -1550,7 +1565,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + @@ -1562,7 +1577,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + @@ -1575,7 +1590,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + @@ -1725,7 +1740,6 @@ void apply_selfguided_restoration_highbd_sse4_1( int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -1735,11 +1749,11 @@ void apply_selfguided_restoration_highbd_sse4_1( #else av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1, width, bit_depth, sgr_params[eps].r1, - sgr_params[eps].e1, tmpbuf2); + sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2, width, bit_depth, sgr_params[eps].r2, - sgr_params[eps].e2, tmpbuf2); + sgr_params[eps].e2); decode_xq(xqd, xq); __m128i xq0 = _mm_set1_epi32(xq[0]); |