Diffstat (limited to 'third_party/aom/av1/common/x86/selfguided_sse4.c')
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_sse4.c | 176
1 file changed, 95 insertions(+), 81 deletions(-)
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index e2e4f51c3..4006b8518 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -3,6 +3,7 @@
#include "./aom_config.h"
#include "./av1_rtcd.h"
#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
/* Calculate four consecutive entries of the intermediate A and B arrays
(corresponding to the first loop in the C version of
@@ -71,8 +72,8 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
__m128i a, b, x, y, x2, y2;
__m128i sum, sum_sq, tmp;
- a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
- b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
+ a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
tmp = _mm_unpacklo_epi16(a, b);
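For reference, the new include above pulls in xx_loadl_32 from aom_dsp/x86/synonyms.h. A minimal sketch of such a helper, assuming the usual memcpy-based definition (the exact body in synonyms.h may differ):

#include <string.h>      /* memcpy */
#include <smmintrin.h>   /* SSE4.1 */

/* Load 4 bytes into the low 32 bits of an XMM register. The memcpy
   sidesteps alignment and strict-aliasing issues and typically lowers
   to a single movd. */
static inline __m128i xx_loadl_32(const void *a) {
  int val;
  memcpy(&val, a, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

The point of the substitution: _mm_cvtepu8_epi32 consumes only the low 4 bytes of its operand, and these loops process four columns per iteration, so the previous 8-byte _mm_loadl_epi64 read 4 bytes beyond what was needed and could touch memory past the end of a row.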
@@ -81,7 +82,7 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[j], sum);
_mm_store_si128((__m128i *)&A[j], sum_sq);
- x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
@@ -91,9 +92,9 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
y = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i + 2) * src_stride + j]));
sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
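The x/y update above is the vertical box filter maintained as a sliding window: moving down one row adds the entering row y and subtracts the leaving row x. A scalar model of the same idea for one column, assuming r rows of valid context above and below (which is what this patch's border extension provides; the real code special-cases the first and last rows):

#include <stdint.h>

/* Running sums over a (2r+1)-row vertical window: B[i] holds the sum of
   pixels and A[i] the sum of squares for the window centred on row i. */
static void box_sums_column(const uint8_t *src, int src_stride, int height,
                            int r, int32_t *B, int32_t *A) {
  int32_t sum = 0, sum_sq = 0;
  for (int i = -r; i <= r; ++i) {  /* initial window around row 0 */
    const int32_t v = src[i * src_stride];
    sum += v;
    sum_sq += v * v;
  }
  B[0] = sum;
  A[0] = sum_sq;
  for (int i = 1; i < height; ++i) {
    const int32_t x = src[(i - 1 - r) * src_stride];  /* leaving row */
    const int32_t y = src[(i + r) * src_stride];      /* entering row */
    sum += y - x;
    sum_sq += y * y - x * x;
    B[i] = sum;
    A[i] = sum_sq;
  }
}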
@@ -106,7 +107,7 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
@@ -242,9 +243,9 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
__m128i a, b, c, c2, x, y, x2, y2;
__m128i sum, sum_sq, tmp;
- a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
- b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
- c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
+ c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
// Important: Since c may be up to 2^8, the result on squaring may
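The comment cut off above concerns overflow in the sum of squares: an 8-bit pixel squares to as much as 65025, so accumulating even two squares overflows a signed 16-bit lane and the arithmetic must widen to 32 bits. The _mm_unpacklo_epi16 visible in the radius-1 hunk suggests the standard trick, sketched here under that assumption:

#include <smmintrin.h>

/* Given two vectors whose low four 16-bit lanes hold zero-extended u8
   pixels from rows a and b, return a_i*a_i + b_i*b_i in each 32-bit lane.
   _mm_madd_epi16 multiplies 16-bit lanes and adds adjacent pairs, so
   interleaving places each a_i beside its b_i; since inputs are below
   2^8, each result lane stays well under 2^31. */
static inline __m128i sum_sq_two_rows(__m128i a, __m128i b) {
  const __m128i tmp = _mm_unpacklo_epi16(a, b);  /* a0 b0 a1 b1 ... */
  return _mm_madd_epi16(tmp, tmp);
}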
@@ -256,7 +257,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[j], sum);
_mm_store_si128((__m128i *)&A[j], sum_sq);
- x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+ x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[3 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
@@ -264,7 +265,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
- x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
@@ -289,7 +290,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
@@ -298,7 +299,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
x = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
@@ -443,10 +444,10 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
__m128i a, b, c, d, x, y, x2, y2;
__m128i sum, sum_sq, tmp, tmp2;
- a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
- b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
- c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
- d = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+ a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
+ c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
+ d = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[3 * src_stride + j]));
sum = _mm_cvtepi16_epi32(
_mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
@@ -458,7 +459,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[j], sum);
_mm_store_si128((__m128i *)&A[j], sum_sq);
- x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
@@ -466,7 +467,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
- x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
+ x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[5 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
@@ -474,7 +475,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
- x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
+ x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[6 * src_stride + j]));
sum = _mm_add_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_add_epi32(sum_sq, x2);
@@ -483,10 +484,8 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
- x = _mm_cvtepu8_epi32(
- _mm_cvtsi32_si128(*((int *)&src[(i - 3) * src_stride + j])));
- y = _mm_cvtepu8_epi32(
- _mm_cvtsi32_si128(*((int *)&src[(i + 4) * src_stride + j])));
+ x = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i - 3) * src_stride + j]));
+ y = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i + 4) * src_stride + j]));
sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
@@ -499,7 +498,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
x = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i - 3) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
@@ -508,7 +507,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
x = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
@@ -517,7 +516,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
_mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
x = _mm_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
sum = _mm_sub_epi32(sum, x);
x2 = _mm_mullo_epi32(x, x);
sum_sq = _mm_sub_epi32(sum_sq, x2);
@@ -664,38 +663,48 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
}
void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
- int stride, int32_t *dst, int dst_stride,
- int r, int eps, int32_t *tmpbuf) {
- int32_t *A = tmpbuf;
- int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int dgd_stride, int32_t *dst,
+ int dst_stride, int r, int eps) {
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
- int buf_stride = ((width + 3) & ~3) + 16;
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
+ uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
if (r == 1) {
- selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride);
- selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8);
+ selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+ buf_stride);
+ selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else if (r == 2) {
- selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride);
- selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8);
+ selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+ buf_stride);
+ selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else if (r == 3) {
- selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride);
- selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8);
+ selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+ buf_stride);
+ selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else {
assert(0);
}
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
{
i = 0;
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
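The rewritten entry point drops the caller-supplied tmpbuf: A and B now live on the stack, the filter runs over a region extended by SGRPROJ_BORDER_HORZ/SGRPROJ_BORDER_VERT pixels on each side, and the A/B pointers are then advanced so that index (0, 0) again corresponds to the first visible pixel. A sketch of that bookkeeping, with placeholder border values (the real constants live in av1/common/restoration.h); the highbd variant further down applies the same pattern:

#include <stddef.h>
#include <stdint.h>

enum { BORDER_HORZ = 3, BORDER_VERT = 3 };  /* assumed values, illustration only */

/* Back up from pixel (0,0) of the tile to the top-left of the bordered
   region, mirroring the dgd0 computation above. */
static const uint8_t *to_bordered_origin(const uint8_t *dgd, int stride) {
  return dgd - (ptrdiff_t)stride * BORDER_VERT - BORDER_HORZ;
}

/* After A/B are filled over the extended region, skip one border's worth
   of rows and columns so buf[i * buf_stride + j] aligns with
   dgd[i * stride + j], mirroring the A += / B += lines above. */
static int32_t *to_tile_origin(int32_t *buf, int buf_stride) {
  return buf + (ptrdiff_t)buf_stride * BORDER_VERT + BORDER_HORZ;
}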
@@ -707,7 +716,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
@@ -720,7 +729,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
@@ -735,7 +744,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -751,7 +760,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
// Vectorize the innermost loop
for (j = 1; j < width - 1; j += 4) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
@@ -804,7 +813,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
// (typically have 2 such pixels, but may have anywhere between 0 and 3)
for (; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
@@ -826,7 +835,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -845,7 +854,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
@@ -857,7 +866,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
@@ -870,7 +879,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
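In each of these edge and corner cases, a and b are weighted sums of A and B over whichever 3x3 neighbors exist, and nb records the log2 of the weight total (3 at corners and edges, 5 in the interior). The elided lines presumably finish each pixel the way the C reference in av1/common/restoration.c does; a hedged restatement, with the shift amounts passed in rather than asserted:

#include <stdint.h>

/* ROUND_POWER_OF_TWO as in aom_dsp/aom_dsp_common.h. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Presumed per-pixel output step: combine the weighted sums with the
   source pixel and renormalize by SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS
   (constant names assumed from restoration.h). */
static int32_t sgr_output_pixel(int32_t a, int32_t b, int dgd_pixel, int nb,
                                int sgr_bits, int rst_bits) {
  const int32_t v = a * dgd_pixel + b;
  return ROUND_POWER_OF_TWO(v, sgr_bits + nb - rst_bits);
}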
@@ -1051,7 +1060,6 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
@@ -1059,12 +1067,10 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
- sgr_params[eps].r1, sgr_params[eps].e1,
- tmpbuf2);
+ sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
- sgr_params[eps].r2, sgr_params[eps].e2,
- tmpbuf2);
+ sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
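With tmpbuf2 gone, tmpbuf only needs to hold the two filter outputs. A hypothetical call under the new signature (buffer sizing inferred from the assert above):

int32_t tmpbuf[2 * RESTORATION_TILEPELS_MAX];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
                                  sgr_params[eps].r1, sgr_params[eps].e1);
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
                                  sgr_params[eps].r2, sgr_params[eps].e2);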
@@ -1364,43 +1370,52 @@ static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
}
void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
- int height, int stride,
+ int height, int dgd_stride,
int32_t *dst, int dst_stride,
- int bit_depth, int r, int eps,
- int32_t *tmpbuf) {
- int32_t *A = tmpbuf;
- int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int bit_depth, int r, int eps) {
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
- int buf_stride = ((width + 3) & ~3) + 16;
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
+ uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
if (r == 1) {
- highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B,
- buf_stride);
- selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth);
+ highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride,
+ A, B, buf_stride);
+ selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps,
+ bit_depth);
} else if (r == 2) {
- highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B,
- buf_stride);
- selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth);
+ highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride,
+ A, B, buf_stride);
+ selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps,
+ bit_depth);
} else if (r == 3) {
- highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B,
- buf_stride);
- selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth);
+ highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride,
+ A, B, buf_stride);
+ selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps,
+ bit_depth);
} else {
assert(0);
}
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
{
i = 0;
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
@@ -1412,7 +1427,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
@@ -1425,7 +1440,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
@@ -1440,7 +1455,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -1456,7 +1471,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
// Vectorize the innermost loop
for (j = 1; j < width - 1; j += 4) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
@@ -1509,7 +1524,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
// (typically have 2 such pixels, but may have anywhere between 0 and 3)
for (; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
@@ -1531,7 +1546,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -1550,7 +1565,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
@@ -1562,7 +1577,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
@@ -1575,7 +1590,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
@@ -1725,7 +1740,6 @@ void apply_selfguided_restoration_highbd_sse4_1(
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
@@ -1735,11 +1749,11 @@ void apply_selfguided_restoration_highbd_sse4_1(
#else
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
width, bit_depth, sgr_params[eps].r1,
- sgr_params[eps].e1, tmpbuf2);
+ sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
width, bit_depth, sgr_params[eps].r2,
- sgr_params[eps].e2, tmpbuf2);
+ sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);