From 5f8de423f190bbb79a62f804151bc24824fa32d8 Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Fri, 2 Feb 2018 04:16:08 -0500 Subject: Add m-esr52 at 52.6.0 --- gfx/qcms/transform-sse2.c | 243 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 gfx/qcms/transform-sse2.c (limited to 'gfx/qcms/transform-sse2.c') diff --git a/gfx/qcms/transform-sse2.c b/gfx/qcms/transform-sse2.c new file mode 100644 index 000000000..dc9f495e7 --- /dev/null +++ b/gfx/qcms/transform-sse2.c @@ -0,0 +1,243 @@ +#include <emmintrin.h> + +#include "qcmsint.h" + +/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ +#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) +#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) +static const ALIGN float floatScaleX4[4] = + { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; +static const ALIGN float clampMaxValueX4[4] = + { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; + +void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, + unsigned char *src, + unsigned char *dest, + size_t length) +{ + unsigned int i; + float (*mat)[4] = transform->matrix; + char input_back[32]; + /* Ensure we have a buffer that's 16 byte aligned regardless of the original + * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) + * because they don't work on stack variables. gcc 4.4 does do the right thing + * on x86 but that's too new for us right now. 
For more info: gcc bug #16660 */ + float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); + /* share input and output locations to save having to keep the + * locations in separate registers */ + uint32_t const * output = (uint32_t*)input; + + /* deref *transform now to avoid it in loop */ + const float *igtbl_r = transform->input_gamma_table_r; + const float *igtbl_g = transform->input_gamma_table_g; + const float *igtbl_b = transform->input_gamma_table_b; + + /* deref *transform now to avoid it in loop */ + const uint8_t *otdata_r = &transform->output_table_r->data[0]; + const uint8_t *otdata_g = &transform->output_table_g->data[0]; + const uint8_t *otdata_b = &transform->output_table_b->data[0]; + + /* input matrix values never change */ + const __m128 mat0 = _mm_load_ps(mat[0]); + const __m128 mat1 = _mm_load_ps(mat[1]); + const __m128 mat2 = _mm_load_ps(mat[2]); + + /* these values don't change, either */ + const __m128 max = _mm_load_ps(clampMaxValueX4); + const __m128 min = _mm_setzero_ps(); + const __m128 scale = _mm_load_ps(floatScaleX4); + + /* working variables */ + __m128 vec_r, vec_g, vec_b, result; + + /* CYA */ + if (!length) + return; + + /* one pixel is handled outside of the loop */ + length--; + + /* setup for transforming 1st pixel */ + vec_r = _mm_load_ss(&igtbl_r[src[0]]); + vec_g = _mm_load_ss(&igtbl_g[src[1]]); + vec_b = _mm_load_ss(&igtbl_b[src[2]]); + src += 3; + + /* transform all but final pixel */ + + for (i=0; imatrix; + char input_back[32]; + /* Ensure we have a buffer that's 16 byte aligned regardless of the original + * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) + * because they don't work on stack variables. gcc 4.4 does do the right thing + * on x86 but that's too new for us right now. 
For more info: gcc bug #16660 */ + float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); + /* share input and output locations to save having to keep the + * locations in separate registers */ + uint32_t const * output = (uint32_t*)input; + + /* deref *transform now to avoid it in loop */ + const float *igtbl_r = transform->input_gamma_table_r; + const float *igtbl_g = transform->input_gamma_table_g; + const float *igtbl_b = transform->input_gamma_table_b; + + /* deref *transform now to avoid it in loop */ + const uint8_t *otdata_r = &transform->output_table_r->data[0]; + const uint8_t *otdata_g = &transform->output_table_g->data[0]; + const uint8_t *otdata_b = &transform->output_table_b->data[0]; + + /* input matrix values never change */ + const __m128 mat0 = _mm_load_ps(mat[0]); + const __m128 mat1 = _mm_load_ps(mat[1]); + const __m128 mat2 = _mm_load_ps(mat[2]); + + /* these values don't change, either */ + const __m128 max = _mm_load_ps(clampMaxValueX4); + const __m128 min = _mm_setzero_ps(); + const __m128 scale = _mm_load_ps(floatScaleX4); + + /* working variables */ + __m128 vec_r, vec_g, vec_b, result; + unsigned char alpha; + + /* CYA */ + if (!length) + return; + + /* one pixel is handled outside of the loop */ + length--; + + /* setup for transforming 1st pixel */ + vec_r = _mm_load_ss(&igtbl_r[src[0]]); + vec_g = _mm_load_ss(&igtbl_g[src[1]]); + vec_b = _mm_load_ss(&igtbl_b[src[2]]); + alpha = src[3]; + src += 4; + + /* transform all but final pixel */ + + for (i=0; i