diff options
Diffstat (limited to 'media')
-rw-r--r-- | media/libcubeb/src/cubeb.c | 6 | ||||
-rw-r--r-- | media/libcubeb/src/cubeb_sndio.c | 2 | ||||
-rw-r--r-- | media/libcubeb/src/cubeb_sun.c | 504 | ||||
-rw-r--r-- | media/libcubeb/src/moz.build | 8 | ||||
-rw-r--r-- | media/libcubeb/tests/moz.build | 2 | ||||
-rw-r--r-- | media/libpng/moz.build | 3 | ||||
-rw-r--r-- | media/libstagefright/moz.build | 2 | ||||
-rw-r--r-- | media/libwebp/dsp/alpha_processing_neon.c | 191 | ||||
-rw-r--r-- | media/libwebp/dsp/filters_neon.c | 329 | ||||
-rw-r--r-- | media/libwebp/dsp/moz.build | 6 | ||||
-rw-r--r-- | media/libwebp/dsp/yuv_neon.c | 288 | ||||
-rw-r--r-- | media/libwebp/update.sh | 3 | ||||
-rw-r--r-- | media/mtransport/third_party/nICEr/src/ice/ice_component.c | 7 |
13 files changed, 1346 insertions, 5 deletions
diff --git a/media/libcubeb/src/cubeb.c b/media/libcubeb/src/cubeb.c index eb22a9b94..a239319a4 100644 --- a/media/libcubeb/src/cubeb.c +++ b/media/libcubeb/src/cubeb.c @@ -54,6 +54,9 @@ int audiotrack_init(cubeb ** context, char const * context_name); #if defined(USE_KAI) int kai_init(cubeb ** context, char const * context_name); #endif +#if defined(USE_SUN) +int sunaudio_init(cubeb ** context, char const * context_name); +#endif static int @@ -141,6 +144,9 @@ cubeb_init(cubeb ** context, char const * context_name) #if defined(USE_KAI) kai_init, #endif +#if defined(USE_SUN) + sunaudio_init, +#endif }; int i; diff --git a/media/libcubeb/src/cubeb_sndio.c b/media/libcubeb/src/cubeb_sndio.c index 793789765..c7ac18446 100644 --- a/media/libcubeb/src/cubeb_sndio.c +++ b/media/libcubeb/src/cubeb_sndio.c @@ -245,7 +245,7 @@ sndio_stream_init(cubeb * context, s->data_cb = data_callback; s->state_cb = state_callback; s->arg = user_ptr; - s->mtx = PTHREAD_MUTEX_INITIALIZER; + s->mtx = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; s->rdpos = s->wrpos = 0; if (output_stream_params->format == CUBEB_SAMPLE_FLOAT32LE) { s->conv = 1; diff --git a/media/libcubeb/src/cubeb_sun.c b/media/libcubeb/src/cubeb_sun.c new file mode 100644 index 000000000..b768bca56 --- /dev/null +++ b/media/libcubeb/src/cubeb_sun.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2013, 2017 Ginn Chen <ginnchen@gmail.com> + * + * This program is made available under an ISC-style license. See the + * accompanying file LICENSE for details. + */ +#include <poll.h> +#include <pthread.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/audio.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sys/stropts.h> +#include "cubeb/cubeb.h" +#include "cubeb-internal.h" + +/* Macros copied from audio_oss.h */ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 4Front Technologies 1996-2008. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +#define OSSIOCPARM_MASK 0x1fff /* parameters must be < 8192 bytes */ +#define OSSIOC_VOID 0x00000000 /* no parameters */ +#define OSSIOC_OUT 0x20000000 /* copy out parameters */ +#define OSSIOC_IN 0x40000000 /* copy in parameters */ +#define OSSIOC_INOUT (OSSIOC_IN|OSSIOC_OUT) +#define OSSIOC_SZ(t) ((sizeof (t) & OSSIOCPARM_MASK) << 16) +#define __OSSIO(x, y) ((int)(OSSIOC_VOID|(x<<8)|y)) +#define __OSSIOR(x, y, t) ((int)(OSSIOC_OUT|OSSIOC_SZ(t)|(x<<8)|y)) +#define __OSSIOWR(x, y, t) ((int)(OSSIOC_INOUT|OSSIOC_SZ(t)|(x<<8)|y)) +#define SNDCTL_DSP_SPEED __OSSIOWR('P', 2, int) +#define SNDCTL_DSP_CHANNELS __OSSIOWR('P', 6, int) +#define SNDCTL_DSP_SETFMT __OSSIOWR('P', 5, int) /* Selects ONE fmt */ +#define SNDCTL_DSP_GETODELAY __OSSIOR('P', 23, int) +#define SNDCTL_DSP_HALT_OUTPUT __OSSIO('P', 34) +#define AFMT_S16_LE 0x00000010 +#define AFMT_S16_BE 0x00000020 + +#if defined(WORDS_BIGENDIAN) || defined(__BIG_ENDIAN__) +#define AFMT_S16_NE AFMT_S16_BE +#else +#define AFMT_S16_NE AFMT_S16_LE +#endif + +#define DEFAULT_AUDIO_DEVICE "/dev/audio" +#define DEFAULT_DSP_DEVICE "/dev/dsp" + +#define BUF_SIZE_MS 10 + +#if defined(CUBEB_SUNAUDIO_DEBUG) +#define DPR(...) fprintf(stderr, __VA_ARGS__); +#else +#define DPR(...) do {} while(0) +#endif + +static struct cubeb_ops const sunaudio_ops; + +struct cubeb { + struct cubeb_ops const * ops; +}; + +struct cubeb_stream { + cubeb * context; + pthread_t th; /* to run real-time audio i/o */ + pthread_mutex_t mutex; /* protects fd and frm_played */ + int fd; /* link us to sunaudio */ + int active; /* cubec_start() called */ + int conv; /* need float->s16 conversion */ + int using_oss; + unsigned char *buf; /* data is prepared here */ + unsigned int rate; + unsigned int n_channles; + unsigned int bytes_per_ch; + unsigned int n_frm; + unsigned int buffer_size; + int64_t frm_played; + cubeb_data_callback data_cb; /* cb to preapare data */ + cubeb_state_callback state_cb; /* cb to notify about state changes */ + void *arg; /* user arg to {data,state}_cb */ +}; + +static void +float_to_s16(void *ptr, long nsamp) +{ + int16_t *dst = ptr; + float *src = ptr; + + while (nsamp-- > 0) + *(dst++) = *(src++) * 32767; +} + +static void * +sunaudio_mainloop(void *arg) +{ + struct cubeb_stream *s = arg; + int state; + + DPR("sunaudio_mainloop()\n"); + + s->state_cb(s, s->arg, CUBEB_STATE_STARTED); + + pthread_mutex_lock(&s->mutex); + DPR("sunaudio_mainloop(), started\n"); + + for (;;) { + if (!s->active) { + DPR("sunaudio_mainloop() stopped\n"); + state = CUBEB_STATE_STOPPED; + break; + } + + if (!s->using_oss) { + audio_info_t info; + ioctl(s->fd, AUDIO_GETINFO, &info); + if (s->frm_played > info.play.samples + 3 * s->n_frm) { + pthread_mutex_unlock(&s->mutex); + struct timespec ts = {0, 10000}; // 10 ms + nanosleep(&ts, NULL); + pthread_mutex_lock(&s->mutex); + continue; + } + } + + pthread_mutex_unlock(&s->mutex); + unsigned int got = s->data_cb(s, s->arg, NULL, s->buf, s->n_frm); + DPR("sunaudio_mainloop() ask %d got %d\n", s->n_frm, got); + pthread_mutex_lock(&s->mutex); + + if (got < 0) { + DPR("sunaudio_mainloop() cb err\n"); + state = CUBEB_STATE_ERROR; + break; + } + + if (s->conv) { + float_to_s16(s->buf, got * s->n_channles); + } + + unsigned int avail = got * 2 * s->n_channles; // coverted to s16 + unsigned int pos = 0; + + while (avail > 0 && s->active) { + int written = write(s->fd, s->buf + pos, avail); + if (written == -1) { + if (errno != EINTR && errno != EWOULDBLOCK) { + DPR("sunaudio_mainloop() write err\n"); + state = CUBEB_STATE_ERROR; + break; + } + pthread_mutex_unlock(&s->mutex); + struct timespec ts = {0, 10000}; // 10 ms + nanosleep(&ts, NULL); + pthread_mutex_lock(&s->mutex); + } else { + pos += written; + DPR("sunaudio_mainloop() write %d pos %d\n", written, pos); + s->frm_played += written / 2 / s->n_channles; + avail -= written; + } + } + + if ((got < s->n_frm)) { + DPR("sunaudio_mainloop() drained\n"); + state = CUBEB_STATE_DRAINED; + break; + } + } + + pthread_mutex_unlock(&s->mutex); + s->state_cb(s, s->arg, state); + + return NULL; +} + +/*static*/ int +sunaudio_init(cubeb **context, char const *context_name) +{ + DPR("sunaudio_init(%s)\n", context_name); + *context = malloc(sizeof(*context)); + (*context)->ops = &sunaudio_ops; + (void)context_name; + return CUBEB_OK; +} + +static char const * +sunaudio_get_backend_id(cubeb *context) +{ + return "sunaudio"; +} + +static void +sunaudio_destroy(cubeb *context) +{ + DPR("sunaudio_destroy()\n"); + free(context); +} + +static int +sunaudio_stream_init(cubeb *context, + cubeb_stream **stream, + char const *stream_name, + cubeb_devid input_device, + cubeb_stream_params * input_stream_params, + cubeb_devid output_device, + cubeb_stream_params * output_stream_params, + unsigned int latency, + cubeb_data_callback data_callback, + cubeb_state_callback state_callback, + void *user_ptr) +{ + struct cubeb_stream *s; + DPR("sunaudio_stream_init(%s)\n", stream_name); + size_t size; + + s = malloc(sizeof(struct cubeb_stream)); + if (s == NULL) + return CUBEB_ERROR; + s->context = context; + + // If UTAUDIODEV is set, use it with Sun Audio interface + char * sa_device_name = getenv("UTAUDIODEV"); + char * dsp_device_name = NULL; + if (!sa_device_name) { + dsp_device_name = getenv("AUDIODSP"); + if (!dsp_device_name) { + dsp_device_name = DEFAULT_DSP_DEVICE; + } + sa_device_name = getenv("AUDIODEV"); + if (!sa_device_name) { + sa_device_name = DEFAULT_AUDIO_DEVICE; + } + } + + s->using_oss = 0; + // Try to use OSS if available + if (dsp_device_name) { + s->fd = open(dsp_device_name, O_WRONLY | O_NONBLOCK); + if (s->fd >= 0) { + s->using_oss = 1; + } + } + + // Try Sun Audio + if (!s->using_oss) { + s->fd = open(sa_device_name, O_WRONLY | O_NONBLOCK); + } + + if (s->fd < 0) { + free(s); + DPR("sunaudio_stream_init(), open() failed\n"); + return CUBEB_ERROR; + } + + if (s->using_oss) { + if (ioctl(s->fd, SNDCTL_DSP_SPEED, &output_stream_params->rate) < 0) { + DPR("ioctl SNDCTL_DSP_SPEED failed.\n"); + close(s->fd); + free(s); + return CUBEB_ERROR_INVALID_FORMAT; + } + + if (ioctl(s->fd, SNDCTL_DSP_CHANNELS, &output_stream_params->channels) < 0) { + DPR("ioctl SNDCTL_DSP_CHANNELS failed.\n"); + close(s->fd); + free(s); + return CUBEB_ERROR_INVALID_FORMAT; + } + + int format = AFMT_S16_NE; + if (ioctl(s->fd, SNDCTL_DSP_SETFMT, &format) < 0) { + DPR("ioctl SNDCTL_DSP_SETFMT failed.\n"); + close(s->fd); + free(s); + return CUBEB_ERROR_INVALID_FORMAT; + } + } else { + audio_info_t audio_info; + AUDIO_INITINFO(&audio_info) + audio_info.play.sample_rate = output_stream_params->rate; + audio_info.play.channels = output_stream_params->channels; + audio_info.play.encoding = AUDIO_ENCODING_LINEAR; + audio_info.play.precision = 16; + if (ioctl(s->fd, AUDIO_SETINFO, &audio_info) == -1) { + DPR("ioctl AUDIO_SETINFO failed.\n"); + close(s->fd); + free(s); + return CUBEB_ERROR_INVALID_FORMAT; + } + } + + s->conv = 0; + switch (output_stream_params->format) { + case CUBEB_SAMPLE_S16NE: + s->bytes_per_ch = 2; + break; + case CUBEB_SAMPLE_FLOAT32NE: + s->bytes_per_ch = 4; + s->conv = 1; + break; + default: + DPR("sunaudio_stream_init() unsupported format\n"); + close(s->fd); + free(s); + return CUBEB_ERROR_INVALID_FORMAT; + } + + s->active = 0; + s->rate = output_stream_params->rate; + s->n_channles = output_stream_params->channels; + s->data_cb = data_callback; + s->state_cb = state_callback; + s->arg = user_ptr; + if (pthread_mutex_init(&s->mutex, NULL) != 0) { + free(s); + return CUBEB_ERROR; + } + s->frm_played = 0; + s->n_frm = s->rate * BUF_SIZE_MS / 1000; + s->buffer_size = s->bytes_per_ch * s->n_channles * s->n_frm; + s->buf = malloc(s->buffer_size); + if (s->buf == NULL) { + close(s->fd); + free(s); + return CUBEB_ERROR; + } + + *stream = s; + DPR("sunaudio_stream_init() end, ok\n"); + return CUBEB_OK; +} + +static void +sunaudio_stream_destroy(cubeb_stream *s) +{ + DPR("sunaudio_stream_destroy()\n"); + if (s->fd > 0) { + // Flush buffer + if (s->using_oss) { + ioctl(s->fd, SNDCTL_DSP_HALT_OUTPUT); + } else { + ioctl(s->fd, I_FLUSH); + } + close(s->fd); + } + free(s->buf); + free(s); +} + +static int +sunaudio_stream_start(cubeb_stream *s) +{ + int err; + + DPR("sunaudio_stream_start()\n"); + s->active = 1; + err = pthread_create(&s->th, NULL, sunaudio_mainloop, s); + if (err) { + s->active = 0; + return CUBEB_ERROR; + } + return CUBEB_OK; +} + +static int +sunaudio_stream_stop(cubeb_stream *s) +{ + void *dummy; + + DPR("sunaudio_stream_stop()\n"); + if (s->active) { + s->active = 0; + pthread_join(s->th, &dummy); + } + return CUBEB_OK; +} + +static int +sunaudio_stream_get_position(cubeb_stream *s, uint64_t *p) +{ + int rv = CUBEB_OK; + pthread_mutex_lock(&s->mutex); + if (s->active && s->fd > 0) { + if (s->using_oss) { + int delay; + ioctl(s->fd, SNDCTL_DSP_GETODELAY, &delay); + int64_t t = s->frm_played - delay / s->n_channles / 2; + if (t < 0) { + *p = 0; + } else { + *p = t; + } + } else { + audio_info_t info; + ioctl(s->fd, AUDIO_GETINFO, &info); + *p = info.play.samples; + } + DPR("sunaudio_stream_get_position() %lld\n", *p); + } else { + rv = CUBEB_ERROR; + } + pthread_mutex_unlock(&s->mutex); + return rv; +} + +static int +sunaudio_get_max_channel_count(cubeb * ctx, uint32_t * max_channels) +{ + if (!ctx || !max_channels) + return CUBEB_ERROR; + + *max_channels = 2; + + return CUBEB_OK; +} + +static int +sunaudio_get_preferred_sample_rate(cubeb * ctx, uint32_t * rate) +{ + if (!ctx || !rate) + return CUBEB_ERROR; + + // XXX Not yet implemented. + *rate = 44100; + + return CUBEB_OK; +} + +static int +sunaudio_get_min_latency(cubeb * ctx, cubeb_stream_params params, uint32_t * latency_ms) +{ + if (!ctx || !latency_ms) + return CUBEB_ERROR; + + // XXX Not yet implemented. + *latency_ms = 20; + + return CUBEB_OK; +} + +static int +sunaudio_stream_get_latency(cubeb_stream * s, uint32_t * latency) +{ + if (!s || !latency) + return CUBEB_ERROR; + + int rv = CUBEB_OK; + pthread_mutex_lock(&s->mutex); + if (s->active && s->fd > 0) { + if (s->using_oss) { + int delay; + ioctl(s->fd, SNDCTL_DSP_GETODELAY, &delay); + *latency = delay / s->n_channles / 2 / s->rate; + } else { + audio_info_t info; + ioctl(s->fd, AUDIO_GETINFO, &info); + *latency = (s->frm_played - info.play.samples) / s->rate; + } + DPR("sunaudio_stream_get_position() %lld\n", *p); + } else { + rv = CUBEB_ERROR; + } + pthread_mutex_unlock(&s->mutex); + return rv; +} + +static struct cubeb_ops const sunaudio_ops = { + .init = sunaudio_init, + .get_backend_id = sunaudio_get_backend_id, + .destroy = sunaudio_destroy, + .get_preferred_sample_rate = sunaudio_get_preferred_sample_rate, + .stream_init = sunaudio_stream_init, + .stream_destroy = sunaudio_stream_destroy, + .stream_start = sunaudio_stream_start, + .stream_stop = sunaudio_stream_stop, + .stream_get_position = sunaudio_stream_get_position, + .get_max_channel_count = sunaudio_get_max_channel_count, + .get_min_latency = sunaudio_get_min_latency, + .stream_get_latency = sunaudio_stream_get_latency +}; diff --git a/media/libcubeb/src/moz.build b/media/libcubeb/src/moz.build index 2ca3a2f54..772aa6d39 100644 --- a/media/libcubeb/src/moz.build +++ b/media/libcubeb/src/moz.build @@ -39,12 +39,18 @@ if CONFIG['MOZ_JACK']: ] DEFINES['USE_JACK'] = True -if CONFIG['OS_ARCH'] == 'OpenBSD': +if CONFIG['MOZ_SNDIO']: SOURCES += [ 'cubeb_sndio.c', ] DEFINES['USE_SNDIO'] = True +if CONFIG['OS_ARCH'] == 'SunOS': + SOURCES += [ + 'cubeb_sun.c', + ] + DEFINES['USE_SUN'] = True + if CONFIG['OS_TARGET'] == 'Darwin': SOURCES += [ 'cubeb_audiounit.cpp', diff --git a/media/libcubeb/tests/moz.build b/media/libcubeb/tests/moz.build index 1b17c7b1c..ca63a4d8f 100644 --- a/media/libcubeb/tests/moz.build +++ b/media/libcubeb/tests/moz.build @@ -68,7 +68,7 @@ elif CONFIG['MOZ_WIDGET_TOOLKIT'] == 'uikit': '-framework CoreFoundation', '-framework AudioToolbox', ] -elif CONFIG['OS_TARGET'] == 'OpenBSD': +elif CONFIG['MOZ_SNDIO']: OS_LIBS += [ 'sndio', ] diff --git a/media/libpng/moz.build b/media/libpng/moz.build index 9146a8d5a..f2538484b 100644 --- a/media/libpng/moz.build +++ b/media/libpng/moz.build @@ -51,3 +51,6 @@ FINAL_LIBRARY = 'gkmedias' # We allow warnings for third-party code that can be updated from upstream. ALLOW_COMPILER_WARNINGS = True + +if CONFIG['GNU_CC']: + CFLAGS += ['-std=c89'] diff --git a/media/libstagefright/moz.build b/media/libstagefright/moz.build index 5a8c9521a..87e162112 100644 --- a/media/libstagefright/moz.build +++ b/media/libstagefright/moz.build @@ -7,7 +7,7 @@ DEFINES['ANDROID_SMP'] = 0 DEFINES['LOG_NDEBUG'] = 1 -if CONFIG['OS_TARGET'] != 'WINNT': +if CONFIG['OS_TARGET'] != 'WINNT' and CONFIG['OS_TARGET'] != 'SunOS': DEFINES['_GLIBCXX_OS_DEFINES'] = True if CONFIG['OS_TARGET'] == 'WINNT': diff --git a/media/libwebp/dsp/alpha_processing_neon.c b/media/libwebp/dsp/alpha_processing_neon.c new file mode 100644 index 000000000..53dfce2b3 --- /dev/null +++ b/media/libwebp/dsp/alpha_processing_neon.c @@ -0,0 +1,191 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Utilities for processing transparent channel, NEON version. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "../dsp/dsp.h" + +#if defined(WEBP_USE_NEON) + +#include "../dsp/neon.h" + +//------------------------------------------------------------------------------ + +#define MULTIPLIER(a) ((a) * 0x8081) +#define PREMULTIPLY(x, m) (((x) * (m)) >> 23) + +#define MULTIPLY_BY_ALPHA(V, ALPHA, OTHER) do { \ + const uint8x8_t alpha = (V).val[(ALPHA)]; \ + const uint16x8_t r1 = vmull_u8((V).val[1], alpha); \ + const uint16x8_t g1 = vmull_u8((V).val[2], alpha); \ + const uint16x8_t b1 = vmull_u8((V).val[(OTHER)], alpha); \ + /* we use: v / 255 = (v + 1 + (v >> 8)) >> 8 */ \ + const uint16x8_t r2 = vsraq_n_u16(r1, r1, 8); \ + const uint16x8_t g2 = vsraq_n_u16(g1, g1, 8); \ + const uint16x8_t b2 = vsraq_n_u16(b1, b1, 8); \ + const uint16x8_t r3 = vaddq_u16(r2, kOne); \ + const uint16x8_t g3 = vaddq_u16(g2, kOne); \ + const uint16x8_t b3 = vaddq_u16(b2, kOne); \ + (V).val[1] = vshrn_n_u16(r3, 8); \ + (V).val[2] = vshrn_n_u16(g3, 8); \ + (V).val[(OTHER)] = vshrn_n_u16(b3, 8); \ +} while (0) + +static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first, + int w, int h, int stride) { + const uint16x8_t kOne = vdupq_n_u16(1u); + while (h-- > 0) { + uint32_t* const rgbx = (uint32_t*)rgba; + int i = 0; + if (alpha_first) { + for (; i + 8 <= w; i += 8) { + // load aaaa...|rrrr...|gggg...|bbbb... + uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i)); + MULTIPLY_BY_ALPHA(RGBX, 0, 3); + vst4_u8((uint8_t*)(rgbx + i), RGBX); + } + } else { + for (; i + 8 <= w; i += 8) { + uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i)); + MULTIPLY_BY_ALPHA(RGBX, 3, 0); + vst4_u8((uint8_t*)(rgbx + i), RGBX); + } + } + // Finish with left-overs. + for (; i < w; ++i) { + uint8_t* const rgb = rgba + (alpha_first ? 1 : 0); + const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3); + const uint32_t a = alpha[4 * i]; + if (a != 0xff) { + const uint32_t mult = MULTIPLIER(a); + rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult); + rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult); + rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult); + } + } + rgba += stride; + } +} +#undef MULTIPLY_BY_ALPHA +#undef MULTIPLIER +#undef PREMULTIPLY + +//------------------------------------------------------------------------------ + +static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint8_t* dst, int dst_stride) { + uint32_t alpha_mask = 0xffffffffu; + uint8x8_t mask8 = vdup_n_u8(0xff); + uint32_t tmp[2]; + int i, j; + for (j = 0; j < height; ++j) { + // We don't know if alpha is first or last in dst[] (depending on rgbA/Argb + // mode). So we must be sure dst[4*i + 8 - 1] is writable for the store. + // Hence the test with 'width - 1' instead of just 'width'. + for (i = 0; i + 8 <= width - 1; i += 8) { + uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(dst + 4 * i)); + const uint8x8_t alphas = vld1_u8(alpha + i); + rgbX.val[0] = alphas; + vst4_u8((uint8_t*)(dst + 4 * i), rgbX); + mask8 = vand_u8(mask8, alphas); + } + for (; i < width; ++i) { + const uint32_t alpha_value = alpha[i]; + dst[4 * i] = alpha_value; + alpha_mask &= alpha_value; + } + alpha += alpha_stride; + dst += dst_stride; + } + vst1_u8((uint8_t*)tmp, mask8); + alpha_mask &= tmp[0]; + alpha_mask &= tmp[1]; + return (alpha_mask != 0xffffffffu); +} + +static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint32_t* dst, int dst_stride) { + int i, j; + uint8x8x4_t greens; // leave A/R/B channels zero'd. + greens.val[0] = vdup_n_u8(0); + greens.val[2] = vdup_n_u8(0); + greens.val[3] = vdup_n_u8(0); + for (j = 0; j < height; ++j) { + for (i = 0; i + 8 <= width; i += 8) { + greens.val[1] = vld1_u8(alpha + i); + vst4_u8((uint8_t*)(dst + i), greens); + } + for (; i < width; ++i) dst[i] = alpha[i] << 8; + alpha += alpha_stride; + dst += dst_stride; + } +} + +static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride) { + uint32_t alpha_mask = 0xffffffffu; + uint8x8_t mask8 = vdup_n_u8(0xff); + uint32_t tmp[2]; + int i, j; + for (j = 0; j < height; ++j) { + // We don't know if alpha is first or last in dst[] (depending on rgbA/Argb + // mode). So we must be sure dst[4*i + 8 - 1] is writable for the store. + // Hence the test with 'width - 1' instead of just 'width'. + for (i = 0; i + 8 <= width - 1; i += 8) { + const uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(argb + 4 * i)); + const uint8x8_t alphas = rgbX.val[0]; + vst1_u8((uint8_t*)(alpha + i), alphas); + mask8 = vand_u8(mask8, alphas); + } + for (; i < width; ++i) { + alpha[i] = argb[4 * i]; + alpha_mask &= alpha[i]; + } + argb += argb_stride; + alpha += alpha_stride; + } + vst1_u8((uint8_t*)tmp, mask8); + alpha_mask &= tmp[0]; + alpha_mask &= tmp[1]; + return (alpha_mask == 0xffffffffu); +} + +static void ExtractGreen_NEON(const uint32_t* argb, + uint8_t* alpha, int size) { + int i; + for (i = 0; i + 16 <= size; i += 16) { + const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i)); + const uint8x16_t greens = rgbX.val[1]; + vst1q_u8(alpha + i, greens); + } + for (; i < size; ++i) alpha[i] = (argb[i] >> 8) & 0xff; +} + +//------------------------------------------------------------------------------ + +extern void WebPInitAlphaProcessingNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingNEON(void) { + WebPApplyAlphaMultiply = ApplyAlphaMultiply_NEON; + WebPDispatchAlpha = DispatchAlpha_NEON; + WebPDispatchAlphaToGreen = DispatchAlphaToGreen_NEON; + WebPExtractAlpha = ExtractAlpha_NEON; + WebPExtractGreen = ExtractGreen_NEON; +} + +#else // !WEBP_USE_NEON + +WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingNEON) + +#endif // WEBP_USE_NEON diff --git a/media/libwebp/dsp/filters_neon.c b/media/libwebp/dsp/filters_neon.c new file mode 100644 index 000000000..4788118c9 --- /dev/null +++ b/media/libwebp/dsp/filters_neon.c @@ -0,0 +1,329 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// NEON variant of alpha filters +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "../dsp/dsp.h" + +#if defined(WEBP_USE_NEON) + +#include <assert.h> +#include "../dsp/neon.h" + +//------------------------------------------------------------------------------ +// Helpful macros. + +# define SANITY_CHECK(in, out) \ + assert(in != NULL); \ + assert(out != NULL); \ + assert(width > 0); \ + assert(height > 0); \ + assert(stride >= width); \ + assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ + (void)height; // Silence unused warning. + +// load eight u8 and widen to s16 +#define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A)) +#define LOAD_U8_TO_S16(A) U8_TO_S16(vld1_u8(A)) + +// shift left or right by N byte, inserting zeros +#define SHIFT_RIGHT_N_Q(A, N) vextq_u8((A), zero, (N)) +#define SHIFT_LEFT_N_Q(A, N) vextq_u8(zero, (A), (16 - (N)) % 16) + +// rotate left by N bytes +#define ROTATE_LEFT_N(A, N) vext_u8((A), (A), (N)) +// rotate right by N bytes +#define ROTATE_RIGHT_N(A, N) vext_u8((A), (A), (8 - (N)) % 8) + +static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred, + uint8_t* dst, int length) { + int i; + assert(length >= 0); + for (i = 0; i + 16 <= length; i += 16) { + const uint8x16_t A = vld1q_u8(&src[i]); + const uint8x16_t B = vld1q_u8(&pred[i]); + const uint8x16_t C = vsubq_u8(A, B); + vst1q_u8(&dst[i], C); + } + for (; i < length; ++i) dst[i] = src[i] - pred[i]; +} + +// Special case for left-based prediction (when preds==dst-1 or preds==src-1). +static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) { + PredictLine_NEON(src, src - 1, dst, length); +} + +//------------------------------------------------------------------------------ +// Horizontal filter. + +static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in, + int width, int height, + int stride, + int row, int num_rows, + uint8_t* out) { + const size_t start_offset = row * stride; + const int last_row = row + num_rows; + SANITY_CHECK(in, out); + in += start_offset; + out += start_offset; + + if (row == 0) { + // Leftmost pixel is the same as input for topmost scanline. + out[0] = in[0]; + PredictLineLeft_NEON(in + 1, out + 1, width - 1); + row = 1; + in += stride; + out += stride; + } + + // Filter line-by-line. + while (row < last_row) { + // Leftmost pixel is predicted from above. + out[0] = in[0] - in[-stride]; + PredictLineLeft_NEON(in + 1, out + 1, width - 1); + ++row; + in += stride; + out += stride; + } +} + +static void HorizontalFilter_NEON(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoHorizontalFilter_NEON(data, width, height, stride, 0, height, + filtered_data); +} + +//------------------------------------------------------------------------------ +// Vertical filter. + +static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + uint8_t* out) { + const size_t start_offset = row * stride; + const int last_row = row + num_rows; + SANITY_CHECK(in, out); + in += start_offset; + out += start_offset; + + if (row == 0) { + // Very first top-left pixel is copied. + out[0] = in[0]; + // Rest of top scan-line is left-predicted. + PredictLineLeft_NEON(in + 1, out + 1, width - 1); + row = 1; + in += stride; + out += stride; + } + + // Filter line-by-line. + while (row < last_row) { + PredictLine_NEON(in, in - stride, out, width); + ++row; + in += stride; + out += stride; + } +} + +static void VerticalFilter_NEON(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoVerticalFilter_NEON(data, width, height, stride, 0, height, + filtered_data); +} + +//------------------------------------------------------------------------------ +// Gradient filter. + +static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) { + const int g = a + b - c; + return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit +} + +static void GradientPredictDirect_NEON(const uint8_t* const row, + const uint8_t* const top, + uint8_t* const out, int length) { + int i; + for (i = 0; i + 8 <= length; i += 8) { + const uint8x8_t A = vld1_u8(&row[i - 1]); + const uint8x8_t B = vld1_u8(&top[i + 0]); + const int16x8_t C = vreinterpretq_s16_u16(vaddl_u8(A, B)); + const int16x8_t D = LOAD_U8_TO_S16(&top[i - 1]); + const uint8x8_t E = vqmovun_s16(vsubq_s16(C, D)); + const uint8x8_t F = vld1_u8(&row[i + 0]); + vst1_u8(&out[i], vsub_u8(F, E)); + } + for (; i < length; ++i) { + out[i] = row[i] - GradientPredictor_C(row[i - 1], top[i], top[i - 1]); + } +} + +static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in, + int width, int height, + int stride, + int row, int num_rows, + uint8_t* out) { + const size_t start_offset = row * stride; + const int last_row = row + num_rows; + SANITY_CHECK(in, out); + in += start_offset; + out += start_offset; + + // left prediction for top scan-line + if (row == 0) { + out[0] = in[0]; + PredictLineLeft_NEON(in + 1, out + 1, width - 1); + row = 1; + in += stride; + out += stride; + } + + // Filter line-by-line. + while (row < last_row) { + out[0] = in[0] - in[-stride]; + GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1); + ++row; + in += stride; + out += stride; + } +} + +static void GradientFilter_NEON(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoGradientFilter_NEON(data, width, height, stride, 0, height, + filtered_data); +} + +#undef SANITY_CHECK + +//------------------------------------------------------------------------------ +// Inverse transforms + +static void HorizontalUnfilter_NEON(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + int i; + const uint8x16_t zero = vdupq_n_u8(0); + uint8x16_t last; + out[0] = in[0] + (prev == NULL ? 0 : prev[0]); + if (width <= 1) return; + last = vsetq_lane_u8(out[0], zero, 0); + for (i = 1; i + 16 <= width; i += 16) { + const uint8x16_t A0 = vld1q_u8(&in[i]); + const uint8x16_t A1 = vaddq_u8(A0, last); + const uint8x16_t A2 = SHIFT_LEFT_N_Q(A1, 1); + const uint8x16_t A3 = vaddq_u8(A1, A2); + const uint8x16_t A4 = SHIFT_LEFT_N_Q(A3, 2); + const uint8x16_t A5 = vaddq_u8(A3, A4); + const uint8x16_t A6 = SHIFT_LEFT_N_Q(A5, 4); + const uint8x16_t A7 = vaddq_u8(A5, A6); + const uint8x16_t A8 = SHIFT_LEFT_N_Q(A7, 8); + const uint8x16_t A9 = vaddq_u8(A7, A8); + vst1q_u8(&out[i], A9); + last = SHIFT_RIGHT_N_Q(A9, 15); + } + for (; i < width; ++i) out[i] = in[i] + out[i - 1]; +} + +static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter_NEON(NULL, in, out, width); + } else { + int i; + assert(width >= 0); + for (i = 0; i + 16 <= width; i += 16) { + const uint8x16_t A = vld1q_u8(&in[i]); + const uint8x16_t B = vld1q_u8(&prev[i]); + const uint8x16_t C = vaddq_u8(A, B); + vst1q_u8(&out[i], C); + } + for (; i < width; ++i) out[i] = in[i] + prev[i]; + } +} + +// GradientUnfilter_NEON is correct but slower than the C-version, +// at least on ARM64. For armv7, it's a wash. +// So best is to disable it for now, but keep the idea around... +#if !defined(USE_GRADIENT_UNFILTER) +#define USE_GRADIENT_UNFILTER 0 // ALTERNATE_CODE +#endif + +#if (USE_GRADIENT_UNFILTER == 1) +#define GRAD_PROCESS_LANE(L) do { \ + const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \ + const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \ + const uint8x8_t delta = vqmovun_s16(tmp2); \ + pred = vadd_u8(D, delta); \ + out = vext_u8(out, ROTATE_LEFT_N(pred, (L)), 1); \ +} while (0) + +static void GradientPredictInverse_NEON(const uint8_t* const in, + const uint8_t* const top, + uint8_t* const row, int length) { + if (length > 0) { + int i; + uint8x8_t pred = vdup_n_u8(row[-1]); // left sample + uint8x8_t out = vdup_n_u8(0); + for (i = 0; i + 8 <= length; i += 8) { + const int16x8_t B = LOAD_U8_TO_S16(&top[i + 0]); + const int16x8_t C = LOAD_U8_TO_S16(&top[i - 1]); + const int16x8_t BC = vsubq_s16(B, C); // unclipped gradient basis B - C + const uint8x8_t D = vld1_u8(&in[i]); // base input + GRAD_PROCESS_LANE(0); + GRAD_PROCESS_LANE(1); + GRAD_PROCESS_LANE(2); + GRAD_PROCESS_LANE(3); + GRAD_PROCESS_LANE(4); + GRAD_PROCESS_LANE(5); + GRAD_PROCESS_LANE(6); + GRAD_PROCESS_LANE(7); + vst1_u8(&row[i], out); + } + for (; i < length; ++i) { + row[i] = in[i] + GradientPredictor_C(row[i - 1], top[i], top[i - 1]); + } + } +} +#undef GRAD_PROCESS_LANE + +static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter_NEON(NULL, in, out, width); + } else { + out[0] = in[0] + prev[0]; // predict from above + GradientPredictInverse_NEON(in + 1, prev + 1, out + 1, width - 1); + } +} + +#endif // USE_GRADIENT_UNFILTER + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8FiltersInitNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) { + WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON; + WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON; +#if (USE_GRADIENT_UNFILTER == 1) + WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON; +#endif + + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_NEON; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_NEON; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_NEON; +} + +#else // !WEBP_USE_NEON + +WEBP_DSP_INIT_STUB(VP8FiltersInitNEON) + +#endif // WEBP_USE_NEON diff --git a/media/libwebp/dsp/moz.build b/media/libwebp/dsp/moz.build index fa6df9e9e..f3c2bdd0b 100644 --- a/media/libwebp/dsp/moz.build +++ b/media/libwebp/dsp/moz.build @@ -9,6 +9,7 @@ with Files('**'): SOURCES += [ 'alpha_processing.c', + 'alpha_processing_neon.c', 'alpha_processing_sse2.c', 'alpha_processing_sse41.c', 'dec.c', @@ -17,6 +18,7 @@ SOURCES += [ 'dec_sse2.c', 'dec_sse41.c', 'filters.c', + 'filters_neon.c', 'filters_sse2.c', 'lossless.c', 'lossless_neon.c', @@ -29,15 +31,19 @@ SOURCES += [ 'upsampling_sse2.c', 'upsampling_sse41.c', 'yuv.c', + 'yuv_neon.c', 'yuv_sse2.c', 'yuv_sse41.c', ] if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']: + SOURCES['alpha_processing_neon.c'].flags += CONFIG['NEON_FLAGS'] SOURCES['dec_neon.c'].flags += CONFIG['NEON_FLAGS'] + SOURCES['filters_neon.c'].flags += CONFIG['NEON_FLAGS'] SOURCES['lossless_neon.c'].flags += CONFIG['NEON_FLAGS'] SOURCES['rescaler_neon.c'].flags += CONFIG['NEON_FLAGS'] SOURCES['upsampling_neon.c'].flags += CONFIG['NEON_FLAGS'] + SOURCES['yuv_neon.c'].flags += CONFIG['NEON_FLAGS'] elif CONFIG['INTEL_ARCHITECTURE']: SOURCES['alpha_processing_sse2.c'].flags += CONFIG['SSE2_FLAGS'] SOURCES['alpha_processing_sse41.c'].flags += CONFIG['SSE2_FLAGS'] diff --git a/media/libwebp/dsp/yuv_neon.c b/media/libwebp/dsp/yuv_neon.c new file mode 100644 index 000000000..81f00fe5a --- /dev/null +++ b/media/libwebp/dsp/yuv_neon.c @@ -0,0 +1,288 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// YUV->RGB conversion functions +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "../dsp/yuv.h" + +#if defined(WEBP_USE_NEON) + +#include <assert.h> +#include <stdlib.h> + +#include "../dsp/neon.h" + +//----------------------------------------------------------------------------- + +static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, + const uint8x8_t G, + const uint8x8_t B) { + const uint16x8_t r = vmovl_u8(R); + const uint16x8_t g = vmovl_u8(G); + const uint16x8_t b = vmovl_u8(B); + const uint16x4_t r_lo = vget_low_u16(r); + const uint16x4_t r_hi = vget_high_u16(r); + const uint16x4_t g_lo = vget_low_u16(g); + const uint16x4_t g_hi = vget_high_u16(g); + const uint16x4_t b_lo = vget_low_u16(b); + const uint16x4_t b_hi = vget_high_u16(b); + const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u); + const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u); + const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u); + const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u); + const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u); + const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u); + const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16), + vrshrn_n_u32(tmp2_hi, 16)); + const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16)); + return vqmovn_u16(Y2); +} + +static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { + const uint8x8x3_t RGB = vld3_u8(rgb); + const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]); + vst1_u8(y + i, Y); + } + for (; i < width; ++i, rgb += 3) { // left-over + y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); + } +} + +static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { + const uint8x8x3_t BGR = vld3_u8(bgr); + const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]); + vst1_u8(y + i, Y); + } + for (; i < width; ++i, bgr += 3) { // left-over + y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); + } +} + +static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8) { + const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]); + const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]); + vst1_u8(y + i, Y); + } + for (; i < width; ++i) { // left-over + const uint32_t p = argb[i]; + y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, + YUV_HALF); + } +} + +//----------------------------------------------------------------------------- + +// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST +#define MULTIPLY_16b_PREAMBLE(r, g, b) \ + const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r)); \ + const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \ + const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g)); \ + const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \ + const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b)); \ + const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b)) + +#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do { \ + const int32x4_t tmp0_lo = vmull_n_s16( r_lo, C0); \ + const int32x4_t tmp0_hi = vmull_n_s16( r_hi, C0); \ + const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1); \ + const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1); \ + const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2); \ + const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2); \ + const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16), \ + vshrn_n_s32(tmp2_hi, 16)); \ + DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST)); \ +} while (0) + +// This needs to be a macro, since (128 << SHIFT) needs to be an immediate. +#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do { \ + MULTIPLY_16b_PREAMBLE(r, g, b); \ + MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST); \ + MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \ +} while (0) + +static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb, + uint8_t* u, uint8_t* v, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) { + const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb); + int16x8_t U, V; + CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V); + vst1_u8(u + i, vqrshrun_n_s16(U, 2)); + vst1_u8(v + i, vqrshrun_n_s16(V, 2)); + } + for (; i < width; i += 1, rgb += 4) { + const int r = rgb[0], g = rgb[1], b = rgb[2]; + u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2); + v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2); + } +} + +static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v, + int src_width, int do_store) { + int i; + for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) { + const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]); + const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds + const uint16x8_t G = vpaddlq_u8(RGB.val[1]); + const uint16x8_t B = vpaddlq_u8(RGB.val[0]); + int16x8_t U_tmp, V_tmp; + CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp); + { + const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1); + const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1); + if (do_store) { + vst1_u8(u, U); + vst1_u8(v, V); + } else { + const uint8x8_t prev_u = vld1_u8(u); + const uint8x8_t prev_v = vld1_u8(v); + vst1_u8(u, vrhadd_u8(U, prev_u)); + vst1_u8(v, vrhadd_u8(V, prev_v)); + } + } + } + if (i < src_width) { // left-over + WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store); + } +} + + +//------------------------------------------------------------------------------ + +extern void WebPInitConvertARGBToYUVNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) { + WebPConvertRGB24ToY = ConvertRGB24ToY_NEON; + WebPConvertBGR24ToY = ConvertBGR24ToY_NEON; + WebPConvertARGBToY = ConvertARGBToY_NEON; + WebPConvertARGBToUV = ConvertARGBToUV_NEON; + WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON; +} + +//------------------------------------------------------------------------------ + +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y_NEON(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + int i; + const int16x8_t zero = vdupq_n_s16(0); + const int16x8_t max = vdupq_n_s16(MAX_Y); + uint64x2_t sum = vdupq_n_u64(0); + uint64_t diff; + + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); + const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); + const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); + const int16x8_t D = vsubq_s16(A, B); // diff_y + const int16x8_t F = vaddq_s16(C, D); // new_y + const uint16x8_t H = + vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); + const int16x8_t I = vabsq_s16(D); // abs(diff_y) + vst1q_u16(dst + i, H); + sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); + } + diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); + for (; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)(dst[i]) + diff_y; + dst[i] = clip_y_NEON(new_y); + diff += (uint64_t)(abs(diff_y)); + } + return diff; +} + +static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i; + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t A = vld1q_s16(ref + i); + const int16x8_t B = vld1q_s16(src + i); + const int16x8_t C = vld1q_s16(dst + i); + const int16x8_t D = vsubq_s16(A, B); // diff_uv + const int16x8_t E = vaddq_s16(C, D); // new_uv + vst1q_s16(dst + i, E); + } + for (; i < len; ++i) { + const int diff_uv = ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + const int16x8_t max = vdupq_n_s16(MAX_Y); + const int16x8_t zero = vdupq_n_s16(0); + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t a0 = vld1q_s16(A + i + 0); + const int16x8_t a1 = vld1q_s16(A + i + 1); + const int16x8_t b0 = vld1q_s16(B + i + 0); + const int16x8_t b1 = vld1q_s16(B + i + 1); + const int16x8_t a0b1 = vaddq_s16(a0, b1); + const int16x8_t a1b0 = vaddq_s16(a1, b0); + const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 + const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) + const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) + const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); + const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); + const int16x8_t d0 = vaddq_s16(c1, a0); + const int16x8_t d1 = vaddq_s16(c0, a1); + const int16x8_t e0 = vrshrq_n_s16(d0, 1); + const int16x8_t e1 = vrshrq_n_s16(d1, 1); + const int16x8x2_t f = vzipq_s16(e0, e1); + const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); + const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); + const int16x8_t h0 = vaddq_s16(g0, f.val[0]); + const int16x8_t h1 = vaddq_s16(g1, f.val[1]); + const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); + const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); + vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); + vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); + } + for (; i < len; ++i) { + const int a0b1 = A[i + 0] + B[i + 1]; + const int a1b0 = A[i + 1] + B[i + 0]; + const int a0a1b0b1 = a0b1 + a1b0 + 8; + const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; + const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; + out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1); + } +} +#undef MAX_Y + +//------------------------------------------------------------------------------ + +extern void WebPInitSharpYUVNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) { + WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON; + WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON; + WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON; +} + +#else // !WEBP_USE_NEON + +WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON) +WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON) + +#endif // WEBP_USE_NEON diff --git a/media/libwebp/update.sh b/media/libwebp/update.sh index 652993004..4fff43d69 100644 --- a/media/libwebp/update.sh +++ b/media/libwebp/update.sh @@ -38,6 +38,7 @@ cp $1/src/demux/demux.c demux mkdir -p dsp cp $1/src/dsp/*.h dsp cp $1/src/dsp/alpha_processing.c dsp +cp $1/src/dsp/alpha_processing_neon.c dsp cp $1/src/dsp/alpha_processing_sse2.c dsp cp $1/src/dsp/alpha_processing_sse41.c dsp cp $1/src/dsp/dec.c dsp @@ -46,6 +47,7 @@ cp $1/src/dsp/dec_neon.c dsp cp $1/src/dsp/dec_sse2.c dsp cp $1/src/dsp/dec_sse41.c dsp cp $1/src/dsp/filters.c dsp +cp $1/src/dsp/filters_neon.c dsp cp $1/src/dsp/filters_sse2.c dsp cp $1/src/dsp/lossless.c dsp cp $1/src/dsp/lossless_neon.c dsp @@ -58,6 +60,7 @@ cp $1/src/dsp/upsampling_neon.c dsp cp $1/src/dsp/upsampling_sse2.c dsp cp $1/src/dsp/upsampling_sse41.c dsp cp $1/src/dsp/yuv.c dsp +cp $1/src/dsp/yuv_neon.c dsp cp $1/src/dsp/yuv_sse2.c dsp cp $1/src/dsp/yuv_sse41.c dsp diff --git a/media/mtransport/third_party/nICEr/src/ice/ice_component.c b/media/mtransport/third_party/nICEr/src/ice/ice_component.c index 2be25efca..11b4fcbc1 100644 --- a/media/mtransport/third_party/nICEr/src/ice/ice_component.c +++ b/media/mtransport/third_party/nICEr/src/ice/ice_component.c @@ -909,7 +909,6 @@ static int nr_ice_component_process_incoming_check(nr_ice_component *comp, nr_tr nr_ice_candidate_pair_set_state(pair->pctx,pair,NR_ICE_PAIR_STATE_FROZEN); if(r=nr_ice_component_insert_pair(comp,pair)) { *error=(r==R_NO_MEMORY)?500:400; - nr_ice_candidate_pair_destroy(&pair); ABORT(r); } @@ -1615,6 +1614,7 @@ int nr_ice_component_finalize(nr_ice_component *lcomp, nr_ice_component *rcomp) int nr_ice_component_insert_pair(nr_ice_component *pcomp, nr_ice_cand_pair *pair) { int r,_status; + int pair_inserted=0; /* Pairs for peer reflexive are marked SUCCEEDED immediately */ if (pair->state != NR_ICE_PAIR_STATE_FROZEN && @@ -1626,6 +1626,8 @@ int nr_ice_component_insert_pair(nr_ice_component *pcomp, nr_ice_cand_pair *pair if(r=nr_ice_candidate_pair_insert(&pair->remote->stream->check_list,pair)) ABORT(r); + pair_inserted=1; + /* Make sure the check timer is running, if the stream was previously * started. We will not start streams just because a pair was created, * unless it is the first pair to be created across all streams. */ @@ -1642,6 +1644,9 @@ int nr_ice_component_insert_pair(nr_ice_component *pcomp, nr_ice_cand_pair *pair _status=0; abort: + if (_status && !pair_inserted) { + nr_ice_candidate_pair_destroy(&pair); + } return(_status); } |