1) convert default processing functions to __attribute__((weak)) so they can be overridden with architecture-specific accelerated functions (e.g. NEON, MMX, AltiVec, etc.)
2) override gst_audio_quantize_quantize_signed_tpdf_none() to use NEON vector instructions 3) override gst_audio_convert_unpack_float_le() to use NEON vector instructions This speeds up audioconvert ~10x, at least for the 32b float -> 16b int conversion needed to play AC-3 audio (ie. DVD's) via ALSA. --- gst/audioconvert/Makefile.am | 1 + gst/audioconvert/armv7.c | 209 +++++++++++++++++++++++++++++++++++ gst/audioconvert/audioconvert.c | 20 ++-- gst/audioconvert/gstaudioquantize.c | 4 +- gst/audioconvert/gstchannelmix.c | 4 +- 5 files changed, 224 insertions(+), 14 deletions(-) create mode 100644 gst/audioconvert/armv7.c diff --git a/gst/audioconvert/Makefile.am b/gst/audioconvert/Makefile.am index 94978bb..2d273db 100644 --- a/gst/audioconvert/Makefile.am +++ b/gst/audioconvert/Makefile.am @@ -5,6 +5,7 @@ libgstaudioconvert_la_SOURCES = \ audioconvert.c \ gstchannelmix.c \ gstaudioquantize.c \ + armv7.c \ plugin.c libgstaudioconvert_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS) diff --git a/gst/audioconvert/armv7.c b/gst/audioconvert/armv7.c new file mode 100644 index 0000000..e39d29d --- /dev/null +++ b/gst/audioconvert/armv7.c @@ -0,0 +1,209 @@ +/* GStreamer + * + * Copyright (C) 2009 Texas Instruments, Inc - http://www.ti.com/ + * + * Description: NEON/VFP accelerated functions for armv7 architecture + * Created on: Aug 8, 2009 + * Author: Rob Clark <[hidden email]> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifdef __ARM_NEON__ +#include <arm_neon.h> +#include <string.h> + +#include "audioconvert.h" + + +void +gst_audio_quantize_quantize_signed_tpdf_none (AudioConvertCtx *ctx, + gint32 *src, gint32 *dst, gint count) +{ + static guint32 state[4] = { + 0xdeadbeef, + 0x305b8cc9, + 0x6c46ec93, + 0xad13b0cd + }; + + gint scale = ctx->out_scale; + count *= ctx->out.channels; + + if (scale > 0) { + guint32 mask = 0xffffffff & (0xffffffff << scale); + guint32 bias = (1U << (scale - 1)) >> 1; + gint32 dither = (1<<(scale - 1)); + + int32x4_t vrand; + uint32x4_t vstate; + uint32x4_t v12345; + int32x4_t vtmp; + uint32x4_t vmask; + + vstate = vld1q_u32 (state); + v12345 = vmovq_n_u32 (12345); + vmask = vmovq_n_u32 (mask); + + /* until we have less 4 words less to process, use vector instructions + * to do everything 4x at a time: + */ + for (;;count-=4) { + int64x2_t vtmp_lo; + int64x2_t vtmp_hi; + uint32x4_t vstate2; + int32x2_t vrand_lo; + int32x2_t vrand_hi; + + /* generate next eight random words: (see gst_fast_random_uint32()) + * + * state = state * 1103515245 + 12345 + */ + vstate2 = vmulq_n_u32 (vstate, 1103515245); + vstate2 = vaddq_u32 (vstate2, v12345); + vstate = vmulq_n_u32 (vstate2, 1103515245); + vstate = vaddq_u32 (vstate2, v12345); + + /* generate next four scaled random values: + * + * gint32 start = bias - dither; + * gint32 end = bias + dither - 1; + * gint64 tmp1 = gst_fast_random_uint32 (); + * gint64 tmp2 = gst_fast_random_uint32 (); + * rand = (gint32)(((tmp1+tmp2) * (end - start)) / (1LLU<<32) + start); + * + * need to split vstate and vstate2 into 2*2 int64x2_t and add.... 
+ */ + vstate2 = vaddq_u32 (vstate, vstate2); /* tmp1+tmp2 */ + vtmp_lo = vreinterpretq_s64_u64 ( /* * (end-start) */ + vmull_n_u32 (vget_low_u32 (vstate2), (2*dither) - 1)); + vtmp_hi = vreinterpretq_s64_u64 ( /* * (end-start) */ + vmull_n_u32 (vget_high_u32 (vstate2), (2*dither) - 1)); + + vtmp_lo = vshrq_n_s64 (vtmp_lo, 32); /* / (1LLU<<32) */ + vtmp_hi = vshrq_n_s64 (vtmp_hi, 32); /* / (1LLU<<32) */ + + + /* now want to put vtmp_hi and vtmp_lo back together.. + * then add 'start' (bias-dither).. which is negative.. + */ + vrand_lo = vmovn_s64 (vtmp_lo); + vrand_hi = vmovn_s64 (vtmp_hi); + vrand = vcombine_s32 (vrand_lo, vrand_hi); + vrand = vaddq_s32 (vrand, vmovq_n_s32 (bias-dither)); + + /* load next 4 words: + */ + vtmp = vld1q_s32 (src); + src += 4; + + /* perform saturating add of random noise... we don't want the + * value to wrap around: + * + * XXX I *think* vqaddq will handle saturation for underflow too.. + */ + vtmp = vqaddq_s32 (vtmp, vrand); + vtmp = vreinterpretq_s32_u32 ( + vandq_u32 (vreinterpretq_u32_s32 (vtmp), vmask)); + + /* we check for less than four remaining words at the end, before + * we store the result back.. the assumption is that it shouldn't + * cause a segfault to read past the end of 'src', and there is no + * harm in processing a few garbage words. 
But we definitely don't + * want to write past the end of 'dst' + */ + if (count<4) break; + + /* store 4 words to result: + */ + vst1q_s32 (dst, vtmp); + dst += 4; + } + + vst1q_u32 (state, vstate); + + /* at this point, we could have 0-3 result bytes in vtmp to write + * back out to 'dst': + */ + if (count) { + gint32 tmpdst[4]; + gint32 *tmpp = tmpdst; + + vst1q_s32 (tmpdst, vtmp); + + while (count--) { + *dst++ = *tmpp++; + } + } + + } else { + memmove (dst, src, count); + } +} + +void +gst_audio_convert_unpack_float_le (gfloat * src, gint32 * dst, gint s, gint count) +{ + float32x4_t vsrc; + float32x4_t v05; + int32x4_t vdst; + + v05 = vmovq_n_f32 (0.5); + + for (;;count-=4) { + + /* load next 4 words: + */ + vsrc = vld1q_f32 ((float32_t *)src); + src += 4; + + /* convert to int: + */ + vsrc = vmulq_n_f32 (vsrc, 2147483647.0); + vsrc = vaddq_f32 (vsrc, v05); + vdst = vcvtq_s32_f32 (vsrc); + + /* we check for less than four remaining words at the end, before + * we store the result back.. the assumption is that it shouldn't + * cause a segfault to read past the end of 'src', and there is no + * harm in processing a few garbage words. 
But we definitely don't + * want to write past the end of 'dst' + */ + if (count<4) break; + + /* store 4 words to result: + */ + vst1q_s32 (dst, vdst); + dst += 4; + } + + /* at this point, we could have 0-3 result bytes in vtmp to write + * back out to 'dst': + */ + if (count) { + gint32 tmpdst[4]; + gint32 *tmpp = tmpdst; + + vst1q_s32 (tmpdst, vdst); + + while (count--) { + *dst++ = *tmpp++; + } + } +} + + +#endif diff --git a/gst/audioconvert/audioconvert.c b/gst/audioconvert/audioconvert.c index 4780324..c18d217 100644 --- a/gst/audioconvert/audioconvert.c +++ b/gst/audioconvert/audioconvert.c @@ -38,11 +38,11 @@ * unpack code */ #define MAKE_UNPACK_FUNC_NAME(name) \ -audio_convert_unpack_##name +gst_audio_convert_unpack_##name /* unpack from integer to signed integer 32 */ #define MAKE_UNPACK_FUNC_II(name, stride, sign, READ_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst, \ gint scale, gint count) \ { \ @@ -54,7 +54,7 @@ MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst, \ /* unpack from float to signed integer 32 */ #define MAKE_UNPACK_FUNC_FI(name, type, READ_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count) \ { \ gdouble temp; \ @@ -68,7 +68,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count) \ /* unpack from float to float 64 (double) */ #define MAKE_UNPACK_FUNC_FF(name, type, FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s, \ gint count) \ { \ @@ -78,7 +78,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s, \ /* unpack from int to float 64 (double) */ #define MAKE_UNPACK_FUNC_IF(name, stride, sign, READ_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_UNPACK_FUNC_NAME (name) (guint8 * src, gdouble * dst, gint scale, \ gint count) \ { \ @@ -158,7 +158,7 @@ audio_convert_pack_##name /* pack 
from signed integer 32 to integer */ #define MAKE_PACK_FUNC_II(name, stride, sign, WRITE_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst, \ gint scale, gint count) \ { \ @@ -172,7 +172,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst, \ /* pack from signed integer 32 to float */ #define MAKE_PACK_FUNC_IF(name, type, FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale, \ gint count) \ { \ @@ -182,7 +182,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale, \ /* pack from float 64 (double) to float */ #define MAKE_PACK_FUNC_FF(name, type, FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s, \ gint count) \ { \ @@ -194,7 +194,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s, \ * the floats are already in the correct range. Only a cast is needed. */ #define MAKE_PACK_FUNC_FI_S(name, stride, WRITE_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale, \ gint count) \ { \ @@ -212,7 +212,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale, \ * and an addition of 2^(target_depth-1) to get in the correct unsigned * range. 
*/ #define MAKE_PACK_FUNC_FI_U(name, stride, WRITE_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale, \ gint count) \ { \ diff --git a/gst/audioconvert/gstaudioquantize.c b/gst/audioconvert/gstaudioquantize.c index 2155397..be959c4 100644 --- a/gst/audioconvert/gstaudioquantize.c +++ b/gst/audioconvert/gstaudioquantize.c @@ -46,7 +46,7 @@ gst_audio_quantize_quantize_##name #define MAKE_QUANTIZE_FUNC_I(name, DITHER_INIT_FUNC, ADD_DITHER_FUNC, \ ROUND_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src, \ gint32 *dst, gint count) \ { \ @@ -86,7 +86,7 @@ MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src, \ #define MAKE_QUANTIZE_FUNC_F(name, DITHER_INIT_FUNC, NS_INIT_FUNC, \ ADD_NS_FUNC, ADD_DITHER_FUNC, \ UPDATE_ERROR_FUNC) \ -static void \ +void __attribute__((weak)) \ MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gdouble *src, \ gdouble *dst, gint count) \ { \ diff --git a/gst/audioconvert/gstchannelmix.c b/gst/audioconvert/gstchannelmix.c index 0f9b945..aac8957 100644 --- a/gst/audioconvert/gstchannelmix.c +++ b/gst/audioconvert/gstchannelmix.c @@ -659,7 +659,7 @@ gst_channel_mix_passthrough (AudioConvertCtx * this) /* IMPORTANT: out_data == in_data is possible, make sure to not overwrite data * you might need later on! */ -void +void __attribute__((weak)) gst_channel_mix_mix_int (AudioConvertCtx * this, gint32 * in_data, gint32 * out_data, gint samples) { @@ -698,7 +698,7 @@ gst_channel_mix_mix_int (AudioConvertCtx * this, } } -void +void __attribute__((weak)) gst_channel_mix_mix_float (AudioConvertCtx * this, gdouble * in_data, gdouble * out_data, gint samples) { -- 1.6.3.2 ------------------------------------------------------------------------------ Let Crystal Reports handle the reporting - Free Crystal Reports 2008 30-Day trial. 
Simplify your report design, integration and deployment - and focus on what you do best, core application coding. Discover what's new with Crystal Reports now. http://p.sf.net/sfu/bobj-july _______________________________________________ gstreamer-devel mailing list [hidden email] https://lists.sourceforge.net/lists/listinfo/gstreamer-devel |
Am Montag, den 10.08.2009, 09:41 -0500 schrieb Rob Clark:
> 1) convert default processing functions to __attribute__((weak)) so they can be overrided with > architecture specific accelerated functions (ie. NEON, MMX, Altivec, etc) > 2) override gst_audio_quantize_quantize_signed_tpdf_none() to use NEON vector instructions > 3) override gst_audio_convert_unpack_float_le() to use NEON vector instructions > > This speeds up audioconvert ~10x, at least for the 32b float -> 16b int conversion needed to play > AC-3 audio (ie. DVD's) via ALSA Hi, first of all, could you file a bug for this and attach the patch there? :) and then some comments on the patch itself: - Don't use __attribute__((weak)), it's not portable. Instead use liboil to detect at runtime if the CPU supports a specific instruction set and then use the appropriate function pointer to the unpack/quantize function - Add a configure check to see if the compiler supports the specific instruction set and only compile that ARMv7 code then - The start of a buffer might not be 16-byte aligned, or whatever alignment VFP requires. It's only guaranteed to be aligned to the sample type, e.g. 2-byte aligned for 16-bit samples, etc. In general this patch is a good idea though; something like this really needs to go into audioconvert at critical places for other architectures too. FYI, David Schleef has partially converted audioconvert to use orc[0]. Together with the orc VFP backend this would obsolete your patch I guess. [0] http://cgit.freedesktop.org/~ds/gst-plugins-base/log/?h=orc ------------------------------------------------------------------------------ Let Crystal Reports handle the reporting - Free Crystal Reports 2008 30-Day trial. Simplify your report design, integration and deployment - and focus on what you do best, core application coding. Discover what's new with Crystal Reports now. 
http://p.sf.net/sfu/bobj-july _______________________________________________ gstreamer-devel mailing list [hidden email] https://lists.sourceforge.net/lists/listinfo/gstreamer-devel signature.asc (204 bytes) Download Attachment |
On Aug 10, 2009, at 9:59 AM, Sebastian Dröge wrote: > Am Montag, den 10.08.2009, 09:41 -0500 schrieb Rob Clark: >> 1) convert default processing functions to __attribute__((weak)) so >> they can be overrided with >> architecture specific accelerated functions (ie. NEON, MMX, >> Altivec, etc) >> 2) override gst_audio_quantize_quantize_signed_tpdf_none() to use >> NEON vector instructions >> 3) override gst_audio_convert_unpack_float_le() to use NEON vector >> instructions >> >> This speeds up audioconvert ~10x, at least for the 32b float -> 16b >> int conversion needed to play >> AC-3 audio (ie. DVD's) via ALSA > > Hi, > first of all, could you file a bug for this and attach the bug > there? :) [RC] Hi Sebastian, I just wanted to send the patch here, because it might be interesting to others working on ARM (armv7) based processors. A liboil / orc based solution is probably the better long-term solution, although I'm not sure of the current state of liboil / orc on armv7. That, and I wanted an excuse to teach myself about NEON ;-) So I don't know if you want to integrate this patch as-is, which is why I didn't create an issue in bugzilla yet. I guess my next side-project is to learn a bit more about liboil / orc. > and then some comments on the patch itself: > - Don't use __atribute__(weak), it's not portable. Instead use > liboil to > detect at runtime if the CPU supports a specific instruction set and > then use the appropiate function pointer to the unpack/quantize > function [RC] oh, darn.. it was such a clever trick too.. > - Add a configure check to see if the compiler supports the specific > instruction set and only compile that ARMv7 code then [RC] I did put the whole file within a '#ifdef __ARM_NEON__ / #endif'.. which should also work even if the compiler supports NEON but the user doesn't give '-mfpu=neon'. But I admit that my configure-foo is weak, so there is certainly a better way to do this. 
> - The start of a buffer might not be 16 byte aligned or what alignment > is required by VFP. It's only guaranteed to be aligned to the sample > type, i.e. 2 byte aligned for 16 bit samples, etc > [RC] AFAIK, VLDR/VSTR doesn't require 128bit alignment, although the cycle count is lower for aligned accesses. So I guess it could be made a bit faster by handling alignment a little better. As-is, it is a night and day difference and the gstaudioconvert related functions only show up a couple pages down in oprofile output. Now it is liba52 that needs some optimization ;-) > In general this patch is a good idea though, something like this > really > needs to go into audioconvert at critical places for other > architectures > too. > > FYI, David Schleef has partially converted audioconvert to use orc[0]. > Together with the orc VFP backend this would obsolete your patch I > guess. > > [0] http://cgit.freedesktop.org/~ds/gst-plugins-base/log/?h=orc > <signature.asc><ATT00001.txt><ATT00002.txt> [RC] ok, I'll check out his patch.. that is almost certainly the better long term approach. I just didn't know what was the current state of ORC for NEON/VFP.. BR, -R ------------------------------------------------------------------------------ Let Crystal Reports handle the reporting - Free Crystal Reports 2008 30-Day trial. Simplify your report design, integration and deployment - and focus on what you do best, core application coding. Discover what's new with Crystal Reports now. http://p.sf.net/sfu/bobj-july _______________________________________________ gstreamer-devel mailing list [hidden email] https://lists.sourceforge.net/lists/listinfo/gstreamer-devel |
In reply to this post by Sebastian Dröge-7
On Aug 10, 2009, at 9:59 AM, Sebastian Dröge wrote: > > FYI, David Schleef has partially converted audioconvert to use orc[0]. > Together with the orc VFP backend this would obsolete your patch I > guess. > > [0] http://cgit.freedesktop.org/~ds/gst-plugins-base/log/?h=orc ahh, looks like so far just the integer unpack functions. So not the two functions I needed (unpack_float_le and quantize_signed_tpdf_none) Any rough estimate on timeframe for orc-ified audioconvert? Is it worthwhile for me to try and clean up my current patch to integrate as a temporary solution? Or should I just try and learn orc and see if I could help there? Also.. I saw something on David's blog about a NEON backend for orc, but not opensrc. Anyone know if this is something that will eventually be opensrc? To be honest, in most use-cases most of the multimedia-related heavy lifting on our platform is done by hw or DSP. Most of the time we don't use audioconvert, audioresample, software volume, software colorspace conversion, etc, etc. It is only a few cases (like this) where we need some NEON acceleration. Is it possible to plug in hand-written optimized versions of a few functions without needing a whole NEON backend for orc? BR, -R ------------------------------------------------------------------------------ Let Crystal Reports handle the reporting - Free Crystal Reports 2008 30-Day trial. Simplify your report design, integration and deployment - and focus on what you do best, core application coding. Discover what's new with Crystal Reports now. http://p.sf.net/sfu/bobj-july _______________________________________________ gstreamer-devel mailing list [hidden email] https://lists.sourceforge.net/lists/listinfo/gstreamer-devel |
On Mon, Aug 10, 2009 at 11:38:25AM -0500, Rob Clark wrote:
> > [0] http://cgit.freedesktop.org/~ds/gst-plugins-base/log/?h=orc > > ahh, looks like so far just the integer unpack functions. So not the > two functions I needed (unpack_float_le and quantize_signed_tpdf_none) By the way, the reason these functions are slow is gcc's rather unenlightened use of NEON/VFP instructions for floating-point code, often related to usage of floor() or casting from float to int. Gcc outputs code that moves data from a NEON/VFP register to a GP register, which causes a 20+ cycle pipeline stall on Cortex-A8. I imagine it would be quite a bit of effort to teach gcc to not do this, even in the simple case where that integer is just written out to memory. > Any rough estimate on timeframe for orc-ified audioconvert? There are still a few blocker features missing from Orc, such as floating-point support in the main library, some new multiplication and special-purpose opcodes, and an ABI bump. The main goal is to be a complete replacement for liboil. And then we need to figure out a migration strategy. > Also.. I saw something on David's blog about a NEON backend for orc, > but not opensrc. Anyone know if this is something that will > eventually be opensrc? Yes. dave... ------------------------------------------------------------------------------ Let Crystal Reports handle the reporting - Free Crystal Reports 2008 30-Day trial. Simplify your report design, integration and deployment - and focus on what you do best, core application coding. Discover what's new with Crystal Reports now. http://p.sf.net/sfu/bobj-july _______________________________________________ gstreamer-devel mailing list [hidden email] https://lists.sourceforge.net/lists/listinfo/gstreamer-devel |
Free forum by Nabble | Edit this page |