From 99f5d475409cdcc68c5efc32d80fc1706e5e10e3 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Fri, 25 Nov 2022 16:39:10 -0800 Subject: [PATCH] clang: improve general performance with vectorization/unrolling Clang has a tendency to *heavily* unroll loops all over the place: https://github.com/llvm/llvm-project/issues/42332 Disable loop unrolling wherever it goes too nuts, enable vectorization where it doesn't do so automatically, etc. Signed-off-by: Steven Noonan --- miniaudio.h | 83 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/miniaudio.h b/miniaudio.h index ce4488e8..12801983 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -42843,7 +42843,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_ } } -MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor) +MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor) { ma_uint64 iSample; @@ -43138,10 +43138,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64 sampleCount = frameCount * channels; if (volume == 1) { +#pragma clang loop vectorize(enable) for (iSample = 0; iSample < sampleCount; iSample += 1) { pDst[iSample] += pSrc[iSample]; } } else { +#pragma clang loop vectorize(enable) for (iSample = 0; iSample < sampleCount; iSample += 1) { pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume); } @@ -45442,7 +45444,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed( const float a1 = pBQ->a1.f32; const float a2 = pBQ->a2.f32; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); +#pragma clang loop vectorize(assume_safety) for (c = 0; c < channels; c += 1) { float r1 = pBQ->pR1[c].f32; float r2 = pBQ->pR2[c].f32; @@ -45474,7 +45477,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed( const ma_int32 a1 = pBQ->a1.s32; const ma_int32 a2 = pBQ->a2.s32; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); +#pragma clang loop vectorize(assume_safety) for (c = 0; c < channels; c += 1) { ma_int32 r1 = pBQ->pR1[c].s32; ma_int32 r2 = pBQ->pR2[c].s32; @@ -45748,22 +45752,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF) return MA_SUCCESS; } -static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX) +static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX) { ma_uint32 c; const ma_uint32 channels = pLPF->channels; const float a = pLPF->a.f32; const float b = 1 - a; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); +#pragma clang loop vectorize(assume_safety) for (c = 0; c < channels; c += 1) { float r1 = pLPF->pR1[c].f32; - float x = pX[c]; + float x = pX[c]; float y; - y = b*x + a*r1; + y = b * x + a * r1; - pY[c] = y; + pY[c] = y; pLPF->pR1[c].f32 = y; } } @@ -45775,7 +45780,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY, const ma_int32 a = pLPF->a.s32; const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a); - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); +#pragma clang loop vectorize(assume_safety) for (c = 0; c < channels; c += 1) { ma_int32 r1 = pLPF->pR1[c].s32; ma_int32 x = pX[c]; @@ -46628,7 +46634,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co const float a = 1 - pHPF->a.f32; const float b = 1 - a; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); for (c = 0; c < channels; c += 1) { float r1 = pHPF->pR1[c].f32; float x = pX[c]; @@ -46648,7 +46654,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY, const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32); const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a); - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); for (c = 0; c < channels; c += 1) { ma_int32 r1 = pHPF->pR1[c].s32; ma_int32 x = pX[c]; @@ -48756,6 +48762,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte ma_uint64 iFrame; ma_uint32 iChannel; ma_uint64 interpolatedFrameCount; + const ma_uint32 channels = pGainer->config.channels; MA_ASSERT(pGainer != NULL); @@ -48795,12 +48802,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames; float d = 1.0f / pGainer->config.smoothTimeInFrames; - if (pGainer->config.channels <= 32) { + if (channels <= 32) { float pRunningGain[32]; float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */ /* Initialize the running gain. */ - for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) { + for (iChannel = 0; iChannel < channels; iChannel += 1) { float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume; pRunningGainDelta[iChannel] = t * d; pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a); @@ -48809,7 +48816,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte iFrame = 0; /* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */ - if (pGainer->config.channels == 2) { + if (channels == 2) { #if defined(MA_SUPPORT_SSE2) if (ma_has_sse2()) { ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1; @@ -48857,6 +48864,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte iFrame = unrolledLoopCount << 1; #else + #pragma clang loop vectorize(enable) for (; iFrame < interpolatedFrameCount; iFrame += 1) { for (iChannel = 0; iChannel < 2; iChannel += 1) { pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel]; @@ -48868,7 +48876,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte } #endif } - } else if (pGainer->config.channels == 6) { + } else if (channels == 6) { #if defined(MA_SUPPORT_SSE2) if (ma_has_sse2()) { /* @@ -48912,7 +48920,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte } } } - } else if (pGainer->config.channels == 8) { + } else if (channels == 8) { /* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */ #if defined(MA_SUPPORT_SSE2) if (ma_has_sse2()) { @@ -48933,11 +48941,13 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte { /* This is crafted so that it auto-vectorizes when compiled with Clang. */ for (; iFrame < interpolatedFrameCount; iFrame += 1) { + #pragma clang loop vectorize(enable) for (iChannel = 0; iChannel < 8; iChannel += 1) { pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel]; } /* Move the running gain forward towards the new gain. */ + #pragma clang loop vectorize(enable) for (iChannel = 0; iChannel < 8; iChannel += 1) { pRunningGain[iChannel] += pRunningGainDelta[iChannel]; } @@ -48945,17 +48955,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte } } + #pragma clang loop unroll(disable) for (; iFrame < interpolatedFrameCount; iFrame += 1) { - for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) { - pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel]; + #pragma clang loop vectorize(enable) + for (iChannel = 0; iChannel < channels; iChannel += 1) { + pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel]; pRunningGain[iChannel] += pRunningGainDelta[iChannel]; } } } else { /* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */ + #pragma clang loop unroll(disable) for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) { - for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) { - pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume; + #pragma clang loop vectorize(enable) + for (iChannel = 0; iChannel < channels; iChannel += 1) { + pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume; } a += d; @@ -48974,18 +48988,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte /* All we need to do here is apply the new gains using an optimized path. */ if (pFramesOut != NULL && pFramesIn != NULL) { - if (pGainer->config.channels <= 32) { + if (channels <= 32) { float gains[32]; - for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) { + for (iChannel = 0; iChannel < channels; iChannel += 1) { gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume; } - ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains); + ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains); } else { /* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */ +#pragma clang loop unroll(disable) for (iFrame = 0; iFrame < frameCount; iFrame += 1) { - for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) { - ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume; +#pragma clang loop vectorize(enable) + for (iChannel = 0; iChannel < channels; iChannel += 1) { + ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume; } } } @@ -51355,7 +51371,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); for (c = 0; c < channels; c += 1) { ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift); pFrameOut[c] = s; @@ -51374,7 +51390,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); for (c = 0; c < channels; c += 1) { float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a); pFrameOut[c] = s; @@ -51541,7 +51557,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler* } -static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut) +static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut) { const float* pFramesInF32; /* */ float* pFramesOutF32; @@ -51615,7 +51631,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear return MA_SUCCESS; } -static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut) +static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut) { const float* pFramesInF32; /* */ float* pFramesOutF32; @@ -52926,6 +52942,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut, #endif { for (iFrame = 0; iFrame < frameCount; iFrame += 1) { + #pragma clang loop vectorize(enable) for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) { pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame]; } @@ -52953,6 +52970,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut, #endif { for (iFrame = 0; iFrame < frameCount; iFrame += 1) { + #pragma clang loop vectorize(enable) for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) { pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame]; } @@ -52970,6 +52988,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut, #endif { for (iFrame = 0; iFrame < frameCount; iFrame += 1) { + #pragma clang loop vectorize(enable) for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) { pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame]; } @@ -66059,7 +66078,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi ma_uint64 iFrame; ma_uint32 iChannel; const ma_uint32 channels = pNoise->config.channels; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); if (pNoise->config.format == ma_format_f32) { float* pFramesOutF32 = (float*)pFramesOut; @@ -66178,7 +66197,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void ma_uint64 iFrame; ma_uint32 iChannel; const ma_uint32 channels = pNoise->config.channels; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); if (pNoise->config.format == ma_format_f32) { float* pFramesOutF32 = (float*)pFramesOut; @@ -66260,7 +66279,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise, ma_uint64 iFrame; ma_uint32 iChannel; const ma_uint32 channels = pNoise->config.channels; - MA_ASSUME(channels > 0); + MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS); if (pNoise->config.format == ma_format_f32) { float* pFramesOutF32 = (float*)pFramesOut;