From 749da940259e802f9606d372cfb9c6a22609caae Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Fri, 25 Nov 2022 16:39:10 -0800
Subject: [PATCH] clang: improve general performance with
 vectorization/unrolling

Clang has a tendency to *heavily* unroll loops all over the place:
    https://github.com/llvm/llvm-project/issues/42332

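Disable loop unrolling where Clang unrolls too aggressively, and explicitly
enable vectorization where Clang does not vectorize on its own. In addition:

  - tighten the MA_ASSUME() channel-count hints from "channels > 0" to the
    MA_MIN_CHANNELS..MA_MAX_CHANNELS range;
  - mark the pointer parameters of ma_copy_and_apply_volume_factor_f32()
    as MA_RESTRICT;
  - read the gainer's channel count once into a local instead of
    re-reading pGainer->config.channels in every loop.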

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h | 84 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 30 deletions(-)

diff --git a/miniaudio.h b/miniaudio.h
index 03329d4c..d220397d 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -42864,7 +42864,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
     }
 }
 
-MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
+MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
 {
     ma_uint64 iSample;
 
@@ -43159,10 +43159,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
     sampleCount = frameCount * channels;
 
     if (volume == 1) {
+#pragma clang loop vectorize(enable)
         for (iSample = 0; iSample < sampleCount; iSample += 1) {
             pDst[iSample] += pSrc[iSample];
         }
     } else {
+#pragma clang loop vectorize(enable)
         for (iSample = 0; iSample < sampleCount; iSample += 1) {
             pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
         }
@@ -45463,7 +45465,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
     const float a1 = pBQ->a1.f32;
     const float a2 = pBQ->a2.f32;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop unroll(disable)
     for (c = 0; c < channels; c += 1) {
         float r1 = pBQ->pR1[c].f32;
         float r2 = pBQ->pR2[c].f32;
@@ -45495,7 +45498,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
     const ma_int32 a1 = pBQ->a1.s32;
     const ma_int32 a2 = pBQ->a2.s32;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop unroll(disable)
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pBQ->pR1[c].s32;
         ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45769,22 +45773,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
     return MA_SUCCESS;
 }
 
-static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
+static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
 {
     ma_uint32 c;
     const ma_uint32 channels = pLPF->channels;
     const float a = pLPF->a.f32;
     const float b = 1 - a;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop unroll(disable)
     for (c = 0; c < channels; c += 1) {
         float r1 = pLPF->pR1[c].f32;
-        float x  = pX[c];
+        float x = pX[c];
         float y;
 
-        y = b*x + a*r1;
+        y = b * x + a * r1;
 
-        pY[c]           = y;
+        pY[c] = y;
         pLPF->pR1[c].f32 = y;
     }
 }
@@ -45796,7 +45801,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
     const ma_int32 a = pLPF->a.s32;
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop unroll(disable)
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pLPF->pR1[c].s32;
         ma_int32 x  = pX[c];
@@ -46649,7 +46655,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
     const float a = 1 - pHPF->a.f32;
     const float b = 1 - a;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float r1 = pHPF->pR1[c].f32;
         float x  = pX[c];
@@ -46669,7 +46675,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
     const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pHPF->pR1[c].s32;
         ma_int32 x  = pX[c];
@@ -48777,6 +48783,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     ma_uint64 interpolatedFrameCount;
+    const ma_uint32 channels = pGainer->config.channels;
 
     MA_ASSERT(pGainer != NULL);
 
@@ -48816,12 +48823,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
             float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
             float d = 1.0f / pGainer->config.smoothTimeInFrames;
 
-            if (pGainer->config.channels <= 32) {
+            if (channels <= 32) {
                 float pRunningGain[32];
                 float pRunningGainDelta[32];    /* Could this be heap-allocated as part of the ma_gainer object? */
 
                 /* Initialize the running gain. */
-                for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
                     pRunningGainDelta[iChannel] = t * d;
                     pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48830,7 +48837,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                 iFrame = 0;
 
                 /* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
-                if (pGainer->config.channels == 2) {
+                if (channels == 2) {
                 #if defined(MA_SUPPORT_SSE2)
                     if (ma_has_sse2()) {
                         ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48878,6 +48885,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
 
                         iFrame = unrolledLoopCount << 1;
                     #else
+                        #pragma clang loop vectorize(enable)
                         for (; iFrame < interpolatedFrameCount; iFrame += 1) {
                             for (iChannel = 0; iChannel < 2; iChannel += 1) {
                                 pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48889,7 +48897,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                         }
                     #endif
                     }
-                } else if (pGainer->config.channels == 6) {
+                } else if (channels == 6) {
                 #if defined(MA_SUPPORT_SSE2)
                     if (ma_has_sse2()) {
                         /*
@@ -48922,6 +48930,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                     } else
                 #endif
                     {
+                        #pragma clang loop vectorize(enable)
                         for (; iFrame < interpolatedFrameCount; iFrame += 1) {
                             for (iChannel = 0; iChannel < 6; iChannel += 1) {
                                 pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48933,7 +48942,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                             }
                         }
                     }
-                } else if (pGainer->config.channels == 8) {
+                } else if (channels == 8) {
                     /* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
                 #if defined(MA_SUPPORT_SSE2)
                     if (ma_has_sse2()) {
@@ -48953,6 +48962,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                 #endif
                     {
                         /* This is crafted so that it auto-vectorizes when compiled with Clang. */
+                        #pragma clang loop vectorize(enable)
                         for (; iFrame < interpolatedFrameCount; iFrame += 1) {
                             for (iChannel = 0; iChannel < 8; iChannel += 1) {
                                 pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48966,17 +48976,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                     }
                 }
 
+#pragma clang loop unroll(disable)
                 for (; iFrame < interpolatedFrameCount; iFrame += 1) {
-                    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                        pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
+#pragma clang loop vectorize(enable)
+                    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                        pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
                         pRunningGain[iChannel] += pRunningGainDelta[iChannel];
                     }
                 }
             } else {
                 /* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
+#pragma clang loop unroll(disable)
                 for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
-                    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                        pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
+#pragma clang loop vectorize(enable)
+                    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                        pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
                     }
 
                     a += d;
@@ -48995,18 +49009,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
 
     /* All we need to do here is apply the new gains using an optimized path. */
     if (pFramesOut != NULL && pFramesIn != NULL) {
-        if (pGainer->config.channels <= 32) {
+        if (channels <= 32) {
             float gains[32];
-            for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
+#pragma clang loop unroll(disable)
+            for (iChannel = 0; iChannel < channels; iChannel += 1) {
                 gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
             }
 
-            ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
+            ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
         } else {
             /* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
+#pragma clang loop unroll(disable)
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                    ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
+#pragma clang loop vectorize(enable)
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
                 }
             }
         }
@@ -51376,7 +51393,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
 
     a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
         pFrameOut[c] = s;
@@ -51395,7 +51412,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
 
     a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
         pFrameOut[c] = s;
@@ -52630,6 +52647,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
     ma_uint64 iFrame;
     ma_uint32 iChannelOut;
 
+#pragma clang loop unroll(disable)
     for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
         for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
             ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52650,6 +52668,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
     ma_uint64 iFrame;
     ma_uint32 iChannelOut;
 
+#pragma clang loop unroll(disable)
     for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
         for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
             ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52692,6 +52711,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
     ma_uint64 iFrame;
     ma_uint32 iChannelOut;
 
+#pragma clang loop unroll(disable)
     for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
         for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
             ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52712,6 +52732,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
     ma_uint64 iFrame;
     ma_uint32 iChannelOut;
 
+#pragma clang loop unroll(disable)
     for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
         for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
             ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52946,6 +52967,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
                             } else
                         #endif
                             {
+                                #pragma clang loop vectorize(enable)
                                 for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                                     for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
                                         pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52973,6 +52995,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
                             } else
                         #endif
                             {
+                                #pragma clang loop vectorize(enable)
                                 for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                                     for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
                                         pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52990,6 +53013,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
                             } else
                         #endif
                             {
+                                #pragma clang loop vectorize(enable)
                                 for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                                     for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
                                         pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66313,7 +66337,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
@@ -66432,7 +66456,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
@@ -66514,7 +66538,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;