From 99f5d475409cdcc68c5efc32d80fc1706e5e10e3 Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Fri, 25 Nov 2022 16:39:10 -0800
Subject: [PATCH] clang: improve general performance with
 vectorization/unrolling

Clang has a tendency to *heavily* unroll loops all over the place:
    https://github.com/llvm/llvm-project/issues/42332

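Disable loop unrolling where Clang unrolls too aggressively, and explicitly
request vectorization where it does not happen automatically. Also add
MA_RESTRICT qualifiers and tighter MA_ASSUME channel-count bounds so the
optimizer has more information to work with.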

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h | 83 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 32 deletions(-)

diff --git a/miniaudio.h b/miniaudio.h
index ce4488e8..12801983 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -42843,7 +42843,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
     }
 }
 
-MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
+MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
 {
     ma_uint64 iSample;
 
@@ -43138,10 +43138,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
     sampleCount = frameCount * channels;
 
     if (volume == 1) {
+#pragma clang loop vectorize(enable)
         for (iSample = 0; iSample < sampleCount; iSample += 1) {
             pDst[iSample] += pSrc[iSample];
         }
     } else {
+#pragma clang loop vectorize(enable)
         for (iSample = 0; iSample < sampleCount; iSample += 1) {
             pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
         }
@@ -45442,7 +45444,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
     const float a1 = pBQ->a1.f32;
     const float a2 = pBQ->a2.f32;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop vectorize(assume_safety)
     for (c = 0; c < channels; c += 1) {
         float r1 = pBQ->pR1[c].f32;
         float r2 = pBQ->pR2[c].f32;
@@ -45474,7 +45477,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
     const ma_int32 a1 = pBQ->a1.s32;
     const ma_int32 a2 = pBQ->a2.s32;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop vectorize(assume_safety)
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pBQ->pR1[c].s32;
         ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45748,22 +45752,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
     return MA_SUCCESS;
 }
 
-static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
+static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
 {
     ma_uint32 c;
     const ma_uint32 channels = pLPF->channels;
     const float a = pLPF->a.f32;
     const float b = 1 - a;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop vectorize(assume_safety)
     for (c = 0; c < channels; c += 1) {
         float r1 = pLPF->pR1[c].f32;
-        float x  = pX[c];
+        float x = pX[c];
         float y;
 
-        y = b*x + a*r1;
+        y = b * x + a * r1;
 
-        pY[c]           = y;
+        pY[c] = y;
         pLPF->pR1[c].f32 = y;
     }
 }
@@ -45775,7 +45780,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
     const ma_int32 a = pLPF->a.s32;
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
+#pragma clang loop vectorize(assume_safety)
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pLPF->pR1[c].s32;
         ma_int32 x  = pX[c];
@@ -46628,7 +46634,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
     const float a = 1 - pHPF->a.f32;
     const float b = 1 - a;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float r1 = pHPF->pR1[c].f32;
         float x  = pX[c];
@@ -46648,7 +46654,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
     const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pHPF->pR1[c].s32;
         ma_int32 x  = pX[c];
@@ -48756,6 +48762,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     ma_uint64 interpolatedFrameCount;
+    const ma_uint32 channels = pGainer->config.channels;
 
     MA_ASSERT(pGainer != NULL);
 
@@ -48795,12 +48802,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
             float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
             float d = 1.0f / pGainer->config.smoothTimeInFrames;
 
-            if (pGainer->config.channels <= 32) {
+            if (channels <= 32) {
                 float pRunningGain[32];
                 float pRunningGainDelta[32];    /* Could this be heap-allocated as part of the ma_gainer object? */
 
                 /* Initialize the running gain. */
-                for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
                     pRunningGainDelta[iChannel] = t * d;
                     pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48809,7 +48816,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                 iFrame = 0;
 
                 /* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
-                if (pGainer->config.channels == 2) {
+                if (channels == 2) {
                 #if defined(MA_SUPPORT_SSE2)
                     if (ma_has_sse2()) {
                         ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48857,6 +48864,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
 
                         iFrame = unrolledLoopCount << 1;
                     #else
+                        #pragma clang loop vectorize(enable)
                         for (; iFrame < interpolatedFrameCount; iFrame += 1) {
                             for (iChannel = 0; iChannel < 2; iChannel += 1) {
                                 pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48868,7 +48876,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                         }
                     #endif
                     }
-                } else if (pGainer->config.channels == 6) {
+                } else if (channels == 6) {
                 #if defined(MA_SUPPORT_SSE2)
                     if (ma_has_sse2()) {
                         /*
@@ -48912,7 +48920,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                             }
                         }
                     }
-                } else if (pGainer->config.channels == 8) {
+                } else if (channels == 8) {
                     /* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
                 #if defined(MA_SUPPORT_SSE2)
                     if (ma_has_sse2()) {
@@ -48933,11 +48941,13 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                     {
                         /* This is crafted so that it auto-vectorizes when compiled with Clang. */
                         for (; iFrame < interpolatedFrameCount; iFrame += 1) {
+                            #pragma clang loop vectorize(enable)
                             for (iChannel = 0; iChannel < 8; iChannel += 1) {
                                 pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
                             }
 
                             /* Move the running gain forward towards the new gain. */
+                            #pragma clang loop vectorize(enable)
                             for (iChannel = 0; iChannel < 8; iChannel += 1) {
                                 pRunningGain[iChannel] += pRunningGainDelta[iChannel];
                             }
@@ -48945,17 +48955,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
                     }
                 }
 
+                #pragma clang loop unroll(disable)
                 for (; iFrame < interpolatedFrameCount; iFrame += 1) {
-                    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                        pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
+                    #pragma clang loop vectorize(enable)
+                    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                        pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
                         pRunningGain[iChannel] += pRunningGainDelta[iChannel];
                     }
                 }
             } else {
                 /* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
+                #pragma clang loop unroll(disable)
                 for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
-                    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                        pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
+                    #pragma clang loop vectorize(enable)
+                    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                        pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
                     }
 
                     a += d;
@@ -48974,18 +48988,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
 
     /* All we need to do here is apply the new gains using an optimized path. */
     if (pFramesOut != NULL && pFramesIn != NULL) {
-        if (pGainer->config.channels <= 32) {
+        if (channels <= 32) {
             float gains[32];
-            for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
+            for (iChannel = 0; iChannel < channels; iChannel += 1) {
                 gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
             }
 
-            ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
+            ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
         } else {
             /* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
+#pragma clang loop unroll(disable)
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                    ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
+#pragma clang loop vectorize(enable)
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
                 }
             }
         }
@@ -51355,7 +51371,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
 
     a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
         pFrameOut[c] = s;
@@ -51374,7 +51390,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
 
     a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
 
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
         pFrameOut[c] = s;
@@ -51541,7 +51557,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
 }
 
 
-static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
+static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
 {
     const float* pFramesInF32;
     /* */ float* pFramesOutF32;
@@ -51615,7 +51631,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
     return MA_SUCCESS;
 }
 
-static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
+static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
 {
     const float* pFramesInF32;
     /* */ float* pFramesOutF32;
@@ -52926,6 +52942,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
                         #endif
                             {
                                 for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
+                                    #pragma clang loop vectorize(enable)
                                     for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
                                         pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
                                     }
@@ -52953,6 +52970,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
                         #endif
                             {
                                 for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
+                                    #pragma clang loop vectorize(enable)
                                     for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
                                         pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
                                     }
@@ -52970,6 +52988,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
                         #endif
                             {
                                 for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
+                                    #pragma clang loop vectorize(enable)
                                     for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
                                         pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
                                     }
@@ -66059,7 +66078,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
@@ -66178,7 +66197,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
@@ -66260,7 +66279,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;